From a1bfce484097fb3e1638b3c507793f3b8c6270c1 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Fri, 13 Aug 2021 13:24:34 -0700 Subject: [PATCH 01/14] Skip parsing of missing data in metadata Fixes an issue with metadata parsing where pandas treats missing data (e.g., empty strings) as `NaN` values, missing date values (which appear in some GenBank SARS-CoV-2 records) get interpretted as `float` values, and downstream functions that expect strings (like `is_date_ambiguous`) break. This commit resolves the issue by restoring Augur's original behavior which was to fill all missing values in a data frame with empty strings [1]. However, we use a slightly different approach here of asking pandas not to parse missing data at all through the `na_filter=False` argument. This argument has the same effect as the previous implementation, but it only needs to be written once to apply for all calls of `read_metadata`, whereas the `fillna` approach would need to be applied to data frames or data frame chunks in different contexts. To recreate the original issue, this commit updates the functional tests for augur filter to include a metadata record with a missing date column and updates the other parts of the functional tests accordingly. [1] https://github.com/nextstrain/augur/blob/15dfc8b/augur/util_support/metadata_file.py#L97 --- augur/io.py | 1 + tests/functional/filter.t | 8 ++++---- tests/functional/filter/metadata.tsv | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/augur/io.py b/augur/io.py index 51567b4ac..683e57e05 100644 --- a/augur/io.py +++ b/augur/io.py @@ -84,6 +84,7 @@ def read_metadata(metadata_file, id_columns=("strain", "name"), chunk_size=None) "sep": None, "engine": "python", "skipinitialspace": True, + "na_filter": False, } if chunk_size: diff --git a/tests/functional/filter.t b/tests/functional/filter.t index c9cbe5285..db7b9ef44 100644 --- a/tests/functional/filter.t +++ b/tests/functional/filter.t @@ -113,10 +113,10 @@ Filter using only metadata without sequence input or output and save results as > --min-length 10500 \ > --output-metadata "$TMP/filtered_metadata.tsv" > /dev/null -Output should include the 8 sequences matching the filters and a header line. +Output should include the 7 sequences matching the filters and a header line. $ wc -l "$TMP/filtered_metadata.tsv" - \s*9 .* (re) + \s*8 .* (re) $ rm -f "$TMP/filtered_metadata.tsv" Filter using only metadata and save results as a list of filtered strains. @@ -128,10 +128,10 @@ Filter using only metadata and save results as a list of filtered strains. > --min-length 10500 \ > --output-strains "$TMP/filtered_strains.txt" > /dev/null -Output should include only the 8 sequences matching the filters (without a header line). +Output should include only the 7 sequences matching the filters (without a header line). $ wc -l "$TMP/filtered_strains.txt" - \s*8 .* (re) + \s*7 .* (re) $ rm -f "$TMP/filtered_strains.txt" Filter using only metadata without a sequence index. diff --git a/tests/functional/filter/metadata.tsv b/tests/functional/filter/metadata.tsv index dc66a193c..40e2cca39 100644 --- a/tests/functional/filter/metadata.tsv +++ b/tests/functional/filter/metadata.tsv @@ -1,5 +1,5 @@ strain virus accession date region country division city db segment authors url title journal paper_url -COL/FLR_00024/2015 zika MF574569 2015-12-XX South America Colombia Colombia Colombia genbank genome Pickett et al https://www.ncbi.nlm.nih.gov/nuccore/MF574569 Direct Submission Submitted (28-JUL-2017) J. Craig Venter Institute, 9704 Medical Center Drive, Rockville, MD 20850, USA https://www.ncbi.nlm.nih.gov/pubmed/ +COL/FLR_00024/2015 zika MF574569 South America Colombia Colombia Colombia genbank genome Pickett et al https://www.ncbi.nlm.nih.gov/nuccore/MF574569 Direct Submission Submitted (28-JUL-2017) J. Craig Venter Institute, 9704 Medical Center Drive, Rockville, MD 20850, USA https://www.ncbi.nlm.nih.gov/pubmed/ PRVABC59 zika KU501215 2015-12-XX North America Puerto Rico Puerto Rico Puerto Rico genbank genome Lanciotti et al https://www.ncbi.nlm.nih.gov/nuccore/KU501215 Phylogeny of Zika Virus in Western Hemisphere, 2015 Emerging Infect. Dis. 22 (5), 933-935 (2016) https://www.ncbi.nlm.nih.gov/pubmed/27088323 COL/FLR_00008/2015 zika MF574562 2015-12-XX South America Colombia Colombia Colombia genbank genome Pickett et al https://www.ncbi.nlm.nih.gov/nuccore/MF574562 Direct Submission Submitted (28-JUL-2017) J. Craig Venter Institute, 9704 Medical Center Drive, Rockville, MD 20850, USA https://www.ncbi.nlm.nih.gov/pubmed/ Colombia/2016/ZC204Se zika KY317939 2016-01-06 South America Colombia Colombia Colombia genbank genome Quick et al https://www.ncbi.nlm.nih.gov/nuccore/KY317939 Multiplex PCR method for MinION and Illumina sequencing of Zika and other virus genomes directly from clinical samples Nat Protoc 12 (6), 1261-1276 (2017) https://www.ncbi.nlm.nih.gov/pubmed/28538739 From beb93cd82e97582bb80e02631fb06730d20c1214 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Fri, 13 Aug 2021 13:31:00 -0700 Subject: [PATCH 02/14] Skip strains in group-by year when date is missing Fixes a previously hidden bug where grouping by `year` in augur filter would include strains with missing dates as a separate additional group with a missing year value. The original code used a `continue` statement that was intended to continue to the next strain, but because this statement was inside another for loop, it only continued to the new group. This commit fixes the issue by assigning an explicit boolean variable that tracks whether a strain should be skipped or not. We assign this variable to `True` when we can't parse a year from the strain's date string, print a clearer warning message to stderr, and break from the loop (instead of continuing). This commit includes updates to the functional tests (that originally caught this previously hidden bug!) to reflect this new output to stderr and the highest priority strains that should be included from each group. --- augur/filter.py | 9 ++++++--- tests/functional/filter.t | 1 + tests/functional/filter/priorities.tsv | 4 ++-- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/augur/filter.py b/augur/filter.py index c2270b20f..631aea792 100644 --- a/augur/filter.py +++ b/augur/filter.py @@ -863,6 +863,7 @@ def get_groups_for_subsampling(strains, metadata, group_by=None): group_by_strain = {} for strain in strains: + skip_strain = False group = [] m = metadata.loc[strain].to_dict() # collect group specifiers @@ -875,8 +876,9 @@ def get_groups_for_subsampling(strains, metadata, group_by=None): try: year = int(m["date"].split('-')[0]) except: - print("WARNING: no valid year, skipping",strain, m["date"]) - continue + print(f"WARNING: no valid year, skipping strain '{strain}' with date value of '{m['date']}'.", file=sys.stderr) + skip_strain = True + break if c=='month': try: month = int(m["date"].split('-')[1]) @@ -888,7 +890,8 @@ def get_groups_for_subsampling(strains, metadata, group_by=None): else: group.append('unknown') - group_by_strain[strain] = tuple(group) + if not skip_strain: + group_by_strain[strain] = tuple(group) # If we could not find any requested categories, we cannot complete subsampling. distinct_groups = set(group_by_strain.values()) diff --git a/tests/functional/filter.t b/tests/functional/filter.t index db7b9ef44..bf5a64996 100644 --- a/tests/functional/filter.t +++ b/tests/functional/filter.t @@ -333,6 +333,7 @@ The two highest priority strains are in these two years. > --priority filter/priorities.tsv \ > --sequences-per-group 1 \ > --output-strains "$TMP/filtered_strains.txt" > /dev/null + WARNING: no valid year, skipping strain 'COL/FLR_00024/2015' with date value of ''. $ diff -u <(sort -k 2,2rn -k 1,1 filter/priorities.tsv | head -n 2 | cut -f 1) <(sort -k 1,1 "$TMP/filtered_strains.txt") $ rm -f "$TMP/filtered_strains.txt" diff --git a/tests/functional/filter/priorities.tsv b/tests/functional/filter/priorities.tsv index 8f960697f..86734afb4 100644 --- a/tests/functional/filter/priorities.tsv +++ b/tests/functional/filter/priorities.tsv @@ -1,5 +1,5 @@ -COL/FLR_00024/2015 100 -PRVABC59 50 +COL/FLR_00024/2015 50 +PRVABC59 100 COL/FLR_00008/2015 10 Colombia/2016/ZC204Se 10 ZKC2/2016 100 From 5db04fc846b890f9b90d41e3e14eb90b197ef4d9 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Fri, 13 Aug 2021 15:00:02 -0700 Subject: [PATCH 03/14] Fix probabilistic subsampling for small values Fixes a regression from the rewrite of augur filter where probabilistic subsampling for small values of `--subsample-max-sequences` could randomly select zero strains and randomly cause our CI tests to fail. Prior to the rewrite of augur filter and introduction of priority queues, we fixed this issue by repeatedly attempting to calculate sequences per group that summed to an integer value greater than zero. However, the way I implemented random queue sizes inside the `PriorityQueue` class in the rewrite prevented me from using a similar "multiple attempts" approach. This commit redesigns the way we create priority queues. In the case where we already know the number of sequences per group in the first pass, we create an appropriately-sized priority queue for each group as we encounter it. There is no possibility that the sum of these queue sizes could be zero. In the case where we need to calculate the number of sequences per group from the first pass, we already know all possible groups and can create their priority queues in bulk. The new `create_queues_by_group` function allows us to create fixed-sized or randomly-sized queues and also make multiple attempts when queue sizes sum to zero. As a result, the `PriorityQueue` class is much simpler (it requires no logic about random max sizes) and we can actually test the fixed and random behaviors more carefully with doctests for `create_queues_by_group`. --- augur/filter.py | 96 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 63 insertions(+), 33 deletions(-) diff --git a/augur/filter.py b/augur/filter.py index 631aea792..a39dbcb68 100644 --- a/augur/filter.py +++ b/augur/filter.py @@ -24,7 +24,6 @@ from .utils import is_vcf as filename_is_vcf, read_vcf, read_strains, get_numerical_dates, run_shell_command, shquote, is_date_ambiguous comment_char = '#' -MAX_NUMBER_OF_PROBABILISTIC_SAMPLING_ATTEMPTS = 10 SEQUENCE_ONLY_FILTERS = ( "min_length", @@ -936,11 +935,6 @@ class PriorityQueue: """A priority queue implementation that automatically replaces lower priority items in the heap with incoming higher priority items. - This implementation also allows the maximum size to be a fractional value - less than 1 in which case the heap size is sampled randomly from a Poisson - distribution with the given maximum size as the mean. This randomly sized - heap enables probabilistic subsampling. - Add a single record to a heap with a maximum of 2 records. >>> queue = PriorityQueue(max_size=2) @@ -974,28 +968,12 @@ class PriorityQueue: >>> list(queue.get_items()) [{'strain': 'strain4'}, {'strain': 'strain3'}] - Assign a fractional maximum size such that the corresponding queue limit is - sampled randomly from a Poisson distribution. For small values, we should - get a max size that is no more than 10 (this is an arbitrarily high number - above what we see for Poisson samples drawn with a mean of 0.1). - - >>> queue = PriorityQueue(max_size=0.1) - >>> queue.max_size in set(range(10)) - True - """ def __init__(self, max_size): - """Create a fixed size heap (priority queue) that allows the maximum size to be - calculated probabilistically from a Poisson process. + """Create a fixed size heap (priority queue) """ - # Fractional heap sizes indicate probabilistic sampling. - if max_size < 1.0: - random_generator = np.random.default_rng() - self.max_size = random_generator.poisson(max_size) - else: - self.max_size = max_size - + self.max_size = max_size self.heap = [] self.counter = itertools.count() @@ -1030,10 +1008,53 @@ def get_items(self): yield item -def priority_queue_factory(max_size): - """Return a callable for a priority queue with the given arguments. +def create_queues_by_group(groups, max_size, max_attempts=100): + """Create a dictionary of priority queues per group for the given maximum size. + + When the maximum size is fractional, probabilistically sample the maximum + size from a Poisson distribution. Make at least the given number of maximum + attempts to create queues for which the sum of their maximum sizes is + greater than zero. + + Create queues for two groups with a fixed maximum size. + + >>> groups = ("2015", "2016") + >>> queues = create_queues_by_group(groups, 2) + >>> sum(queue.max_size for queue in queues.values()) + 4 + + Create queues for two groups with a fractional maximum size. Their total max + size should still be an integer value greater than zero. + + >>> queues = create_queues_by_group(groups, 0.1) + >>> int(sum(queue.max_size for queue in queues.values())) > 0 + True + """ - return lambda: PriorityQueue(max_size=max_size) + queues_by_group = {} + total_max_size = 0 + attempts = 0 + + if max_size < 1.0: + random_generator = np.random.default_rng() + + # For small fractional maximum sizes, it is possible to randomly select + # maximum queue sizes that all equal zero. When this happens, filtering + # fails unexpectedly. We make multiple attempts to create queues with + # maximum sizes greater than zero for at least one queue. + while total_max_size == 0 and attempts < max_attempts: + for group in groups: + if max_size < 1.0: + queue_max_size = random_generator.poisson(max_size) + else: + queue_max_size = max_size + + queues_by_group[group] = PriorityQueue(queue_max_size) + + total_max_size = sum(queue.max_size for queue in queues_by_group.values()) + attempts += 1 + + return queues_by_group def register_arguments(parser): @@ -1347,11 +1368,17 @@ def run(args): # Track the highest priority records, when we already # know the number of sequences allowed per group. if queues_by_group is None: - queues_by_group = defaultdict(priority_queue_factory( - max_size=sequences_per_group, - )) + queues_by_group = {} for strain, group in group_by_strain.items(): + # During this first pass, we do not know all possible + # groups will be, so we need to build each group's queue + # as we first encounter the group. + if group not in queues_by_group: + queues_by_group[group] = PriorityQueue( + max_size=sequences_per_group, + ) + queues_by_group[group].add( metadata.loc[strain], priorities[strain], @@ -1411,9 +1438,12 @@ def run(args): sys.exit(1) if queues_by_group is None: - queues_by_group = defaultdict(priority_queue_factory( - max_size=sequences_per_group, - )) + # We know all of the possible groups now from the first pass through + # the metadata, so we can create queues for all groups at once. + queues_by_group = create_queues_by_group( + records_per_group.keys(), + sequences_per_group, + ) # Make a second pass through the metadata, only considering records that # have passed filters. From a30b1a9befd85a8d23ac43348aa57cb01d8396d9 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Fri, 13 Aug 2021 16:20:33 -0700 Subject: [PATCH 04/14] Update for patch release --- CHANGES.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index 066d7be54..5b5993e52 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -2,6 +2,13 @@ ## __NEXT__ +### Bug Fixes + +* filter: Fix parsing of missing data in metadata [#758][] +* filter: Fix probabilistic sampling with small values [#759][] + +[#758]: https://github.com/nextstrain/augur/pull/758 +[#759]: https://github.com/nextstrain/augur/pull/759 ## 12.1.0 (12 August 2021) From 5c89570b456b29bc70377592784f6828eaef8840 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Fri, 13 Aug 2021 16:24:41 -0700 Subject: [PATCH 05/14] version 12.1.1 --- CHANGES.md | 3 +++ augur/__version__.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index 5b5993e52..a62b0ad30 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -2,6 +2,9 @@ ## __NEXT__ + +## 12.1.1 (13 August 2021) + ### Bug Fixes * filter: Fix parsing of missing data in metadata [#758][] diff --git a/augur/__version__.py b/augur/__version__.py index 70b686f24..8536b264e 100644 --- a/augur/__version__.py +++ b/augur/__version__.py @@ -1,4 +1,4 @@ -__version__ = '12.1.0' +__version__ = '12.1.1' def is_augur_version_compatible(version): From 6b13599863ff42bb92482b5eb68eb31d86b8f546 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Mon, 16 Aug 2021 12:21:33 -0700 Subject: [PATCH 06/14] Test grouping ambiguous dates by month Adds a functional test for grouping ambiguous dates by year and month and corresponding metadata, sequence, and sequence index entry for a strain with only a year in the date field. The test fails because the ambiguous month gets generated twice for the same strain during the augur filter command and the group from the first pass through the metadata doesn't exist in the second pass. Confirms #760 --- augur/io.py | 2 +- tests/functional/filter.t | 28 +++- tests/functional/filter/metadata.tsv | 1 + tests/functional/filter/sequence_index.tsv | 1 + tests/functional/filter/sequences.fasta | 179 +++++++++++++++++++++ 5 files changed, 204 insertions(+), 7 deletions(-) diff --git a/augur/io.py b/augur/io.py index 683e57e05..92514109d 100644 --- a/augur/io.py +++ b/augur/io.py @@ -77,7 +77,7 @@ def read_metadata(metadata_file, id_columns=("strain", "name"), chunk_size=None) ... (5, 14) (5, 14) - (1, 14) + (2, 14) """ kwargs = { diff --git a/tests/functional/filter.t b/tests/functional/filter.t index bf5a64996..8fcfe64b7 100644 --- a/tests/functional/filter.t +++ b/tests/functional/filter.t @@ -9,14 +9,14 @@ Force include one South American record by country to get two total records. $ ${AUGUR} filter \ > --metadata filter/metadata.tsv \ - > --exclude-where "region=South America" "region=North America" \ + > --exclude-where "region=South America" "region=North America" "region=Southeast Asia" \ > --include-where "country=Ecuador" \ > --output-strains "$TMP/filtered_strains.txt" > /dev/null $ wc -l "$TMP/filtered_strains.txt" \s*2 .* (re) $ rm -f "$TMP/filtered_strains.txt" -Filter with subsampling, requesting 1 sequence per group (for a group with 3 distinct values). +Filter with subsampling, requesting 1 sequence per group (for a group with 4 distinct values). $ ${AUGUR} filter \ > --metadata filter/metadata.tsv \ @@ -24,7 +24,7 @@ Filter with subsampling, requesting 1 sequence per group (for a group with 3 dis > --sequences-per-group 1 \ > --output-strains "$TMP/filtered_strains.txt" > /dev/null $ wc -l "$TMP/filtered_strains.txt" - \s*3 .* (re) + \s*4 .* (re) $ rm -f "$TMP/filtered_strains.txt" Filter with subsampling, requesting no more than 8 sequences. @@ -225,7 +225,7 @@ Metadata should have the same number of records as the sequences plus a header. \s*5 .* (re) $ rm -f "$TMP/filtered.tsv" -Alternately, exclude only the sequences from Brazil and Colombia (12 - 4 strains). +Alternately, exclude the sequences from Brazil and Colombia (N=4) and records without sequences (N=1) or metadata (N=1). $ ${AUGUR} filter \ > --sequences filter/sequences.fasta \ @@ -234,7 +234,7 @@ Alternately, exclude only the sequences from Brazil and Colombia (12 - 4 strains > --exclude "$TMP/filtered_strains.brazil.txt" "$TMP/filtered_strains.colombia.txt" \ > --output "$TMP/filtered.fasta" > /dev/null $ grep "^>" "$TMP/filtered.fasta" | wc -l - \s*6 (re) + \s*7 (re) $ rm -f "$TMP/filtered.fasta" Try to filter with sequences that don't match any of the metadata. @@ -318,7 +318,7 @@ The query initially filters 3 strains from Colombia, one of which is added back \t1 had no sequence data (esc) \t3 of these were filtered out by the query: "country != 'Colombia'" (esc) \t1 strains were added back because they were in filter/include.txt (esc) - 8 strains passed all filters + 9 strains passed all filters $ diff -u <(sort -k 1,1 filter/filtered_log.tsv) <(sort -k 1,1 "$TMP/filtered_log.tsv") $ rm -f "$TMP/filtered_strains.txt" @@ -337,3 +337,19 @@ The two highest priority strains are in these two years. $ diff -u <(sort -k 2,2rn -k 1,1 filter/priorities.tsv | head -n 2 | cut -f 1) <(sort -k 1,1 "$TMP/filtered_strains.txt") $ rm -f "$TMP/filtered_strains.txt" + +Try to subsample a maximum number of sequences by year and month, given metadata with ambiguous year and month values. +Strains with ambiguous years or months should be dropped and logged. + + $ ${AUGUR} filter \ + > --metadata filter/metadata.tsv \ + > --group-by year month \ + > --subsample-max-sequences 5 \ + > --output-strains "$TMP/filtered_strains.txt" \ + > --output-log "$TMP/filtered_log.tsv" > /dev/null + $ grep "SG_018" "$TMP/filtered_log.tsv" | cut -f 1-2 + SG_018\tskip_group_by_with_ambiguous_month (esc) + SG_018\tsubsampling (esc) + $ grep "COL/FLR_00024/2015" "$TMP/filtered_log.tsv" | cut -f 1-2 + COL/FLR_00024/2015\tskip_group_by_with_ambiguous_year (esc) + COL/FLR_00024/2015\tsubsampling (esc) diff --git a/tests/functional/filter/metadata.tsv b/tests/functional/filter/metadata.tsv index 40e2cca39..3de583bf8 100644 --- a/tests/functional/filter/metadata.tsv +++ b/tests/functional/filter/metadata.tsv @@ -10,3 +10,4 @@ BRA/2016/FC_6706 zika KY785433 2016-04-08 South America Brazil Brazil Brazil gen DOM/2016/BB_0183 zika KY785420 2016-04-18 North America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785420 Zika virus evolution and spread in the Americas Nature 546 (7658), 411-415 (2017) https://www.ncbi.nlm.nih.gov/pubmed/28538734 EcEs062_16 zika KX879603 2016-04-XX South America Ecuador Ecuador Ecuador genbank genome Marquez et al https://www.ncbi.nlm.nih.gov/nuccore/KX879603 First Complete Genome Sequences of Zika Virus Isolated from Febrile Patient Sera in Ecuador Genome Announc 5 (8), e01673-16 (2017) https://www.ncbi.nlm.nih.gov/pubmed/28232448 HND/2016/HU_ME59 zika KY785418 2016-05-13 North America Honduras Honduras Honduras genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785418 Zika virus evolution and spread in the Americas Nature 546 (7658), 411-415 (2017) https://www.ncbi.nlm.nih.gov/pubmed/28538734 +SG_018 zika KY241688 2016 Southeast Asia Singapore Singapore Singapore genbank genome Ho et al https://www.ncbi.nlm.nih.gov/nuccore/KY241688 Outbreak of Zika virus infection in Singapore: an epidemiological, entomological, virological, and clinical analysis Lancet Infect Dis (2017) In press https://www.ncbi.nlm.nih.gov/pubmed/ diff --git a/tests/functional/filter/sequence_index.tsv b/tests/functional/filter/sequence_index.tsv index 16dafa0bd..4a034c607 100644 --- a/tests/functional/filter/sequence_index.tsv +++ b/tests/functional/filter/sequence_index.tsv @@ -10,3 +10,4 @@ DOM/2016/BB_0059 10035 2563 2089 2741 2015 621 6 0 0 0 BRA/2016/FC_6706 10366 2747 2203 2915 2165 329 7 0 0 0 DOM/2016/BB_0183 10621 2910 2343 3099 2269 0 0 0 0 0 EcEs062_16 10812 2960 2388 3158 2306 0 0 0 0 0 +SG_018 10659 2906 2367 3110 2257 19 0 0 0 0 diff --git a/tests/functional/filter/sequences.fasta b/tests/functional/filter/sequences.fasta index 8149494fc..f306f7474 100644 --- a/tests/functional/filter/sequences.fasta +++ b/tests/functional/filter/sequences.fasta @@ -1962,3 +1962,182 @@ ggcctgaactggagatcagctgtggatctccagaagagggactagtggttagaggagacc ccccggaaaacgcaaaacagcatattgacgctgggaaagaccagagactccatgagtttc caccacgctggccgccaggcacagatcgccgaatagcggcggccggtgtggggaaatcca tgggagatcgga +>SG_018 +atgnnnnnnnnnnnnnnnnnnnccggaggattccggattgtcaatatgctaaaacgcgga +gtagcccgtgtgagcccctttgggggcttgaagaggctgccagccggacttctgctgggt +catgggcccatcaggatggtcttggcgattctagcctttttgaggttcacggcaatcaag +ccatcactgggtctcatcaatagatggggttcagtggggaaaaaagaggctatggaaata +ataaagaagttcaagaaagatctggctgccatgctgagaataatcaatgctaggaaggag +aagaagagacgaggcgcagatactagtgtcggaattgttggcctcctgctgaccacagct +atggcagcggaggtcactagacgtgggagtgcatactatatgtacttggacagaagcgat +gctggggaggccatatcttttccaaccacactggggatgaataagtgttatatacagatc +atggatcttggacacatgtgtgatgccaccatgagctatgaatgccctatgctggatgag +ggggtagaaccagatgacgtcgattgttggtgcaacacgacgtcagcttgggttgtgtac +gtaacctgccatcacaaaaaaggtgaagcacggagatctagaagagctgtgacgctcccc +tcccattccactaggaagctgcaaacgcggtcgcagacctggttggagtcaagagaatac +acaaagcacttgattagagtcgaaaattggatattcaggaaccctggcttcgcgttagca +gcagctgccatcgcttggcttttgggaagctcaacgagccaaaaagtcatatacttggtc +atgatactgctgattgccccggcatacagcatcaggtgcataggagtcagcaatagggac +tttgtggaaggtatgtcaggtgggacttgggttgatgttgtcttggaacatggaggttgt +gtcaccgtaatggcacaggacaaaccgactgtcgacatagagctggttacaacaacagtc +agcaacatggcggaggtaagatcctactgctatgaggcatcaatatcggacatggcttcg +gacagccgctgcccaacacaaggtgaagcctaccttgacaagcaatcagatacccaatat +gtctgcaaaagaacgttagtggacagaggctggggaaatggatgtggactttttggaaaa +gggagcctggtgacatgcgctaagtttgcatgctccaagaaaatgaccgggaagagcatc +cagccagagaatctggagtaccggataatgctgtcagtccatggctcccagcacagtggg +atgatcgttaatgacacaggacatgaaactgatgagaatagagcgaaggttgagataacg +cccaattcaccaagagccgaagccaccctggggggttttggaagcctaggacttgattgt +gaaccgaggacaggccttgacttttcagatttgtattacttgactatgaataacaagcac +tggttggttcacaaggagtggttccacgacattccattaccttggcacgctggggcagac +accggaactccacactggaacaacaaagaagcactggtagagttcaaggacgcacatgcc +aaaaggcaaactgtcgtggttctagggagtcaagaaggagcagttcacacggcccttgct +ggagctctggaggctgagatggatggtgcaaagggaaggctgtcctctggccacttgaaa +tgtcgcctgaaaatggacaaacttagattgaagggcgtgtcatactccttgtgtaccgca +gcgttcacattcaccaagatcccggctgaaacactgcacgggacagtcacagtggaggta +cagtacgcagggacagatggaccctgcaaggttccagctcagatggcggtggacatgcaa +actctgaccccagttgggaggttgataaccgctaaccccgtaatcactgaaagcactgag +aactctaagatgatgctggaacttgatccaccatttggggactcttacattgtcatagga +gtcggggagaagaagatcacccaccactggcacaggagtggcagtaccattggaaaagca +tttgaagccactgtgagaggtgccaagagaatggcagtcttgggagacacagcctgggac +tttggatcagttggaggcgctctcaactcattgggcaagggcatccatcaaatttttgga +gcagctttcaaatcattgtttggaggaatgtcctggttctcacaaattctcattggaacg +ttgctgatgtggttgggcctgaacacaaagaatggatctatttcccttatgtgcttggcc +ttagggggagtgttgatcttcttatccacagccgtctctgctgatgtggggtgctcggtg +gacttctcaaagaaggaaacgagatgcggtacaggggtgttcgtctataacgacgttgaa +gcctggagggacaggtacaagtaccatcctgactccccccgtagattggcagcagcagtc +aagcaagcctgggaagatggtatctgtgggatctcctctgtttcaagaatggaaaacatc +atgtggagatcagtagaaggggagctcaacgcaatcctggaagaaaatggagttcaactg +acggtcgttgtgggatctgtaaaaaaccccatgtggagaggtccacagagactgcccgtg +cctgtgaacgagctgccccacggctggaaggcttgggggaaatcgtacttcgtcagagca +gcaaagacaaataacagctttgtcgtggatggtgacacactgaaggaatgcccactcaaa +catagagcatggaacagctttcttgtggaggatcatgggttcggggtatttcacactagt +gtctggctcaaggttagagaagattattcattagagtgtgatcctgccgttattggaaca +gctgttaagggaaaggaggctgcacacagtgatctaggctactggattgagagtgagaag +aatgacacatggaggctgaagagggcccacctgatcgagatgaaaacatgtgaatggcca +aagtcccacacattgtggacagatggaatagaagagagtgatctgatcatacccaagtct +ttagctgggccactcagccatcacaacaccagagagggctacaggacccaaatgaaaggg +ccatggcacagtgaagagcttgaaattcggtttgaggaatgcccaggcactaaggtccac +gtggaggaaacatgtggaacaagaggaccgtctctgagatcaaccactgcaagcggaagg +gtgatcgaggaatggtgctgcagggagtgcacaatgcccccactgtcgttccgggctaaa +gatggctgttggtatggaatggagataaggcccaggaaagaaccagaaagtaacttagta +aggtcaatggtgactgcaggatcaactgatcacatggatcacttctcccttggagtgctt +gtgattctgctcatggtgcaggaagggctgaagaagagaatgaccacaaagatcatcata +agcacatcaatggcagtgctggtagctatgatcctgggaggattttcaatgagtgacctg +gctaagcttgcaattttgatgggtgccaccttcgcggaaatgaacactggaggagatgta +gctcatctggcgctgatagcggcattcaaagtcagaccagcgttgctggtatctttcatc +ttcagagctaattggacaccccgtgaaagcatgctgctggccttggcctcgtgtcttttg +caaactgcgatttccgccttggaaggcgacctgatggttctcatcaatggttttgctctg +gcctggttggcaatacgagcgatggttgttccacgcactgacaacatcaccttggcaatc +ctggctgctctgacaccactggcccggggcacactgcttgtggcgtggagagcaggcctt +gctacttgcggggggttcatgctcctctctctgaagggaaaaggcagtgtgaagaagaac +ttaccatttgtcatggccctgggactaaccgctgtgaggctagtcgaccccatcaacgtg +gtgggactgctgttgctcacaaggagtgggaagcggagctggccccctagcgaagtactc +acagctgttggcctgatatgcgcattggctggagggttcgccaaggcagatatagagatg +gctgggcccatggccgcggttggtctgctaattgtcagttacgtggtctcaggaaagagt +gtggacatgtacattgaaagagcaggtgacatcacatgggaaaaagatgcggaagtcact +ggaaacagtccccggctcgatgtggcactagatgagagtggtgatttctccctggtggag +gatgacggtccccccatgagagagatcatactcaaagtggtcctgatgaccatctgtggc +atgaacccaatagccataccctttgcagctggagcgtggtacgtatacgtgaagactgga +aaaaggagtggtgctctatgggatgtgcctgctcccaaggaagtaaaaaagggggagacc +acagatggagtgtacagagtaatgactcgtagactgctaggttcaacacaagttggagtg +ggagttatgcaagagggggtctttcacactatgtggcacgtcacaaaaggatccgcactg +agaagcggtgaagggagacttgatccatactggggagatgtcaagcaggatctggtgtca +tactgtggtccatggaagctagatgccgcctgggacgggcacagcgaggtgcagctcttg +gccgtgccccccggagagagagcgaggaacatccagactctgcccggaatatttaagaca +aaggatggggacattggagcggttgcgctggactacccagcaggaacttcaggatctcca +atcctagacaagtgtgggagagtgataggactctatggcaatggggtcgtgatcaaaaat +gggagttatgttagtgccatcacccaagggaggagggaggaagagactcctgttgagtgc +ttcgagccttcgatgctgaagaagaagcagctaactgtcttagacttgcatcctggagct +gggaaaaccaggagagttcttcctgaaatagtccgtgaagccataaaaacaagactccgt +actgtgatcttagctccaactagggttgtcgctgctgaaatggaggaagcccttagaggg +cttccagtgcgttatatgacaacagcagtcaatgtcacccactctgggacagaaatcgtt +gacttaatgtgccatgccaccttcacttcacgtctactacagccaatcagagtccccaac +tataatctgtatattatggatgaggcccacttcacagatccctcaagtatagcagcaaga +ggatacatttcaacaagggttgagatgggcgaggcggctgccatcttcatgaccgccacg +ccaccaggaacccgtgacgcatttccggactccaactcaccaattatggacaccgaagtg +gaagtcccggagagagcctggagctcaggctttgattgggtgacggaccattctggaaaa +acagtttggtttgttccaagcgtgaggaacggcaatgagatcgcagcttgtctgacgaag +gctggaaaacgggtcatacagctcagcagaaagacttttgagacagagttccagaaaaca +aaacatcaagagtgggactttgtcgtgacaactgacatttcagagatgggcgccaacttt +aaagctgaccgtgtcatagattccaggagatgcctaaagccggtcatacttgatggcgag +agagtcattctggctggacccatgcctgtcacacatgccagcgctgcccagaggaggggg +cgcataggcaggaatcccaacaaacctggagatgagtatctgtatggaggtgggtgcgca +gagactgatgaagaccatgcacactggcttgaagcaagaatgctccttgacaatatctac +ctccaagatggcctcatagcctcgctctatcgacctgaggccgacaaagtagcagccatt +gagggagagttcaagcttaggacggagcaaaggaagacctttgtggaactcatgaaaaga +ggagatcttcctgtttggctggcctatcaggttgcatctgccggaataacctacacagat +agaagatggtgctttgatggcacgaccaacaacaccataatggaagacagtgtgccggca +gaggtgtggaccagatacggagagaaaagagtgctcaaaccgaggtggatggacgccaga +gtttgttcagatcatgcggccctgaagtcattcaaggagtttgccgctgggaaaagagga +gcggcttttggagtgatggaagccctgggaacactgccaggacacatgacagagagattc +caggaagccattgacaacctcgctgtgctcatgcgggcagagactggaagcagaccttac +aaagccgcggcggcccaattgccggagaccctagagaccattatgcttttggggttgctg +ggaacagtctcgctgggaatctttttcgtcttgatgcggaacaagggcatagggaagatg +ggctttggaatggtgactcttggggccagcgcatggctcatgtggctctcggaaattgag +ccagccagaattgcatgtgtcctcattgttgtgttcctattgctggtggtgctcatacct +gagccagaaaagcaaagatctccccaggacaaccaaatggcaatcatcatcatggtagca +gtaggtcttctgggcttgattaccgccaatgaactcggatggttggagagaacaaagagt +gacctaagccatctaatgggaaggagagaggagggggcaaccataggattctcaatggac +attgacctgcggccagcctcagcttgggccatctatgctgccctgacaactttcattacc +ccagccgtccaacatgcagtgaccacttcatacaacaactactccttaatggcgatggcc +acgcaagctggagtgttgtttggtatgggcaaagggatgccattctacgcatgggacttt +ggagtcccgctgctaatgataggttgctactcacaattaacacccctgaccctaatagtg +gccatcattttgctcgtggcgcactacatgtacttgatcccagggctgcaggcagcagct +gcgcgtgctgcccagaagagaacggcagctggcatcatgaagaaccctgttgtggatgga +atagtggtgactgacattgacacaatgacaattgacccccaagtggagaaaaagatggga +caggtgctactcatagcagtagccgtctccagcgccatactgtcgcggaccgcctggggg +tggggggaggctggggccctgatcacagctgcaacttccactttgtgggaaggctctccg +aacaagtactggaactcctctacagccacttcactgtgtaacatttttaggggaagttac +ttggctggagcttctctaatctacacagtaacaagaaacgctggtttggtcaagagacgt +gggggtggaacaggagagaccctgggagagaaatggaaggcccgcttgaaccagatgtcg +gccctggagttctactcctacaaaaagtcaggcatcaccgaggtgtgcagagaagaggcc +cgccgcgccctcaaggacggtgtggcaacgggaggccatgctgtgtcccggggaagtgca +aagctgagatggttggtggagcggggatacctgcagccctatggaaaggtcattgatctt +ggatgtggcagagggggctggagttactacgccgccaccatccgcaaagttcaagaagtg +aaaggatacacaaaaggaggccctggtcatgaagaacccatgttggtgcaaagctatggg +tggaacatagtccgtcttaagagtggggtggacgtctttcatatggcggctgagccgtgt +gacacgttgctgtgtgacataggtgagtcatcatctagtcctgaagtggaagaagcacgg +acgctcagagtcctctccatggtgggggattggcttgaaaaaagaccaggagccttttgt +ataaaagtgttgtgcccatacaccagcactatgatggaaaccctggagcgactgcagcgt +aggtatgggggaggactggtcagagtgccactctcccgcaactcaacacatgagatgtac +tgggtctctggagcgaaaagcaacaccataaaaagtgtgtccaccacgagccagctcctc +ttggggcgcatggacgggcccaggaggccagtgaaatatgaggaggatgtgaatctcggc +tctggcacgcgggctgtggtaagctgcgctgaagctcccaacatgaagatcattggtaac +cgcattgaaaggatccgcagtgagcacgcggaaacgtggttctttgacgagaaccaccca +tataggacatgggcttaccatggaagctatgaggcccccacacaagggtcagcgtcctct +ctaataaacggggttgtcaggctcctgtcaaaaccctgggatgtggtgactggagtcaca +ggaatagccatgaccgacaccacaccgtatggtcagcaaagagttttcaaggaaaaagtg +gacactagggtgccagacccccaagaaggcactcgtcaggttatgagcatggtctcttcc +tggttgtggaaagagctaggcaaacacaaacggccacgagtctgtaccaaagaagagttc +atcaacaaggttcgtagcaatgcagcattaggggcaatatttgaagaggaaaaagagtgg +aagactgcagtggaagctgtgaacgatccaaggttctgggctctagtggacaaggaaaga +gagcaccacctgagaggagagtgccagagctgtgtgtacaacatgatgggaaaaagagaa +aagaaacaaggggaatttggaaaggccaagggcagccgcgccatctggtatatgtggcta +ggggctagatttctagagttcgaagcccttggattcttgaacgaggatcactggatgggg +agagagaactcaggaggtggtgttgaagggctgggattacaaagactcggatatgtccta +gaagagatgagtcgcataccaggaggaaggatgtatgcagatgacactgctggctgggac +acccgcatcagcaggtttgatctggagaatgaagctctaatcaccaaccaaatggagaaa +gggcacagggccttggcattggccataatcaagtacacataccaaaacaaagtggtaaag +gtccttagaccagctgaaaaagggaagacagtgatggacattatttcaagacaagaccaa +agggggagcggacaagttgtcacttacgctctcaacacatttaccaacctagtggtgcaa +ctcattcggaatatggaggctgaggaagttctagagatgcaagacttgtggctgctgcgg +aggtcagagaaagtgaccaactggttgcagagcaacggatgggataggctcaaacgaatg +gcagtcagtggagatgattgcgttgtgaagccaattgatgataggtttgcacatgccctc +aggttcttgaatgatatggggaaagttaggaaggacacacaagaatggaaaccctcaact +ggatgggacaactgggaagaagttccgttttgctcccaccatttcaacaagctccatctc +aaggacgggaggtccattgtggttccctgccgccaccaagatgaactgattggccgggcc +cgcgtctctccaggggcgggatggagcatccgggagactgcttgcctagcaaaatcatat +gcgcagatgtggcagctcctttatttccacagaagggacctccgactgatggccaatgcc +atttgttcatctgtgccagttgactgggttccaactgggagaactacctggtcaatccat +ggaaagggagaatggatgaccactgaagacatgcttgtggtgtggaacagagtgtggatt +gaggagaacgaccacatggaagacaagaccccagttacgaaatggacagacattccctat +ttgggaaaaagggaagacttatggtgtggatctctcatagggcacagaccgcgcaccacc +tgggctgagaacattaaaaacacagtcaacatggtgcgcagaatcataggtgatgaagaa +aagtacatggactacctatccacccaagttcgctacttgggtgaggaagggtctacacct +ggagtgctgtaagcaccaatcttagtgttgtcaggcctgctagtcagccacagcttgggg +aaagctgtgcagcctgtgacccccccaggagaagctgggaaaccaagcccatagtcaggc +cgagaacgccatggcacggaagaagccatgctgcctgtgagcccctcagaggacactgag +tcaaaaaaccccacgcgcttggaggcgcaggatgggaaaagaaggtggcgaccttcccca +cccttcaatctggggcctgaactggagatcagctgtggatctccagaagagggactagtg +gttagaggagaccccccggaaaacgcaaaacagcatattgacgctgggaaagaccagaga +ctccatgagtttccaccacgctggccgccaggcacagat From 54051140e5510bd64fdd70d44c711edef388dd7a Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Mon, 16 Aug 2021 13:09:09 -0700 Subject: [PATCH 07/14] Track strains skipped during grouping Updates the interface to the `get_groups_for_subsampling` function such that it returns a list of strains that have been skipped and the reason why (following the pattern of filtered and force-included strains lists from the `apply_filters` function). We count the number of strains skipped for different reasons, report this number in the final output, and also optionally log these strains to the output log table. This change allows users to track why specific strains were excluded from an analysis after the fact without needing to refer to the logs for stdout or stderr. --- augur/filter.py | 66 +++++++++++++++++++++++++++++++++------ tests/functional/filter.t | 1 - 2 files changed, 57 insertions(+), 10 deletions(-) diff --git a/augur/filter.py b/augur/filter.py index a39dbcb68..2e57dabf7 100644 --- a/augur/filter.py +++ b/augur/filter.py @@ -820,23 +820,30 @@ def get_groups_for_subsampling(strains, metadata, group_by=None): ------- dict : A mapping of strain names to tuples corresponding to the values of the strain's group. + list : + A list of dictionaries with strains that were skipped from grouping and the reason why (see also: `apply_filters` output). >>> strains = ["strain1", "strain2"] >>> metadata = pd.DataFrame([{"strain": "strain1", "date": "2020-01-01", "region": "Africa"}, {"strain": "strain2", "date": "2020-02-01", "region": "Europe"}]).set_index("strain") >>> group_by = ["region"] - >>> get_groups_for_subsampling(strains, metadata, group_by) + >>> group_by_strain, skipped_strains = get_groups_for_subsampling(strains, metadata, group_by) + >>> group_by_strain {'strain1': ('Africa',), 'strain2': ('Europe',)} + >>> skipped_strains + [] If we group by year or month, these groups are calculated from the date string. >>> group_by = ["year", "month"] - >>> get_groups_for_subsampling(strains, metadata, group_by) + >>> group_by_strain, skipped_strains = get_groups_for_subsampling(strains, metadata, group_by) + >>> group_by_strain {'strain1': (2020, (2020, 1)), 'strain2': (2020, (2020, 2))} If we omit the grouping columns, the result will group by a dummy column. - >>> get_groups_for_subsampling(strains, metadata) + >>> group_by_strain, skipped_strains = get_groups_for_subsampling(strains, metadata) + >>> group_by_strain {'strain1': ('_dummy',), 'strain2': ('_dummy',)} If we try to group by columns that don't exist, we get an error. @@ -851,9 +858,31 @@ def get_groups_for_subsampling(strains, metadata, group_by=None): grouping to continue and print a warning message to stderr. >>> group_by = ["year", "month", "missing_column"] - >>> get_groups_for_subsampling(strains, metadata, group_by) + >>> group_by_strain, skipped_strains = get_groups_for_subsampling(strains, metadata, group_by) + >>> group_by_strain {'strain1': (2020, (2020, 1), 'unknown'), 'strain2': (2020, (2020, 2), 'unknown')} + If we group by year month and some records don't have that information in + their date fields, we should skip those records from the group output and + track which records were skipped for which reasons. + + >>> metadata = pd.DataFrame([{"strain": "strain1", "date": "", "region": "Africa"}, {"strain": "strain2", "date": "2020-02-01", "region": "Europe"}]).set_index("strain") + >>> group_by_strain, skipped_strains = get_groups_for_subsampling(strains, metadata, ["year"]) + >>> group_by_strain + {'strain2': (2020,)} + >>> skipped_strains + [{'strain': 'strain1', 'filter': 'skip_group_by_with_ambiguous_year', 'kwargs': ''}] + + Similarly, if we group by month, we should skip records that don't have + month information in their date fields. + + >>> metadata = pd.DataFrame([{"strain": "strain1", "date": "2020", "region": "Africa"}, {"strain": "strain2", "date": "2020-02-01", "region": "Europe"}]).set_index("strain") + >>> group_by_strain, skipped_strains = get_groups_for_subsampling(strains, metadata, ["month"]) + >>> group_by_strain + {'strain2': ((2020, 2),)} + >>> skipped_strains + [{'strain': 'strain1', 'filter': 'skip_group_by_with_ambiguous_month', 'kwargs': ''}] + """ if group_by: groups = group_by @@ -861,6 +890,7 @@ def get_groups_for_subsampling(strains, metadata, group_by=None): groups = ("_dummy",) group_by_strain = {} + skipped_strains = [] for strain in strains: skip_strain = False group = [] @@ -875,7 +905,11 @@ def get_groups_for_subsampling(strains, metadata, group_by=None): try: year = int(m["date"].split('-')[0]) except: - print(f"WARNING: no valid year, skipping strain '{strain}' with date value of '{m['date']}'.", file=sys.stderr) + skipped_strains.append({ + "strain": strain, + "filter": "skip_group_by_with_ambiguous_year", + "kwargs": "", + }) skip_strain = True break if c=='month': @@ -928,7 +962,7 @@ def get_groups_for_subsampling(strains, metadata, group_by=None): file=sys.stderr, ) - return group_by_strain + return group_by_strain, skipped_strains class PriorityQueue: @@ -1352,12 +1386,20 @@ def run(args): # count the number of records per group. First, we need to get # the groups for the given records. try: - group_by_strain = get_groups_for_subsampling( + group_by_strain, skipped_strains = get_groups_for_subsampling( seq_keep, metadata, group_by, ) + # Track strains skipped during grouping, so users know why those + # strains were excluded from the analysis. + for skipped_strain in skipped_strains: + filter_counts[(skipped_strain["filter"], skipped_strain["kwargs"])] += 1 + + if args.output_log: + output_log_writer.writerow(skipped_strain) + if args.subsample_max_sequences and records_per_group is not None: # Count the number of records per group. We will use this # information to calculate the number of sequences per group @@ -1458,7 +1500,7 @@ def run(args): # during the first pass, but we want to minimize overall memory # usage at the moment. seq_keep = set(metadata.index.values) & valid_strains - group_by_strain = get_groups_for_subsampling( + group_by_strain, skipped_strains = get_groups_for_subsampling( seq_keep, metadata, group_by, @@ -1594,11 +1636,17 @@ def run(args): "filter_by_date": "{count} of these were dropped because of their date (or lack of date)", "filter_by_sequence_length": "{count} of these were dropped because they were shorter than minimum length of {min_length}bp", "filter_by_non_nucleotide": "{count} of these were dropped because they had non-nucleotide characters", + "skip_group_by_with_ambiguous_year": "{count} were dropped during grouping due to ambiguous year information", + "skip_group_by_with_ambiguous_month": "{count} were dropped during grouping due to ambiguous month information", "include": "{count} strains were added back because they were in {include_file}", "include_by_include_where": "{count} sequences were added back because of '{include_where}'", } for (filter_name, filter_kwargs), count in filter_counts.items(): - parameters = dict(json.loads(filter_kwargs)) + if filter_kwargs: + parameters = dict(json.loads(filter_kwargs)) + else: + parameters = {} + parameters["count"] = count print("\t" + report_template_by_filter_name[filter_name].format(**parameters)) diff --git a/tests/functional/filter.t b/tests/functional/filter.t index 8fcfe64b7..2aada6e39 100644 --- a/tests/functional/filter.t +++ b/tests/functional/filter.t @@ -333,7 +333,6 @@ The two highest priority strains are in these two years. > --priority filter/priorities.tsv \ > --sequences-per-group 1 \ > --output-strains "$TMP/filtered_strains.txt" > /dev/null - WARNING: no valid year, skipping strain 'COL/FLR_00024/2015' with date value of ''. $ diff -u <(sort -k 2,2rn -k 1,1 filter/priorities.tsv | head -n 2 | cut -f 1) <(sort -k 1,1 "$TMP/filtered_strains.txt") $ rm -f "$TMP/filtered_strains.txt" From aa3d91f251916307c1b83f53a683e1c64ddf0380 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Mon, 16 Aug 2021 13:11:41 -0700 Subject: [PATCH 08/14] Skip strains with missing months during group by Instead of randomly generating month values for strains that do not have any month information, skip these strains and log the reason why they were skipped. This is a breaking change to the behavior of augur filter's interface for users who rely on year-only date records to be grouped anyway. However, this change ensures that grouping results are more stable and predictable across runs. Fixes #760 --- augur/filter.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/augur/filter.py b/augur/filter.py index 2e57dabf7..7d6ba2898 100644 --- a/augur/filter.py +++ b/augur/filter.py @@ -916,7 +916,14 @@ def get_groups_for_subsampling(strains, metadata, group_by=None): try: month = int(m["date"].split('-')[1]) except: - month = random.randint(1,12) + skipped_strains.append({ + "strain": strain, + "filter": "skip_group_by_with_ambiguous_month", + "kwargs": "", + }) + skip_strain = True + break + group.append((year, month)) else: group.append(year) From 9a17705fdf9bd3db5d2e2ba14298adb9552f418a Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Mon, 16 Aug 2021 14:17:24 -0700 Subject: [PATCH 09/14] Validate group-by arguments Validate group-by arguments to avoid downstream exceptions caused by missing number of sequences per group or maximum sequences. Confirms the behavior with a functional test. --- augur/filter.py | 8 ++++++++ tests/functional/filter.t | 10 ++++++++++ 2 files changed, 18 insertions(+) diff --git a/augur/filter.py b/augur/filter.py index 7d6ba2898..a92faff9b 100644 --- a/augur/filter.py +++ b/augur/filter.py @@ -1202,6 +1202,14 @@ def validate_arguments(args): file=sys.stderr) return False + # If user requested grouping, confirm that other required inputs are provided, too. + if args.group_by and not any((args.sequences_per_group, args.subsample_max_sequences)): + print( + "ERROR: You must specify a number of sequences per group or maximum sequences to subsample.", + file=sys.stderr + ) + return False + return True diff --git a/tests/functional/filter.t b/tests/functional/filter.t index 2aada6e39..c949e2652 100644 --- a/tests/functional/filter.t +++ b/tests/functional/filter.t @@ -352,3 +352,13 @@ Strains with ambiguous years or months should be dropped and logged. $ grep "COL/FLR_00024/2015" "$TMP/filtered_log.tsv" | cut -f 1-2 COL/FLR_00024/2015\tskip_group_by_with_ambiguous_year (esc) COL/FLR_00024/2015\tsubsampling (esc) + +Try to group data without any grouping arguments. +This should fail with a helpful error message. + + $ ${AUGUR} filter \ + > --metadata filter/metadata.tsv \ + > --group-by year month \ + > --output-strains "$TMP/filtered_strains.txt" > /dev/null + ERROR: You must specify a number of sequences per group or maximum sequences to subsample. + [1] From d7dcf91c7389c424cef1a1634c08a837ff17836a Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Mon, 16 Aug 2021 14:55:44 -0700 Subject: [PATCH 10/14] Fix case where no strains are dropped by grouping Handles an edge case present in the TB functional tests where all strains have ambiguous months and all strains were filtered by the initial sequence index filter or by the new ambiguous month filter from the grouping logic. --- augur/filter.py | 1 + 1 file changed, 1 insertion(+) diff --git a/augur/filter.py b/augur/filter.py index a92faff9b..5f1ca4d0d 100644 --- a/augur/filter.py +++ b/augur/filter.py @@ -1529,6 +1529,7 @@ def run(args): # If we have any records in queues, we have grouped results and need to # stream the highest priority records to the requested outputs. + num_excluded_subsamp = 0 if queues_by_group: # Populate the set of strains to keep from the records in queues. subsampled_strains = set() From d63478c994b312b091811b23bc0fceaeacf454af Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Mon, 16 Aug 2021 15:17:19 -0700 Subject: [PATCH 11/14] Remove skipped strains from valid strains Since strains skipped during grouping are effectively filtered from the analysis, they need to be removed the set of valid strains. When we don't remove them from this list and subsampling doesn't occur (there are no valid samples), Augur reports those skipped samples as passing all filters when all samples were actually dropped. This edge case occurred when running the TB functional tests where all strains have ambiguous month values and get dropped. --- augur/filter.py | 1 + tests/functional/filter.t | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/augur/filter.py b/augur/filter.py index 5f1ca4d0d..5a1537742 100644 --- a/augur/filter.py +++ b/augur/filter.py @@ -1411,6 +1411,7 @@ def run(args): # strains were excluded from the analysis. for skipped_strain in skipped_strains: filter_counts[(skipped_strain["filter"], skipped_strain["kwargs"])] += 1 + valid_strains.remove(skipped_strain["strain"]) if args.output_log: output_log_writer.writerow(skipped_strain) diff --git a/tests/functional/filter.t b/tests/functional/filter.t index c949e2652..ba15d5acf 100644 --- a/tests/functional/filter.t +++ b/tests/functional/filter.t @@ -348,10 +348,8 @@ Strains with ambiguous years or months should be dropped and logged. > --output-log "$TMP/filtered_log.tsv" > /dev/null $ grep "SG_018" "$TMP/filtered_log.tsv" | cut -f 1-2 SG_018\tskip_group_by_with_ambiguous_month (esc) - SG_018\tsubsampling (esc) $ grep "COL/FLR_00024/2015" "$TMP/filtered_log.tsv" | cut -f 1-2 COL/FLR_00024/2015\tskip_group_by_with_ambiguous_year (esc) - COL/FLR_00024/2015\tsubsampling (esc) Try to group data without any grouping arguments. This should fail with a helpful error message. From c6ea93f9ec795ef39cf8b5f5a05d8414ee2e2039 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Mon, 16 Aug 2021 15:19:31 -0700 Subject: [PATCH 12/14] Don't group TB samples by month TB samples all have ambiguous month values, so the new group-by logic that skips ambiguous records when date information isn't available causes all records to get dropped when grouping by year and month. Instead, we can group by year alone and get a similar effect to the original test. --- tests/builds/tb/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/builds/tb/Snakefile b/tests/builds/tb/Snakefile index 7654d0e99..ff1c76a2b 100644 --- a/tests/builds/tb/Snakefile +++ b/tests/builds/tb/Snakefile @@ -40,7 +40,7 @@ rule filter: "results/filtered.vcf.gz" params: sequences_per_group = 10, - group_by = "year month", + group_by = "year", min_len = 200000 shell: """ From 978cac2d38fa2d1936b16fa9bd85a468dda7bc01 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Tue, 17 Aug 2021 08:53:21 -0700 Subject: [PATCH 13/14] Update change log for major release --- CHANGES.md | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index a62b0ad30..c5c4b2f37 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -2,13 +2,22 @@ ## __NEXT__ +### Major Changes + +* filter: Skip metadata records with ambiguous month information in the `date` column when grouping by month instead of randomly generating month values for those records. This change alters the behavior of the `filter` command for metadata with ambiguous month values. For these data, consider using `--group-by year` instead of `--group-by year month`. [#761][] (@huddlej) + +### Features + +* filter: When grouping by year or month, report the number of strains skipped due to ambiguous year and month both in the summary report at the end of filtering and in the `--output-log` contents [#761][] (@huddlej) + +[#761]: https://github.com/nextstrain/augur/pull/761 ## 12.1.1 (13 August 2021) ### Bug Fixes -* filter: Fix parsing of missing data in metadata [#758][] -* filter: Fix probabilistic sampling with small values [#759][] +* filter: Fix parsing of missing data in metadata [#758][] (@huddlej) +* filter: Fix probabilistic sampling with small values [#759][] (@huddlej) [#758]: https://github.com/nextstrain/augur/pull/758 [#759]: https://github.com/nextstrain/augur/pull/759 From be57dbed62c90276cca8ce8d8cd999fbb9bd7c62 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Tue, 17 Aug 2021 08:54:23 -0700 Subject: [PATCH 14/14] version 13.0.0 --- CHANGES.md | 3 +++ augur/__version__.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index c5c4b2f37..3d2d39c62 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -2,6 +2,9 @@ ## __NEXT__ + +## 13.0.0 (17 August 2021) + ### Major Changes * filter: Skip metadata records with ambiguous month information in the `date` column when grouping by month instead of randomly generating month values for those records. This change alters the behavior of the `filter` command for metadata with ambiguous month values. For these data, consider using `--group-by year` instead of `--group-by year month`. [#761][] (@huddlej) diff --git a/augur/__version__.py b/augur/__version__.py index 8536b264e..8e7a3677b 100644 --- a/augur/__version__.py +++ b/augur/__version__.py @@ -1,4 +1,4 @@ -__version__ = '12.1.1' +__version__ = '13.0.0' def is_augur_version_compatible(version):