From a1bfce484097fb3e1638b3c507793f3b8c6270c1 Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Fri, 13 Aug 2021 13:24:34 -0700
Subject: [PATCH 01/14] Skip parsing of missing data in metadata

Fixes an issue with metadata parsing where pandas treats missing
data (e.g., empty strings) as `NaN` values, missing date values (which
appear in some GenBank SARS-CoV-2 records) get interpretted as `float`
values, and downstream functions that expect strings (like
`is_date_ambiguous`) break. This commit resolves the issue by restoring
Augur's original behavior which was to fill all missing values in a data
frame with empty strings [1]. However, we use a slightly different
approach here of asking pandas not to parse missing data at all through
the `na_filter=False` argument. This argument has the same effect as the
previous implementation, but it only needs to be written once to apply
for all calls of `read_metadata`, whereas the `fillna` approach would
need to be applied to data frames or data frame chunks in different
contexts.

To recreate the original issue, this commit updates the functional tests
for augur filter to include a metadata record with a missing date
column and updates the other parts of the functional tests accordingly.

[1] https://github.com/nextstrain/augur/blob/15dfc8b/augur/util_support/metadata_file.py#L97
---
 augur/io.py                          | 1 +
 tests/functional/filter.t            | 8 ++++----
 tests/functional/filter/metadata.tsv | 2 +-
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/augur/io.py b/augur/io.py
index 51567b4ac..683e57e05 100644
--- a/augur/io.py
+++ b/augur/io.py
@@ -84,6 +84,7 @@ def read_metadata(metadata_file, id_columns=("strain", "name"), chunk_size=None)
         "sep": None,
         "engine": "python",
         "skipinitialspace": True,
+        "na_filter": False,
     }
 
     if chunk_size:
diff --git a/tests/functional/filter.t b/tests/functional/filter.t
index c9cbe5285..db7b9ef44 100644
--- a/tests/functional/filter.t
+++ b/tests/functional/filter.t
@@ -113,10 +113,10 @@ Filter using only metadata without sequence input or output and save results as
   >  --min-length 10500 \
   >  --output-metadata "$TMP/filtered_metadata.tsv" > /dev/null
 
-Output should include the 8 sequences matching the filters and a header line.
+Output should include the 7 sequences matching the filters and a header line.
 
   $ wc -l "$TMP/filtered_metadata.tsv"
-  \s*9 .* (re)
+  \s*8 .* (re)
   $ rm -f "$TMP/filtered_metadata.tsv"
 
 Filter using only metadata and save results as a list of filtered strains.
@@ -128,10 +128,10 @@ Filter using only metadata and save results as a list of filtered strains.
   >  --min-length 10500 \
   >  --output-strains "$TMP/filtered_strains.txt" > /dev/null
 
-Output should include only the 8 sequences matching the filters (without a header line).
+Output should include only the 7 sequences matching the filters (without a header line).
 
   $ wc -l "$TMP/filtered_strains.txt"
-  \s*8 .* (re)
+  \s*7 .* (re)
   $ rm -f "$TMP/filtered_strains.txt"
 
 Filter using only metadata without a sequence index.
diff --git a/tests/functional/filter/metadata.tsv b/tests/functional/filter/metadata.tsv
index dc66a193c..40e2cca39 100644
--- a/tests/functional/filter/metadata.tsv
+++ b/tests/functional/filter/metadata.tsv
@@ -1,5 +1,5 @@
 strain	virus	accession	date	region	country	division	city	db	segment	authors	url	title	journal	paper_url
-COL/FLR_00024/2015	zika	MF574569	2015-12-XX	South America	Colombia	Colombia	Colombia	genbank	genome	Pickett et al	https://www.ncbi.nlm.nih.gov/nuccore/MF574569	Direct Submission	Submitted (28-JUL-2017) J. Craig Venter Institute, 9704 Medical Center Drive, Rockville, MD 20850, USA	https://www.ncbi.nlm.nih.gov/pubmed/
+COL/FLR_00024/2015	zika	MF574569		South America	Colombia	Colombia	Colombia	genbank	genome	Pickett et al	https://www.ncbi.nlm.nih.gov/nuccore/MF574569	Direct Submission	Submitted (28-JUL-2017) J. Craig Venter Institute, 9704 Medical Center Drive, Rockville, MD 20850, USA	https://www.ncbi.nlm.nih.gov/pubmed/
 PRVABC59	zika	KU501215	2015-12-XX	North America	Puerto Rico	Puerto Rico	Puerto Rico	genbank	genome	Lanciotti et al	https://www.ncbi.nlm.nih.gov/nuccore/KU501215	Phylogeny of Zika Virus in Western Hemisphere, 2015	Emerging Infect. Dis. 22 (5), 933-935 (2016)	https://www.ncbi.nlm.nih.gov/pubmed/27088323
 COL/FLR_00008/2015	zika	MF574562	2015-12-XX	South America	Colombia	Colombia	Colombia	genbank	genome	Pickett et al	https://www.ncbi.nlm.nih.gov/nuccore/MF574562	Direct Submission	Submitted (28-JUL-2017) J. Craig Venter Institute, 9704 Medical Center Drive, Rockville, MD 20850, USA	https://www.ncbi.nlm.nih.gov/pubmed/
 Colombia/2016/ZC204Se	zika	KY317939	2016-01-06	South America	Colombia	Colombia	Colombia	genbank	genome	Quick et al	https://www.ncbi.nlm.nih.gov/nuccore/KY317939	Multiplex PCR method for MinION and Illumina sequencing of Zika and other virus genomes directly from clinical samples	Nat Protoc 12 (6), 1261-1276 (2017)	https://www.ncbi.nlm.nih.gov/pubmed/28538739

From beb93cd82e97582bb80e02631fb06730d20c1214 Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Fri, 13 Aug 2021 13:31:00 -0700
Subject: [PATCH 02/14] Skip strains in group-by year when date is missing

Fixes a previously hidden bug where grouping by `year` in augur filter
would include strains with missing dates as a separate additional group
with a missing year value. The original code used a `continue` statement
that was intended to continue to the next strain, but because this
statement was inside another for loop, it only continued to the new
group.

This commit fixes the issue by assigning an explicit boolean variable
that tracks whether a strain should be skipped or not. We assign this
variable to `True` when we can't parse a year from the strain's date
string, print a clearer warning message to stderr, and break from the
loop (instead of continuing).

This commit includes updates to the functional tests (that originally
caught this previously hidden bug!) to reflect this new output to stderr
and the highest priority strains that should be included from each
group.
---
 augur/filter.py                        | 9 ++++++---
 tests/functional/filter.t              | 1 +
 tests/functional/filter/priorities.tsv | 4 ++--
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/augur/filter.py b/augur/filter.py
index c2270b20f..631aea792 100644
--- a/augur/filter.py
+++ b/augur/filter.py
@@ -863,6 +863,7 @@ def get_groups_for_subsampling(strains, metadata, group_by=None):
 
     group_by_strain = {}
     for strain in strains:
+        skip_strain = False
         group = []
         m = metadata.loc[strain].to_dict()
         # collect group specifiers
@@ -875,8 +876,9 @@ def get_groups_for_subsampling(strains, metadata, group_by=None):
                 try:
                     year = int(m["date"].split('-')[0])
                 except:
-                    print("WARNING: no valid year, skipping",strain, m["date"])
-                    continue
+                    print(f"WARNING: no valid year, skipping strain '{strain}' with date value of '{m['date']}'.", file=sys.stderr)
+                    skip_strain = True
+                    break
                 if c=='month':
                     try:
                         month = int(m["date"].split('-')[1])
@@ -888,7 +890,8 @@ def get_groups_for_subsampling(strains, metadata, group_by=None):
             else:
                 group.append('unknown')
 
-        group_by_strain[strain] = tuple(group)
+        if not skip_strain:
+            group_by_strain[strain] = tuple(group)
 
     # If we could not find any requested categories, we cannot complete subsampling.
     distinct_groups = set(group_by_strain.values())
diff --git a/tests/functional/filter.t b/tests/functional/filter.t
index db7b9ef44..bf5a64996 100644
--- a/tests/functional/filter.t
+++ b/tests/functional/filter.t
@@ -333,6 +333,7 @@ The two highest priority strains are in these two years.
   >  --priority filter/priorities.tsv \
   >  --sequences-per-group 1 \
   >  --output-strains "$TMP/filtered_strains.txt" > /dev/null
+  WARNING: no valid year, skipping strain 'COL/FLR_00024/2015' with date value of ''.
 
   $ diff -u <(sort -k 2,2rn -k 1,1 filter/priorities.tsv | head -n 2 | cut -f 1) <(sort -k 1,1 "$TMP/filtered_strains.txt")
   $ rm -f "$TMP/filtered_strains.txt"
diff --git a/tests/functional/filter/priorities.tsv b/tests/functional/filter/priorities.tsv
index 8f960697f..86734afb4 100644
--- a/tests/functional/filter/priorities.tsv
+++ b/tests/functional/filter/priorities.tsv
@@ -1,5 +1,5 @@
-COL/FLR_00024/2015	100
-PRVABC59	50
+COL/FLR_00024/2015	50
+PRVABC59	100
 COL/FLR_00008/2015	10
 Colombia/2016/ZC204Se	10
 ZKC2/2016	100

From 5db04fc846b890f9b90d41e3e14eb90b197ef4d9 Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Fri, 13 Aug 2021 15:00:02 -0700
Subject: [PATCH 03/14] Fix probabilistic subsampling for small values

Fixes a regression from the rewrite of augur filter where probabilistic
subsampling for small values of `--subsample-max-sequences` could
randomly select zero strains and randomly cause our CI tests to fail.
Prior to the rewrite of augur filter and introduction of priority
queues, we fixed this issue by repeatedly attempting to calculate
sequences per group that summed to an integer value greater than zero.
However, the way I implemented random queue sizes inside the `PriorityQueue`
class in the rewrite prevented me from using a similar "multiple
attempts" approach.

This commit redesigns the way we create priority queues. In the case
where we already know the number of sequences per group in the first
pass, we create an appropriately-sized priority queue for each group as
we encounter it. There is no possibility that the sum of these queue
sizes could be zero.

In the case where we need to calculate the number of sequences per group
from the first pass, we already know all possible groups and can create
their priority queues in bulk. The new `create_queues_by_group` function
allows us to create fixed-sized or randomly-sized queues and also make
multiple attempts when queue sizes sum to zero. As a result, the
`PriorityQueue` class is much simpler (it requires no logic about random
max sizes) and we can actually test the fixed and random behaviors more
carefully with doctests for `create_queues_by_group`.
---
 augur/filter.py | 96 ++++++++++++++++++++++++++++++++-----------------
 1 file changed, 63 insertions(+), 33 deletions(-)

diff --git a/augur/filter.py b/augur/filter.py
index 631aea792..a39dbcb68 100644
--- a/augur/filter.py
+++ b/augur/filter.py
@@ -24,7 +24,6 @@
 from .utils import is_vcf as filename_is_vcf, read_vcf, read_strains, get_numerical_dates, run_shell_command, shquote, is_date_ambiguous
 
 comment_char = '#'
-MAX_NUMBER_OF_PROBABILISTIC_SAMPLING_ATTEMPTS = 10
 
 SEQUENCE_ONLY_FILTERS = (
     "min_length",
@@ -936,11 +935,6 @@ class PriorityQueue:
     """A priority queue implementation that automatically replaces lower priority
     items in the heap with incoming higher priority items.
 
-    This implementation also allows the maximum size to be a fractional value
-    less than 1 in which case the heap size is sampled randomly from a Poisson
-    distribution with the given maximum size as the mean. This randomly sized
-    heap enables probabilistic subsampling.
-
     Add a single record to a heap with a maximum of 2 records.
 
     >>> queue = PriorityQueue(max_size=2)
@@ -974,28 +968,12 @@ class PriorityQueue:
     >>> list(queue.get_items())
     [{'strain': 'strain4'}, {'strain': 'strain3'}]
 
-    Assign a fractional maximum size such that the corresponding queue limit is
-    sampled randomly from a Poisson distribution. For small values, we should
-    get a max size that is no more than 10 (this is an arbitrarily high number
-    above what we see for Poisson samples drawn with a mean of 0.1).
-
-    >>> queue = PriorityQueue(max_size=0.1)
-    >>> queue.max_size in set(range(10))
-    True
-
     """
     def __init__(self, max_size):
-        """Create a fixed size heap (priority queue) that allows the maximum size to be
-        calculated probabilistically from a Poisson process.
+        """Create a fixed size heap (priority queue)
 
         """
-        # Fractional heap sizes indicate probabilistic sampling.
-        if max_size < 1.0:
-            random_generator = np.random.default_rng()
-            self.max_size = random_generator.poisson(max_size)
-        else:
-            self.max_size = max_size
-
+        self.max_size = max_size
         self.heap = []
         self.counter = itertools.count()
 
@@ -1030,10 +1008,53 @@ def get_items(self):
             yield item
 
 
-def priority_queue_factory(max_size):
-    """Return a callable for a priority queue with the given arguments.
+def create_queues_by_group(groups, max_size, max_attempts=100):
+    """Create a dictionary of priority queues per group for the given maximum size.
+
+    When the maximum size is fractional, probabilistically sample the maximum
+    size from a Poisson distribution. Make at least the given number of maximum
+    attempts to create queues for which the sum of their maximum sizes is
+    greater than zero.
+
+    Create queues for two groups with a fixed maximum size.
+
+    >>> groups = ("2015", "2016")
+    >>> queues = create_queues_by_group(groups, 2)
+    >>> sum(queue.max_size for queue in queues.values())
+    4
+
+    Create queues for two groups with a fractional maximum size. Their total max
+    size should still be an integer value greater than zero.
+
+    >>> queues = create_queues_by_group(groups, 0.1)
+    >>> int(sum(queue.max_size for queue in queues.values())) > 0
+    True
+
     """
-    return lambda: PriorityQueue(max_size=max_size)
+    queues_by_group = {}
+    total_max_size = 0
+    attempts = 0
+
+    if max_size < 1.0:
+        random_generator = np.random.default_rng()
+
+    # For small fractional maximum sizes, it is possible to randomly select
+    # maximum queue sizes that all equal zero. When this happens, filtering
+    # fails unexpectedly. We make multiple attempts to create queues with
+    # maximum sizes greater than zero for at least one queue.
+    while total_max_size == 0 and attempts < max_attempts:
+        for group in groups:
+            if max_size < 1.0:
+                queue_max_size = random_generator.poisson(max_size)
+            else:
+                queue_max_size = max_size
+
+            queues_by_group[group] = PriorityQueue(queue_max_size)
+
+        total_max_size = sum(queue.max_size for queue in queues_by_group.values())
+        attempts += 1
+
+    return queues_by_group
 
 
 def register_arguments(parser):
@@ -1347,11 +1368,17 @@ def run(args):
                     # Track the highest priority records, when we already
                     # know the number of sequences allowed per group.
                     if queues_by_group is None:
-                        queues_by_group = defaultdict(priority_queue_factory(
-                            max_size=sequences_per_group,
-                        ))
+                        queues_by_group = {}
 
                     for strain, group in group_by_strain.items():
+                        # During this first pass, we do not know all possible
+                        # groups will be, so we need to build each group's queue
+                        # as we first encounter the group.
+                        if group not in queues_by_group:
+                            queues_by_group[group] = PriorityQueue(
+                                max_size=sequences_per_group,
+                            )
+
                         queues_by_group[group].add(
                             metadata.loc[strain],
                             priorities[strain],
@@ -1411,9 +1438,12 @@ def run(args):
             sys.exit(1)
 
         if queues_by_group is None:
-            queues_by_group = defaultdict(priority_queue_factory(
-                max_size=sequences_per_group,
-            ))
+            # We know all of the possible groups now from the first pass through
+            # the metadata, so we can create queues for all groups at once.
+            queues_by_group = create_queues_by_group(
+                records_per_group.keys(),
+                sequences_per_group,
+            )
 
         # Make a second pass through the metadata, only considering records that
         # have passed filters.

From a30b1a9befd85a8d23ac43348aa57cb01d8396d9 Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Fri, 13 Aug 2021 16:20:33 -0700
Subject: [PATCH 04/14] Update for patch release

---
 CHANGES.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/CHANGES.md b/CHANGES.md
index 066d7be54..5b5993e52 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -2,6 +2,13 @@
 
 ## __NEXT__
 
+### Bug Fixes
+
+* filter: Fix parsing of missing data in metadata [#758][]
+* filter: Fix probabilistic sampling with small values [#759][]
+
+[#758]: https://github.com/nextstrain/augur/pull/758
+[#759]: https://github.com/nextstrain/augur/pull/759
 
 ## 12.1.0 (12 August 2021)
 

From 5c89570b456b29bc70377592784f6828eaef8840 Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Fri, 13 Aug 2021 16:24:41 -0700
Subject: [PATCH 05/14] version 12.1.1

---
 CHANGES.md           | 3 +++
 augur/__version__.py | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/CHANGES.md b/CHANGES.md
index 5b5993e52..a62b0ad30 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -2,6 +2,9 @@
 
 ## __NEXT__
 
+
+## 12.1.1 (13 August 2021)
+
 ### Bug Fixes
 
 * filter: Fix parsing of missing data in metadata [#758][]
diff --git a/augur/__version__.py b/augur/__version__.py
index 70b686f24..8536b264e 100644
--- a/augur/__version__.py
+++ b/augur/__version__.py
@@ -1,4 +1,4 @@
-__version__ = '12.1.0'
+__version__ = '12.1.1'
 
 
 def is_augur_version_compatible(version):

From 6b13599863ff42bb92482b5eb68eb31d86b8f546 Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Mon, 16 Aug 2021 12:21:33 -0700
Subject: [PATCH 06/14] Test grouping ambiguous dates by month

Adds a functional test for grouping ambiguous dates by year and month
and corresponding metadata, sequence, and sequence index entry for a
strain with only a year in the date field. The test fails because the
ambiguous month gets generated twice for the same strain during the
augur filter command and the group from the first pass through the
metadata doesn't exist in the second pass.

Confirms #760
---
 augur/io.py                                |   2 +-
 tests/functional/filter.t                  |  28 +++-
 tests/functional/filter/metadata.tsv       |   1 +
 tests/functional/filter/sequence_index.tsv |   1 +
 tests/functional/filter/sequences.fasta    | 179 +++++++++++++++++++++
 5 files changed, 204 insertions(+), 7 deletions(-)

diff --git a/augur/io.py b/augur/io.py
index 683e57e05..92514109d 100644
--- a/augur/io.py
+++ b/augur/io.py
@@ -77,7 +77,7 @@ def read_metadata(metadata_file, id_columns=("strain", "name"), chunk_size=None)
     ...
     (5, 14)
     (5, 14)
-    (1, 14)
+    (2, 14)
 
     """
     kwargs = {
diff --git a/tests/functional/filter.t b/tests/functional/filter.t
index bf5a64996..8fcfe64b7 100644
--- a/tests/functional/filter.t
+++ b/tests/functional/filter.t
@@ -9,14 +9,14 @@ Force include one South American record by country to get two total records.
 
   $ ${AUGUR} filter \
   >  --metadata filter/metadata.tsv \
-  >  --exclude-where "region=South America" "region=North America" \
+  >  --exclude-where "region=South America" "region=North America" "region=Southeast Asia" \
   >  --include-where "country=Ecuador" \
   >  --output-strains "$TMP/filtered_strains.txt" > /dev/null
   $ wc -l "$TMP/filtered_strains.txt"
   \s*2 .* (re)
   $ rm -f "$TMP/filtered_strains.txt"
 
-Filter with subsampling, requesting 1 sequence per group (for a group with 3 distinct values).
+Filter with subsampling, requesting 1 sequence per group (for a group with 4 distinct values).
 
   $ ${AUGUR} filter \
   >  --metadata filter/metadata.tsv \
@@ -24,7 +24,7 @@ Filter with subsampling, requesting 1 sequence per group (for a group with 3 dis
   >  --sequences-per-group 1 \
   >  --output-strains "$TMP/filtered_strains.txt" > /dev/null
   $ wc -l "$TMP/filtered_strains.txt"
-  \s*3 .* (re)
+  \s*4 .* (re)
   $ rm -f "$TMP/filtered_strains.txt"
 
 Filter with subsampling, requesting no more than 8 sequences.
@@ -225,7 +225,7 @@ Metadata should have the same number of records as the sequences plus a header.
   \s*5 .* (re)
   $ rm -f "$TMP/filtered.tsv"
 
-Alternately, exclude only the sequences from Brazil and Colombia (12 - 4 strains).
+Alternately, exclude the sequences from Brazil and Colombia (N=4) and records without sequences (N=1) or metadata (N=1).
 
   $ ${AUGUR} filter \
   >  --sequences filter/sequences.fasta \
@@ -234,7 +234,7 @@ Alternately, exclude only the sequences from Brazil and Colombia (12 - 4 strains
   >  --exclude "$TMP/filtered_strains.brazil.txt" "$TMP/filtered_strains.colombia.txt" \
   >  --output "$TMP/filtered.fasta" > /dev/null
   $ grep "^>" "$TMP/filtered.fasta" | wc -l
-  \s*6 (re)
+  \s*7 (re)
   $ rm -f "$TMP/filtered.fasta"
 
 Try to filter with sequences that don't match any of the metadata.
@@ -318,7 +318,7 @@ The query initially filters 3 strains from Colombia, one of which is added back
   \t1 had no sequence data (esc)
   \t3 of these were filtered out by the query: "country != 'Colombia'" (esc)
   \t1 strains were added back because they were in filter/include.txt (esc)
-  8 strains passed all filters
+  9 strains passed all filters
 
   $ diff -u <(sort -k 1,1 filter/filtered_log.tsv) <(sort -k 1,1 "$TMP/filtered_log.tsv")
   $ rm -f "$TMP/filtered_strains.txt"
@@ -337,3 +337,19 @@ The two highest priority strains are in these two years.
 
   $ diff -u <(sort -k 2,2rn -k 1,1 filter/priorities.tsv | head -n 2 | cut -f 1) <(sort -k 1,1 "$TMP/filtered_strains.txt")
   $ rm -f "$TMP/filtered_strains.txt"
+
+Try to subsample a maximum number of sequences by year and month, given metadata with ambiguous year and month values.
+Strains with ambiguous years or months should be dropped and logged.
+
+  $ ${AUGUR} filter \
+  >  --metadata filter/metadata.tsv \
+  >  --group-by year month \
+  >  --subsample-max-sequences 5 \
+  >  --output-strains "$TMP/filtered_strains.txt" \
+  >  --output-log "$TMP/filtered_log.tsv" > /dev/null
+  $ grep "SG_018" "$TMP/filtered_log.tsv" | cut -f 1-2
+  SG_018\tskip_group_by_with_ambiguous_month (esc)
+  SG_018\tsubsampling (esc)
+  $ grep "COL/FLR_00024/2015" "$TMP/filtered_log.tsv" | cut -f 1-2
+  COL/FLR_00024/2015\tskip_group_by_with_ambiguous_year (esc)
+  COL/FLR_00024/2015\tsubsampling (esc)
diff --git a/tests/functional/filter/metadata.tsv b/tests/functional/filter/metadata.tsv
index 40e2cca39..3de583bf8 100644
--- a/tests/functional/filter/metadata.tsv
+++ b/tests/functional/filter/metadata.tsv
@@ -10,3 +10,4 @@ BRA/2016/FC_6706	zika	KY785433	2016-04-08	South America	Brazil	Brazil	Brazil	gen
 DOM/2016/BB_0183	zika	KY785420	2016-04-18	North America	Dominican Republic	Dominican Republic	Dominican Republic	genbank	genome	Metsky et al	https://www.ncbi.nlm.nih.gov/nuccore/KY785420	Zika virus evolution and spread in the Americas	Nature 546 (7658), 411-415 (2017)	https://www.ncbi.nlm.nih.gov/pubmed/28538734
 EcEs062_16	zika	KX879603	2016-04-XX	South America	Ecuador	Ecuador	Ecuador	genbank	genome	Marquez et al	https://www.ncbi.nlm.nih.gov/nuccore/KX879603	First Complete Genome Sequences of Zika Virus Isolated from Febrile Patient Sera in Ecuador	Genome Announc 5 (8), e01673-16 (2017)	https://www.ncbi.nlm.nih.gov/pubmed/28232448
 HND/2016/HU_ME59	zika	KY785418	2016-05-13	North America	Honduras	Honduras	Honduras	genbank	genome	Metsky et al	https://www.ncbi.nlm.nih.gov/nuccore/KY785418	Zika virus evolution and spread in the Americas	Nature 546 (7658), 411-415 (2017)	https://www.ncbi.nlm.nih.gov/pubmed/28538734
+SG_018	zika	KY241688	2016	Southeast Asia	Singapore	Singapore	Singapore	genbank	genome	Ho et al	https://www.ncbi.nlm.nih.gov/nuccore/KY241688	Outbreak of Zika virus infection in Singapore: an epidemiological, entomological, virological, and clinical analysis	Lancet Infect Dis (2017) In press	https://www.ncbi.nlm.nih.gov/pubmed/
diff --git a/tests/functional/filter/sequence_index.tsv b/tests/functional/filter/sequence_index.tsv
index 16dafa0bd..4a034c607 100644
--- a/tests/functional/filter/sequence_index.tsv
+++ b/tests/functional/filter/sequence_index.tsv
@@ -10,3 +10,4 @@ DOM/2016/BB_0059	10035	2563	2089	2741	2015	621	6	0	0	0
 BRA/2016/FC_6706	10366	2747	2203	2915	2165	329	7	0	0	0
 DOM/2016/BB_0183	10621	2910	2343	3099	2269	0	0	0	0	0
 EcEs062_16	10812	2960	2388	3158	2306	0	0	0	0	0
+SG_018	10659	2906	2367	3110	2257	19	0	0	0	0
diff --git a/tests/functional/filter/sequences.fasta b/tests/functional/filter/sequences.fasta
index 8149494fc..f306f7474 100644
--- a/tests/functional/filter/sequences.fasta
+++ b/tests/functional/filter/sequences.fasta
@@ -1962,3 +1962,182 @@ ggcctgaactggagatcagctgtggatctccagaagagggactagtggttagaggagacc
 ccccggaaaacgcaaaacagcatattgacgctgggaaagaccagagactccatgagtttc
 caccacgctggccgccaggcacagatcgccgaatagcggcggccggtgtggggaaatcca
 tgggagatcgga
+>SG_018
+atgnnnnnnnnnnnnnnnnnnnccggaggattccggattgtcaatatgctaaaacgcgga
+gtagcccgtgtgagcccctttgggggcttgaagaggctgccagccggacttctgctgggt
+catgggcccatcaggatggtcttggcgattctagcctttttgaggttcacggcaatcaag
+ccatcactgggtctcatcaatagatggggttcagtggggaaaaaagaggctatggaaata
+ataaagaagttcaagaaagatctggctgccatgctgagaataatcaatgctaggaaggag
+aagaagagacgaggcgcagatactagtgtcggaattgttggcctcctgctgaccacagct
+atggcagcggaggtcactagacgtgggagtgcatactatatgtacttggacagaagcgat
+gctggggaggccatatcttttccaaccacactggggatgaataagtgttatatacagatc
+atggatcttggacacatgtgtgatgccaccatgagctatgaatgccctatgctggatgag
+ggggtagaaccagatgacgtcgattgttggtgcaacacgacgtcagcttgggttgtgtac
+gtaacctgccatcacaaaaaaggtgaagcacggagatctagaagagctgtgacgctcccc
+tcccattccactaggaagctgcaaacgcggtcgcagacctggttggagtcaagagaatac
+acaaagcacttgattagagtcgaaaattggatattcaggaaccctggcttcgcgttagca
+gcagctgccatcgcttggcttttgggaagctcaacgagccaaaaagtcatatacttggtc
+atgatactgctgattgccccggcatacagcatcaggtgcataggagtcagcaatagggac
+tttgtggaaggtatgtcaggtgggacttgggttgatgttgtcttggaacatggaggttgt
+gtcaccgtaatggcacaggacaaaccgactgtcgacatagagctggttacaacaacagtc
+agcaacatggcggaggtaagatcctactgctatgaggcatcaatatcggacatggcttcg
+gacagccgctgcccaacacaaggtgaagcctaccttgacaagcaatcagatacccaatat
+gtctgcaaaagaacgttagtggacagaggctggggaaatggatgtggactttttggaaaa
+gggagcctggtgacatgcgctaagtttgcatgctccaagaaaatgaccgggaagagcatc
+cagccagagaatctggagtaccggataatgctgtcagtccatggctcccagcacagtggg
+atgatcgttaatgacacaggacatgaaactgatgagaatagagcgaaggttgagataacg
+cccaattcaccaagagccgaagccaccctggggggttttggaagcctaggacttgattgt
+gaaccgaggacaggccttgacttttcagatttgtattacttgactatgaataacaagcac
+tggttggttcacaaggagtggttccacgacattccattaccttggcacgctggggcagac
+accggaactccacactggaacaacaaagaagcactggtagagttcaaggacgcacatgcc
+aaaaggcaaactgtcgtggttctagggagtcaagaaggagcagttcacacggcccttgct
+ggagctctggaggctgagatggatggtgcaaagggaaggctgtcctctggccacttgaaa
+tgtcgcctgaaaatggacaaacttagattgaagggcgtgtcatactccttgtgtaccgca
+gcgttcacattcaccaagatcccggctgaaacactgcacgggacagtcacagtggaggta
+cagtacgcagggacagatggaccctgcaaggttccagctcagatggcggtggacatgcaa
+actctgaccccagttgggaggttgataaccgctaaccccgtaatcactgaaagcactgag
+aactctaagatgatgctggaacttgatccaccatttggggactcttacattgtcatagga
+gtcggggagaagaagatcacccaccactggcacaggagtggcagtaccattggaaaagca
+tttgaagccactgtgagaggtgccaagagaatggcagtcttgggagacacagcctgggac
+tttggatcagttggaggcgctctcaactcattgggcaagggcatccatcaaatttttgga
+gcagctttcaaatcattgtttggaggaatgtcctggttctcacaaattctcattggaacg
+ttgctgatgtggttgggcctgaacacaaagaatggatctatttcccttatgtgcttggcc
+ttagggggagtgttgatcttcttatccacagccgtctctgctgatgtggggtgctcggtg
+gacttctcaaagaaggaaacgagatgcggtacaggggtgttcgtctataacgacgttgaa
+gcctggagggacaggtacaagtaccatcctgactccccccgtagattggcagcagcagtc
+aagcaagcctgggaagatggtatctgtgggatctcctctgtttcaagaatggaaaacatc
+atgtggagatcagtagaaggggagctcaacgcaatcctggaagaaaatggagttcaactg
+acggtcgttgtgggatctgtaaaaaaccccatgtggagaggtccacagagactgcccgtg
+cctgtgaacgagctgccccacggctggaaggcttgggggaaatcgtacttcgtcagagca
+gcaaagacaaataacagctttgtcgtggatggtgacacactgaaggaatgcccactcaaa
+catagagcatggaacagctttcttgtggaggatcatgggttcggggtatttcacactagt
+gtctggctcaaggttagagaagattattcattagagtgtgatcctgccgttattggaaca
+gctgttaagggaaaggaggctgcacacagtgatctaggctactggattgagagtgagaag
+aatgacacatggaggctgaagagggcccacctgatcgagatgaaaacatgtgaatggcca
+aagtcccacacattgtggacagatggaatagaagagagtgatctgatcatacccaagtct
+ttagctgggccactcagccatcacaacaccagagagggctacaggacccaaatgaaaggg
+ccatggcacagtgaagagcttgaaattcggtttgaggaatgcccaggcactaaggtccac
+gtggaggaaacatgtggaacaagaggaccgtctctgagatcaaccactgcaagcggaagg
+gtgatcgaggaatggtgctgcagggagtgcacaatgcccccactgtcgttccgggctaaa
+gatggctgttggtatggaatggagataaggcccaggaaagaaccagaaagtaacttagta
+aggtcaatggtgactgcaggatcaactgatcacatggatcacttctcccttggagtgctt
+gtgattctgctcatggtgcaggaagggctgaagaagagaatgaccacaaagatcatcata
+agcacatcaatggcagtgctggtagctatgatcctgggaggattttcaatgagtgacctg
+gctaagcttgcaattttgatgggtgccaccttcgcggaaatgaacactggaggagatgta
+gctcatctggcgctgatagcggcattcaaagtcagaccagcgttgctggtatctttcatc
+ttcagagctaattggacaccccgtgaaagcatgctgctggccttggcctcgtgtcttttg
+caaactgcgatttccgccttggaaggcgacctgatggttctcatcaatggttttgctctg
+gcctggttggcaatacgagcgatggttgttccacgcactgacaacatcaccttggcaatc
+ctggctgctctgacaccactggcccggggcacactgcttgtggcgtggagagcaggcctt
+gctacttgcggggggttcatgctcctctctctgaagggaaaaggcagtgtgaagaagaac
+ttaccatttgtcatggccctgggactaaccgctgtgaggctagtcgaccccatcaacgtg
+gtgggactgctgttgctcacaaggagtgggaagcggagctggccccctagcgaagtactc
+acagctgttggcctgatatgcgcattggctggagggttcgccaaggcagatatagagatg
+gctgggcccatggccgcggttggtctgctaattgtcagttacgtggtctcaggaaagagt
+gtggacatgtacattgaaagagcaggtgacatcacatgggaaaaagatgcggaagtcact
+ggaaacagtccccggctcgatgtggcactagatgagagtggtgatttctccctggtggag
+gatgacggtccccccatgagagagatcatactcaaagtggtcctgatgaccatctgtggc
+atgaacccaatagccataccctttgcagctggagcgtggtacgtatacgtgaagactgga
+aaaaggagtggtgctctatgggatgtgcctgctcccaaggaagtaaaaaagggggagacc
+acagatggagtgtacagagtaatgactcgtagactgctaggttcaacacaagttggagtg
+ggagttatgcaagagggggtctttcacactatgtggcacgtcacaaaaggatccgcactg
+agaagcggtgaagggagacttgatccatactggggagatgtcaagcaggatctggtgtca
+tactgtggtccatggaagctagatgccgcctgggacgggcacagcgaggtgcagctcttg
+gccgtgccccccggagagagagcgaggaacatccagactctgcccggaatatttaagaca
+aaggatggggacattggagcggttgcgctggactacccagcaggaacttcaggatctcca
+atcctagacaagtgtgggagagtgataggactctatggcaatggggtcgtgatcaaaaat
+gggagttatgttagtgccatcacccaagggaggagggaggaagagactcctgttgagtgc
+ttcgagccttcgatgctgaagaagaagcagctaactgtcttagacttgcatcctggagct
+gggaaaaccaggagagttcttcctgaaatagtccgtgaagccataaaaacaagactccgt
+actgtgatcttagctccaactagggttgtcgctgctgaaatggaggaagcccttagaggg
+cttccagtgcgttatatgacaacagcagtcaatgtcacccactctgggacagaaatcgtt
+gacttaatgtgccatgccaccttcacttcacgtctactacagccaatcagagtccccaac
+tataatctgtatattatggatgaggcccacttcacagatccctcaagtatagcagcaaga
+ggatacatttcaacaagggttgagatgggcgaggcggctgccatcttcatgaccgccacg
+ccaccaggaacccgtgacgcatttccggactccaactcaccaattatggacaccgaagtg
+gaagtcccggagagagcctggagctcaggctttgattgggtgacggaccattctggaaaa
+acagtttggtttgttccaagcgtgaggaacggcaatgagatcgcagcttgtctgacgaag
+gctggaaaacgggtcatacagctcagcagaaagacttttgagacagagttccagaaaaca
+aaacatcaagagtgggactttgtcgtgacaactgacatttcagagatgggcgccaacttt
+aaagctgaccgtgtcatagattccaggagatgcctaaagccggtcatacttgatggcgag
+agagtcattctggctggacccatgcctgtcacacatgccagcgctgcccagaggaggggg
+cgcataggcaggaatcccaacaaacctggagatgagtatctgtatggaggtgggtgcgca
+gagactgatgaagaccatgcacactggcttgaagcaagaatgctccttgacaatatctac
+ctccaagatggcctcatagcctcgctctatcgacctgaggccgacaaagtagcagccatt
+gagggagagttcaagcttaggacggagcaaaggaagacctttgtggaactcatgaaaaga
+ggagatcttcctgtttggctggcctatcaggttgcatctgccggaataacctacacagat
+agaagatggtgctttgatggcacgaccaacaacaccataatggaagacagtgtgccggca
+gaggtgtggaccagatacggagagaaaagagtgctcaaaccgaggtggatggacgccaga
+gtttgttcagatcatgcggccctgaagtcattcaaggagtttgccgctgggaaaagagga
+gcggcttttggagtgatggaagccctgggaacactgccaggacacatgacagagagattc
+caggaagccattgacaacctcgctgtgctcatgcgggcagagactggaagcagaccttac
+aaagccgcggcggcccaattgccggagaccctagagaccattatgcttttggggttgctg
+ggaacagtctcgctgggaatctttttcgtcttgatgcggaacaagggcatagggaagatg
+ggctttggaatggtgactcttggggccagcgcatggctcatgtggctctcggaaattgag
+ccagccagaattgcatgtgtcctcattgttgtgttcctattgctggtggtgctcatacct
+gagccagaaaagcaaagatctccccaggacaaccaaatggcaatcatcatcatggtagca
+gtaggtcttctgggcttgattaccgccaatgaactcggatggttggagagaacaaagagt
+gacctaagccatctaatgggaaggagagaggagggggcaaccataggattctcaatggac
+attgacctgcggccagcctcagcttgggccatctatgctgccctgacaactttcattacc
+ccagccgtccaacatgcagtgaccacttcatacaacaactactccttaatggcgatggcc
+acgcaagctggagtgttgtttggtatgggcaaagggatgccattctacgcatgggacttt
+ggagtcccgctgctaatgataggttgctactcacaattaacacccctgaccctaatagtg
+gccatcattttgctcgtggcgcactacatgtacttgatcccagggctgcaggcagcagct
+gcgcgtgctgcccagaagagaacggcagctggcatcatgaagaaccctgttgtggatgga
+atagtggtgactgacattgacacaatgacaattgacccccaagtggagaaaaagatggga
+caggtgctactcatagcagtagccgtctccagcgccatactgtcgcggaccgcctggggg
+tggggggaggctggggccctgatcacagctgcaacttccactttgtgggaaggctctccg
+aacaagtactggaactcctctacagccacttcactgtgtaacatttttaggggaagttac
+ttggctggagcttctctaatctacacagtaacaagaaacgctggtttggtcaagagacgt
+gggggtggaacaggagagaccctgggagagaaatggaaggcccgcttgaaccagatgtcg
+gccctggagttctactcctacaaaaagtcaggcatcaccgaggtgtgcagagaagaggcc
+cgccgcgccctcaaggacggtgtggcaacgggaggccatgctgtgtcccggggaagtgca
+aagctgagatggttggtggagcggggatacctgcagccctatggaaaggtcattgatctt
+ggatgtggcagagggggctggagttactacgccgccaccatccgcaaagttcaagaagtg
+aaaggatacacaaaaggaggccctggtcatgaagaacccatgttggtgcaaagctatggg
+tggaacatagtccgtcttaagagtggggtggacgtctttcatatggcggctgagccgtgt
+gacacgttgctgtgtgacataggtgagtcatcatctagtcctgaagtggaagaagcacgg
+acgctcagagtcctctccatggtgggggattggcttgaaaaaagaccaggagccttttgt
+ataaaagtgttgtgcccatacaccagcactatgatggaaaccctggagcgactgcagcgt
+aggtatgggggaggactggtcagagtgccactctcccgcaactcaacacatgagatgtac
+tgggtctctggagcgaaaagcaacaccataaaaagtgtgtccaccacgagccagctcctc
+ttggggcgcatggacgggcccaggaggccagtgaaatatgaggaggatgtgaatctcggc
+tctggcacgcgggctgtggtaagctgcgctgaagctcccaacatgaagatcattggtaac
+cgcattgaaaggatccgcagtgagcacgcggaaacgtggttctttgacgagaaccaccca
+tataggacatgggcttaccatggaagctatgaggcccccacacaagggtcagcgtcctct
+ctaataaacggggttgtcaggctcctgtcaaaaccctgggatgtggtgactggagtcaca
+ggaatagccatgaccgacaccacaccgtatggtcagcaaagagttttcaaggaaaaagtg
+gacactagggtgccagacccccaagaaggcactcgtcaggttatgagcatggtctcttcc
+tggttgtggaaagagctaggcaaacacaaacggccacgagtctgtaccaaagaagagttc
+atcaacaaggttcgtagcaatgcagcattaggggcaatatttgaagaggaaaaagagtgg
+aagactgcagtggaagctgtgaacgatccaaggttctgggctctagtggacaaggaaaga
+gagcaccacctgagaggagagtgccagagctgtgtgtacaacatgatgggaaaaagagaa
+aagaaacaaggggaatttggaaaggccaagggcagccgcgccatctggtatatgtggcta
+ggggctagatttctagagttcgaagcccttggattcttgaacgaggatcactggatgggg
+agagagaactcaggaggtggtgttgaagggctgggattacaaagactcggatatgtccta
+gaagagatgagtcgcataccaggaggaaggatgtatgcagatgacactgctggctgggac
+acccgcatcagcaggtttgatctggagaatgaagctctaatcaccaaccaaatggagaaa
+gggcacagggccttggcattggccataatcaagtacacataccaaaacaaagtggtaaag
+gtccttagaccagctgaaaaagggaagacagtgatggacattatttcaagacaagaccaa
+agggggagcggacaagttgtcacttacgctctcaacacatttaccaacctagtggtgcaa
+ctcattcggaatatggaggctgaggaagttctagagatgcaagacttgtggctgctgcgg
+aggtcagagaaagtgaccaactggttgcagagcaacggatgggataggctcaaacgaatg
+gcagtcagtggagatgattgcgttgtgaagccaattgatgataggtttgcacatgccctc
+aggttcttgaatgatatggggaaagttaggaaggacacacaagaatggaaaccctcaact
+ggatgggacaactgggaagaagttccgttttgctcccaccatttcaacaagctccatctc
+aaggacgggaggtccattgtggttccctgccgccaccaagatgaactgattggccgggcc
+cgcgtctctccaggggcgggatggagcatccgggagactgcttgcctagcaaaatcatat
+gcgcagatgtggcagctcctttatttccacagaagggacctccgactgatggccaatgcc
+atttgttcatctgtgccagttgactgggttccaactgggagaactacctggtcaatccat
+ggaaagggagaatggatgaccactgaagacatgcttgtggtgtggaacagagtgtggatt
+gaggagaacgaccacatggaagacaagaccccagttacgaaatggacagacattccctat
+ttgggaaaaagggaagacttatggtgtggatctctcatagggcacagaccgcgcaccacc
+tgggctgagaacattaaaaacacagtcaacatggtgcgcagaatcataggtgatgaagaa
+aagtacatggactacctatccacccaagttcgctacttgggtgaggaagggtctacacct
+ggagtgctgtaagcaccaatcttagtgttgtcaggcctgctagtcagccacagcttgggg
+aaagctgtgcagcctgtgacccccccaggagaagctgggaaaccaagcccatagtcaggc
+cgagaacgccatggcacggaagaagccatgctgcctgtgagcccctcagaggacactgag
+tcaaaaaaccccacgcgcttggaggcgcaggatgggaaaagaaggtggcgaccttcccca
+cccttcaatctggggcctgaactggagatcagctgtggatctccagaagagggactagtg
+gttagaggagaccccccggaaaacgcaaaacagcatattgacgctgggaaagaccagaga
+ctccatgagtttccaccacgctggccgccaggcacagat

From 54051140e5510bd64fdd70d44c711edef388dd7a Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Mon, 16 Aug 2021 13:09:09 -0700
Subject: [PATCH 07/14] Track strains skipped during grouping

Updates the interface to the `get_groups_for_subsampling` function such
that it returns a list of strains that have been skipped and the reason
why (following the pattern of filtered and force-included strains lists
from the `apply_filters` function). We count the number of strains
skipped for different reasons, report this number in the final output,
and also optionally log these strains to the output log table. This
change allows users to track why specific strains were excluded from an
analysis after the fact without needing to refer to the logs for stdout
or stderr.
---
 augur/filter.py           | 66 +++++++++++++++++++++++++++++++++------
 tests/functional/filter.t |  1 -
 2 files changed, 57 insertions(+), 10 deletions(-)

diff --git a/augur/filter.py b/augur/filter.py
index a39dbcb68..2e57dabf7 100644
--- a/augur/filter.py
+++ b/augur/filter.py
@@ -820,23 +820,30 @@ def get_groups_for_subsampling(strains, metadata, group_by=None):
     -------
     dict :
         A mapping of strain names to tuples corresponding to the values of the strain's group.
+    list :
+        A list of dictionaries with strains that were skipped from grouping and the reason why (see also: `apply_filters` output).
 
     >>> strains = ["strain1", "strain2"]
     >>> metadata = pd.DataFrame([{"strain": "strain1", "date": "2020-01-01", "region": "Africa"}, {"strain": "strain2", "date": "2020-02-01", "region": "Europe"}]).set_index("strain")
     >>> group_by = ["region"]
-    >>> get_groups_for_subsampling(strains, metadata, group_by)
+    >>> group_by_strain, skipped_strains = get_groups_for_subsampling(strains, metadata, group_by)
+    >>> group_by_strain
     {'strain1': ('Africa',), 'strain2': ('Europe',)}
+    >>> skipped_strains
+    []
 
     If we group by year or month, these groups are calculated from the date
     string.
 
     >>> group_by = ["year", "month"]
-    >>> get_groups_for_subsampling(strains, metadata, group_by)
+    >>> group_by_strain, skipped_strains = get_groups_for_subsampling(strains, metadata, group_by)
+    >>> group_by_strain
     {'strain1': (2020, (2020, 1)), 'strain2': (2020, (2020, 2))}
 
     If we omit the grouping columns, the result will group by a dummy column.
 
-    >>> get_groups_for_subsampling(strains, metadata)
+    >>> group_by_strain, skipped_strains = get_groups_for_subsampling(strains, metadata)
+    >>> group_by_strain
     {'strain1': ('_dummy',), 'strain2': ('_dummy',)}
 
     If we try to group by columns that don't exist, we get an error.
@@ -851,9 +858,31 @@ def get_groups_for_subsampling(strains, metadata, group_by=None):
     grouping to continue and print a warning message to stderr.
 
     >>> group_by = ["year", "month", "missing_column"]
-    >>> get_groups_for_subsampling(strains, metadata, group_by)
+    >>> group_by_strain, skipped_strains = get_groups_for_subsampling(strains, metadata, group_by)
+    >>> group_by_strain
     {'strain1': (2020, (2020, 1), 'unknown'), 'strain2': (2020, (2020, 2), 'unknown')}
 
+    If we group by year month and some records don't have that information in
+    their date fields, we should skip those records from the group output and
+    track which records were skipped for which reasons.
+
+    >>> metadata = pd.DataFrame([{"strain": "strain1", "date": "", "region": "Africa"}, {"strain": "strain2", "date": "2020-02-01", "region": "Europe"}]).set_index("strain")
+    >>> group_by_strain, skipped_strains = get_groups_for_subsampling(strains, metadata, ["year"])
+    >>> group_by_strain
+    {'strain2': (2020,)}
+    >>> skipped_strains
+    [{'strain': 'strain1', 'filter': 'skip_group_by_with_ambiguous_year', 'kwargs': ''}]
+
+    Similarly, if we group by month, we should skip records that don't have
+    month information in their date fields.
+
+    >>> metadata = pd.DataFrame([{"strain": "strain1", "date": "2020", "region": "Africa"}, {"strain": "strain2", "date": "2020-02-01", "region": "Europe"}]).set_index("strain")
+    >>> group_by_strain, skipped_strains = get_groups_for_subsampling(strains, metadata, ["month"])
+    >>> group_by_strain
+    {'strain2': ((2020, 2),)}
+    >>> skipped_strains
+    [{'strain': 'strain1', 'filter': 'skip_group_by_with_ambiguous_month', 'kwargs': ''}]
+
     """
     if group_by:
         groups = group_by
@@ -861,6 +890,7 @@ def get_groups_for_subsampling(strains, metadata, group_by=None):
         groups = ("_dummy",)
 
     group_by_strain = {}
+    skipped_strains = []
     for strain in strains:
         skip_strain = False
         group = []
@@ -875,7 +905,11 @@ def get_groups_for_subsampling(strains, metadata, group_by=None):
                 try:
                     year = int(m["date"].split('-')[0])
                 except:
-                    print(f"WARNING: no valid year, skipping strain '{strain}' with date value of '{m['date']}'.", file=sys.stderr)
+                    skipped_strains.append({
+                        "strain": strain,
+                        "filter": "skip_group_by_with_ambiguous_year",
+                        "kwargs": "",
+                    })
                     skip_strain = True
                     break
                 if c=='month':
@@ -928,7 +962,7 @@ def get_groups_for_subsampling(strains, metadata, group_by=None):
             file=sys.stderr,
         )
 
-    return group_by_strain
+    return group_by_strain, skipped_strains
 
 
 class PriorityQueue:
@@ -1352,12 +1386,20 @@ def run(args):
             # count the number of records per group. First, we need to get
             # the groups for the given records.
             try:
-                group_by_strain = get_groups_for_subsampling(
+                group_by_strain, skipped_strains = get_groups_for_subsampling(
                     seq_keep,
                     metadata,
                     group_by,
                 )
 
+                # Track strains skipped during grouping, so users know why those
+                # strains were excluded from the analysis.
+                for skipped_strain in skipped_strains:
+                    filter_counts[(skipped_strain["filter"], skipped_strain["kwargs"])] += 1
+
+                    if args.output_log:
+                        output_log_writer.writerow(skipped_strain)
+
                 if args.subsample_max_sequences and records_per_group is not None:
                     # Count the number of records per group. We will use this
                     # information to calculate the number of sequences per group
@@ -1458,7 +1500,7 @@ def run(args):
             # during the first pass, but we want to minimize overall memory
             # usage at the moment.
             seq_keep = set(metadata.index.values) & valid_strains
-            group_by_strain = get_groups_for_subsampling(
+            group_by_strain, skipped_strains = get_groups_for_subsampling(
                 seq_keep,
                 metadata,
                 group_by,
@@ -1594,11 +1636,17 @@ def run(args):
         "filter_by_date": "{count} of these were dropped because of their date (or lack of date)",
         "filter_by_sequence_length": "{count} of these were dropped because they were shorter than minimum length of {min_length}bp",
         "filter_by_non_nucleotide": "{count} of these were dropped because they had non-nucleotide characters",
+        "skip_group_by_with_ambiguous_year": "{count} were dropped during grouping due to ambiguous year information",
+        "skip_group_by_with_ambiguous_month": "{count} were dropped during grouping due to ambiguous month information",
         "include": "{count} strains were added back because they were in {include_file}",
         "include_by_include_where": "{count} sequences were added back because of '{include_where}'",
     }
     for (filter_name, filter_kwargs), count in filter_counts.items():
-        parameters = dict(json.loads(filter_kwargs))
+        if filter_kwargs:
+            parameters = dict(json.loads(filter_kwargs))
+        else:
+            parameters = {}
+
         parameters["count"] = count
         print("\t" + report_template_by_filter_name[filter_name].format(**parameters))
 
diff --git a/tests/functional/filter.t b/tests/functional/filter.t
index 8fcfe64b7..2aada6e39 100644
--- a/tests/functional/filter.t
+++ b/tests/functional/filter.t
@@ -333,7 +333,6 @@ The two highest priority strains are in these two years.
   >  --priority filter/priorities.tsv \
   >  --sequences-per-group 1 \
   >  --output-strains "$TMP/filtered_strains.txt" > /dev/null
-  WARNING: no valid year, skipping strain 'COL/FLR_00024/2015' with date value of ''.
 
   $ diff -u <(sort -k 2,2rn -k 1,1 filter/priorities.tsv | head -n 2 | cut -f 1) <(sort -k 1,1 "$TMP/filtered_strains.txt")
   $ rm -f "$TMP/filtered_strains.txt"

From aa3d91f251916307c1b83f53a683e1c64ddf0380 Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Mon, 16 Aug 2021 13:11:41 -0700
Subject: [PATCH 08/14] Skip strains with missing months during group by

Instead of randomly generating month values for strains that do not have
any month information, skip these strains and log the reason why they
were skipped. This is a breaking change to the behavior of augur
filter's interface for users who rely on year-only date records to be
grouped anyway. However, this change ensures that grouping results are
more stable and predictable across runs.

Fixes #760
---
 augur/filter.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/augur/filter.py b/augur/filter.py
index 2e57dabf7..7d6ba2898 100644
--- a/augur/filter.py
+++ b/augur/filter.py
@@ -916,7 +916,14 @@ def get_groups_for_subsampling(strains, metadata, group_by=None):
                     try:
                         month = int(m["date"].split('-')[1])
                     except:
-                        month = random.randint(1,12)
+                        skipped_strains.append({
+                            "strain": strain,
+                            "filter": "skip_group_by_with_ambiguous_month",
+                            "kwargs": "",
+                        })
+                        skip_strain = True
+                        break
+
                     group.append((year, month))
                 else:
                     group.append(year)

From 9a17705fdf9bd3db5d2e2ba14298adb9552f418a Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Mon, 16 Aug 2021 14:17:24 -0700
Subject: [PATCH 09/14] Validate group-by arguments

Validate group-by arguments to avoid downstream exceptions caused by
missing number of sequences per group or maximum sequences. Confirms the
behavior with a functional test.
---
 augur/filter.py           |  8 ++++++++
 tests/functional/filter.t | 10 ++++++++++
 2 files changed, 18 insertions(+)

diff --git a/augur/filter.py b/augur/filter.py
index 7d6ba2898..a92faff9b 100644
--- a/augur/filter.py
+++ b/augur/filter.py
@@ -1202,6 +1202,14 @@ def validate_arguments(args):
                   file=sys.stderr)
             return False
 
+    # If user requested grouping, confirm that other required inputs are provided, too.
+    if args.group_by and not any((args.sequences_per_group, args.subsample_max_sequences)):
+        print(
+            "ERROR: You must specify a number of sequences per group or maximum sequences to subsample.",
+            file=sys.stderr
+        )
+        return False
+
     return True
 
 
diff --git a/tests/functional/filter.t b/tests/functional/filter.t
index 2aada6e39..c949e2652 100644
--- a/tests/functional/filter.t
+++ b/tests/functional/filter.t
@@ -352,3 +352,13 @@ Strains with ambiguous years or months should be dropped and logged.
   $ grep "COL/FLR_00024/2015" "$TMP/filtered_log.tsv" | cut -f 1-2
   COL/FLR_00024/2015\tskip_group_by_with_ambiguous_year (esc)
   COL/FLR_00024/2015\tsubsampling (esc)
+
+Try to group data without any grouping arguments.
+This should fail with a helpful error message.
+
+  $ ${AUGUR} filter \
+  >  --metadata filter/metadata.tsv \
+  >  --group-by year month \
+  >  --output-strains "$TMP/filtered_strains.txt" > /dev/null
+  ERROR: You must specify a number of sequences per group or maximum sequences to subsample.
+  [1]

From d7dcf91c7389c424cef1a1634c08a837ff17836a Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Mon, 16 Aug 2021 14:55:44 -0700
Subject: [PATCH 10/14] Fix case where no strains are dropped by grouping

Handles an edge case present in the TB functional tests where all
strains have ambiguous months and all strains were filtered by the
initial sequence index filter or by the new ambiguous month filter from
the grouping logic.
---
 augur/filter.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/augur/filter.py b/augur/filter.py
index a92faff9b..5f1ca4d0d 100644
--- a/augur/filter.py
+++ b/augur/filter.py
@@ -1529,6 +1529,7 @@ def run(args):
 
     # If we have any records in queues, we have grouped results and need to
     # stream the highest priority records to the requested outputs.
+    num_excluded_subsamp = 0
     if queues_by_group:
         # Populate the set of strains to keep from the records in queues.
         subsampled_strains = set()

From d63478c994b312b091811b23bc0fceaeacf454af Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Mon, 16 Aug 2021 15:17:19 -0700
Subject: [PATCH 11/14] Remove skipped strains from valid strains

Since strains skipped during grouping are effectively filtered from the
analysis, they need to be removed the set of valid strains. When we
don't remove them from this list and subsampling doesn't occur (there
are no valid samples), Augur reports those skipped samples as passing
all filters when all samples were actually dropped. This edge case
occurred when running the TB functional tests where all strains have
ambiguous month values and get dropped.
---
 augur/filter.py           | 1 +
 tests/functional/filter.t | 2 --
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/augur/filter.py b/augur/filter.py
index 5f1ca4d0d..5a1537742 100644
--- a/augur/filter.py
+++ b/augur/filter.py
@@ -1411,6 +1411,7 @@ def run(args):
                 # strains were excluded from the analysis.
                 for skipped_strain in skipped_strains:
                     filter_counts[(skipped_strain["filter"], skipped_strain["kwargs"])] += 1
+                    valid_strains.remove(skipped_strain["strain"])
 
                     if args.output_log:
                         output_log_writer.writerow(skipped_strain)
diff --git a/tests/functional/filter.t b/tests/functional/filter.t
index c949e2652..ba15d5acf 100644
--- a/tests/functional/filter.t
+++ b/tests/functional/filter.t
@@ -348,10 +348,8 @@ Strains with ambiguous years or months should be dropped and logged.
   >  --output-log "$TMP/filtered_log.tsv" > /dev/null
   $ grep "SG_018" "$TMP/filtered_log.tsv" | cut -f 1-2
   SG_018\tskip_group_by_with_ambiguous_month (esc)
-  SG_018\tsubsampling (esc)
   $ grep "COL/FLR_00024/2015" "$TMP/filtered_log.tsv" | cut -f 1-2
   COL/FLR_00024/2015\tskip_group_by_with_ambiguous_year (esc)
-  COL/FLR_00024/2015\tsubsampling (esc)
 
 Try to group data without any grouping arguments.
 This should fail with a helpful error message.

From c6ea93f9ec795ef39cf8b5f5a05d8414ee2e2039 Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Mon, 16 Aug 2021 15:19:31 -0700
Subject: [PATCH 12/14] Don't group TB samples by month

TB samples all have ambiguous month values, so the new group-by logic
that skips ambiguous records when date information isn't available
causes all records to get dropped when grouping by year and month.
Instead, we can group by year alone and get a similar effect to the
original test.
---
 tests/builds/tb/Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/builds/tb/Snakefile b/tests/builds/tb/Snakefile
index 7654d0e99..ff1c76a2b 100644
--- a/tests/builds/tb/Snakefile
+++ b/tests/builds/tb/Snakefile
@@ -40,7 +40,7 @@ rule filter:
         "results/filtered.vcf.gz"
     params:
         sequences_per_group = 10,
-        group_by = "year month",
+        group_by = "year",
         min_len = 200000
     shell:
         """

From 978cac2d38fa2d1936b16fa9bd85a468dda7bc01 Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Tue, 17 Aug 2021 08:53:21 -0700
Subject: [PATCH 13/14] Update change log for major release

---
 CHANGES.md | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/CHANGES.md b/CHANGES.md
index a62b0ad30..c5c4b2f37 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -2,13 +2,22 @@
 
 ## __NEXT__
 
+### Major Changes
+
+* filter: Skip metadata records with ambiguous month information in the `date` column when grouping by month instead of randomly generating month values for those records. This change alters the behavior of the `filter` command for metadata with ambiguous month values. For these data, consider using `--group-by year` instead of `--group-by year month`. [#761][] (@huddlej)
+
+### Features
+
+* filter: When grouping by year or month, report the number of strains skipped due to ambiguous year and month both in the summary report at the end of filtering and in the `--output-log` contents [#761][] (@huddlej)
+
+[#761]: https://github.com/nextstrain/augur/pull/761
 
 ## 12.1.1 (13 August 2021)
 
 ### Bug Fixes
 
-* filter: Fix parsing of missing data in metadata [#758][]
-* filter: Fix probabilistic sampling with small values [#759][]
+* filter: Fix parsing of missing data in metadata [#758][] (@huddlej)
+* filter: Fix probabilistic sampling with small values [#759][] (@huddlej)
 
 [#758]: https://github.com/nextstrain/augur/pull/758
 [#759]: https://github.com/nextstrain/augur/pull/759

From be57dbed62c90276cca8ce8d8cd999fbb9bd7c62 Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Tue, 17 Aug 2021 08:54:23 -0700
Subject: [PATCH 14/14] version 13.0.0

---
 CHANGES.md           | 3 +++
 augur/__version__.py | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/CHANGES.md b/CHANGES.md
index c5c4b2f37..3d2d39c62 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -2,6 +2,9 @@
 
 ## __NEXT__
 
+
+## 13.0.0 (17 August 2021)
+
 ### Major Changes
 
 * filter: Skip metadata records with ambiguous month information in the `date` column when grouping by month instead of randomly generating month values for those records. This change alters the behavior of the `filter` command for metadata with ambiguous month values. For these data, consider using `--group-by year` instead of `--group-by year month`. [#761][] (@huddlej)
diff --git a/augur/__version__.py b/augur/__version__.py
index 8536b264e..8e7a3677b 100644
--- a/augur/__version__.py
+++ b/augur/__version__.py
@@ -1,4 +1,4 @@
-__version__ = '12.1.1'
+__version__ = '13.0.0'
 
 
 def is_augur_version_compatible(version):