diff --git a/augur/filter.py b/augur/filter.py index f76da2298..0924b5f8f 100644 --- a/augur/filter.py +++ b/augur/filter.py @@ -321,7 +321,7 @@ def filter_by_date(metadata, date_column="date", min_date=None, max_date=None): if (not min_date and not max_date) or date_column not in metadata.columns: return strains - dates = get_numerical_dates(metadata, date_col=date_column, fmt="%Y-%m-%d") + dates = get_numerical_dates(metadata, date_col=date_column) filtered = {strain for strain in strains if dates[strain] is not None} if min_date: diff --git a/augur/frequencies.py b/augur/frequencies.py index 9394a3d39..6d897421a 100644 --- a/augur/frequencies.py +++ b/augur/frequencies.py @@ -79,7 +79,7 @@ def format_frequencies(freq): def run(args): metadata, columns = read_metadata(args.metadata) - dates = get_numerical_dates(metadata, fmt='%Y-%m-%d') + dates = get_numerical_dates(metadata) stiffness = args.stiffness inertia = args.inertia diff --git a/augur/refine.py b/augur/refine.py index f09bc8a00..e042fea16 100644 --- a/augur/refine.py +++ b/augur/refine.py @@ -109,7 +109,6 @@ def register_arguments(parser): parser.add_argument('--no-covariance', dest='covariance', action='store_false') #If you set help here, it displays 'default: True' - which is confusing! parser.add_argument('--keep-polytomies', action='store_true', help='Do not attempt to resolve polytomies') parser.add_argument('--precision', type=int, choices=[0,1,2,3], help="precision used by TreeTime to determine the number of grid points that are used for the evaluation of the branch length interpolation objects. 
Values range from 0 (rough) to 3 (ultra fine) and default to 'auto'.") - parser.add_argument('--date-format', default="%Y-%m-%d", help="date format") parser.add_argument('--date-confidence', action="store_true", help="calculate confidence intervals for node dates") parser.add_argument('--date-inference', default='joint', choices=["joint", "marginal"], help="assign internal nodes to their marginally most likely dates, not jointly most likely") @@ -195,8 +194,7 @@ def run(args): metadata, columns = read_metadata(args.metadata) if args.year_bounds: args.year_bounds.sort() - dates = get_numerical_dates(metadata, fmt=args.date_format, - min_max_year=args.year_bounds) + dates = get_numerical_dates(metadata, min_max_year=args.year_bounds) # save input state string for later export for n in T.get_terminals(): diff --git a/augur/utils.py b/augur/utils.py index 976b08005..14fa386cd 100644 --- a/augur/utils.py +++ b/augur/utils.py @@ -111,11 +111,7 @@ def is_date_ambiguous(date, ambiguous_by="any"): "X" in day and ambiguous_by in ("any", "day") )) -def get_numerical_date_from_value(value, fmt=None, min_max_year=None, raise_error=True): - if type(value)!=str: - if raise_error: - raise ValueError(value) - return None +def get_numerical_date_from_value(value, fmt=None, min_max_year=None): if 'XX' in value: ambig_date = ambiguous_date_to_date_range(value, fmt, min_max_year) if ambig_date is None or None in ambig_date: @@ -127,45 +123,13 @@ def get_numerical_date_from_value(value, fmt=None, min_max_year=None, raise_erro except: return None -def get_numerical_dates(meta_dict, name_col = None, date_col='date', fmt=None, min_max_year=None): - if fmt: - numerical_dates = {} - - if isinstance(meta_dict, dict): - for k,m in meta_dict.items(): - v = m[date_col] - try: - numerical_dates[k] = get_numerical_date_from_value( - v, - fmt, - min_max_year - ) - except ValueError: - print( - "WARNING: %s has an invalid data string: %s"% (k, v), - file=sys.stderr - ) - continue - elif 
isinstance(meta_dict, pd.DataFrame): - strains = meta_dict.index.values - dates = meta_dict[date_col].apply( - lambda date: get_numerical_date_from_value( - date, - fmt, - min_max_year, - raise_error=False - ) - ).values - numerical_dates = dict(zip(strains, dates)) - else: - if isinstance(meta_dict, dict): - numerical_dates = {k:float(v) for k,v in meta_dict.items()} - elif isinstance(meta_dict, pd.DataFrame): - strains = meta_dict.index.values - dates = meta_dict[date_col].astype(float) - numerical_dates = dict(zip(strains, dates)) - - return numerical_dates +def get_numerical_dates(meta_dict, date_col='date', min_max_year=None): + if isinstance(meta_dict, dict): + return {k:to_numeric_date(m[date_col], min_max_year=min_max_year) for k,m in meta_dict.items()} + if isinstance(meta_dict, pd.DataFrame): + strains = meta_dict.index.values + dates = meta_dict[date_col].apply(lambda date: to_numeric_date(date, min_max_year=min_max_year)).values + return dict(zip(strains, dates)) def to_numeric_date_min(date): @@ -176,7 +140,7 @@ def to_numeric_date_max(date): return to_numeric_date(date, ambiguity_resolver="max") -def to_numeric_date(date, ambiguity_resolver="min"): +def to_numeric_date(date, ambiguity_resolver=None, min_max_year=None): """Return numeric date from string, [incomplete] ISO date string, or datetime.date object. Parameters @@ -192,17 +156,19 @@ def to_numeric_date(date, ambiguity_resolver="min"): Returns ------- - float + float | [float, float] | None """ if type(date) is datetime.date: return numeric_date(date) if type(date) is str and "." 
in date: return float(date) if type(date) is str: - if ambiguity_resolver not in {"min", "max"}: + if date == '': + return None + if ambiguity_resolver not in {"min", "max", None}: raise ValueError("Ambiguous date range must be resolved by taking either min or max date.") ambiguous_date_str = generate_ambiguous_date_str(date) - ambiguous_date_resolved = get_numerical_date_from_value(ambiguous_date_str, "%Y-%m-%d") + ambiguous_date_resolved = get_numerical_date_from_value(ambiguous_date_str, "%Y-%m-%d", min_max_year) if type(ambiguous_date_resolved) is float: return ambiguous_date_resolved if type(ambiguous_date_resolved) is list: @@ -210,6 +176,7 @@ def to_numeric_date(date, ambiguity_resolver="min"): return ambiguous_date_resolved[0] if ambiguity_resolver == "max": return ambiguous_date_resolved[1] + return ambiguous_date_resolved raise ValueError(f"Unparsable date value: {date!r}") diff --git a/tests/functional/filter.t b/tests/functional/filter.t index ba15d5acf..05551fe4a 100644 --- a/tests/functional/filter.t +++ b/tests/functional/filter.t @@ -113,10 +113,10 @@ Filter using only metadata without sequence input or output and save results as > --min-length 10500 \ > --output-metadata "$TMP/filtered_metadata.tsv" > /dev/null -Output should include the 7 sequences matching the filters and a header line. +Output should include the 8 sequences matching the filters and a header line. $ wc -l "$TMP/filtered_metadata.tsv" - \s*8 .* (re) + \s*9 .* (re) $ rm -f "$TMP/filtered_metadata.tsv" Filter using only metadata and save results as a list of filtered strains. @@ -128,12 +128,27 @@ Filter using only metadata and save results as a list of filtered strains. > --min-length 10500 \ > --output-strains "$TMP/filtered_strains.txt" > /dev/null -Output should include only the 7 sequences matching the filters (without a header line). +Output should include only the 8 sequences matching the filters (without a header line). 
$ wc -l "$TMP/filtered_strains.txt" - \s*7 .* (re) + \s*8 .* (re) $ rm -f "$TMP/filtered_strains.txt" +Filter using min/max date and save filtered sequences in output file. + + $ ${AUGUR} filter \ + > --metadata filter/metadata.tsv \ + > --min-date 2015 \ + > --max-date 2017-01-01 \ + > --output-strains "$TMP/filtered_strains.txt" \ + > --output-log "$TMP/filtered_log.tsv" > /dev/null + +Output should include the 1 sequence that does not have a date, plus a header line. + + $ wc -l "$TMP/filtered_log.tsv" + \s*2 .* (re) + $ rm -f "$TMP/filtered_metadata.tsv" + Filter using only metadata without a sequence index. This should work because the requested filters don't rely on sequence information. diff --git a/tests/test_utils.py b/tests/test_utils.py index 24d94819a..efa3d5b1a 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -133,9 +133,11 @@ def test_is_date_ambiguous(self): def test_to_numeric_date(self): assert utils.to_numeric_date("2019.1") == 2019.1 - assert round(utils.to_numeric_date("2019"), 8) == 2019.00136986 - assert round(utils.to_numeric_date("2019-04"), 8) == 2019.24794521 + assert round(utils.to_numeric_date("2019", ambiguity_resolver="min"), 8) == 2019.00136986 + assert round(utils.to_numeric_date("2019-04", ambiguity_resolver="min"), 8) == 2019.24794521 assert round(utils.to_numeric_date(datetime.date(2019, 4, 11)), 8) == 2019.27534247 + dates = utils.to_numeric_date("2019") + assert [round(dates[0], 8), round(dates[1], 8)] == [2019.00136986, 2019.99863014] with pytest.raises(ValueError): utils.to_numeric_date(False) with pytest.raises(ValueError):