Skip to content

Commit

Permalink
simplify get_numerical_dates, fix functional tests
Browse files Browse the repository at this point in the history
  • Loading branch information
victorlin committed Aug 19, 2021
1 parent 49fa23e commit 94efe60
Show file tree
Hide file tree
Showing 6 changed files with 41 additions and 59 deletions.
2 changes: 1 addition & 1 deletion augur/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -321,7 +321,7 @@ def filter_by_date(metadata, date_column="date", min_date=None, max_date=None):
if (not min_date and not max_date) or date_column not in metadata.columns:
return strains

dates = get_numerical_dates(metadata, date_col=date_column, fmt="%Y-%m-%d")
dates = get_numerical_dates(metadata, date_col=date_column)
filtered = {strain for strain in strains if dates[strain] is not None}

if min_date:
Expand Down
2 changes: 1 addition & 1 deletion augur/frequencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def format_frequencies(freq):

def run(args):
metadata, columns = read_metadata(args.metadata)
dates = get_numerical_dates(metadata, fmt='%Y-%m-%d')
dates = get_numerical_dates(metadata)
stiffness = args.stiffness
inertia = args.inertia

Expand Down
4 changes: 1 addition & 3 deletions augur/refine.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,6 @@ def register_arguments(parser):
parser.add_argument('--no-covariance', dest='covariance', action='store_false') #If you set help here, it displays 'default: True' - which is confusing!
parser.add_argument('--keep-polytomies', action='store_true', help='Do not attempt to resolve polytomies')
parser.add_argument('--precision', type=int, choices=[0,1,2,3], help="precision used by TreeTime to determine the number of grid points that are used for the evaluation of the branch length interpolation objects. Values range from 0 (rough) to 3 (ultra fine) and default to 'auto'.")
parser.add_argument('--date-format', default="%Y-%m-%d", help="date format")
parser.add_argument('--date-confidence', action="store_true", help="calculate confidence intervals for node dates")
parser.add_argument('--date-inference', default='joint', choices=["joint", "marginal"],
help="assign internal nodes to their marginally most likely dates, not jointly most likely")
Expand Down Expand Up @@ -195,8 +194,7 @@ def run(args):
metadata, columns = read_metadata(args.metadata)
if args.year_bounds:
args.year_bounds.sort()
dates = get_numerical_dates(metadata, fmt=args.date_format,
min_max_year=args.year_bounds)
dates = get_numerical_dates(metadata, min_max_year=args.year_bounds)

# save input state string for later export
for n in T.get_terminals():
Expand Down
63 changes: 15 additions & 48 deletions augur/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,11 +111,7 @@ def is_date_ambiguous(date, ambiguous_by="any"):
"X" in day and ambiguous_by in ("any", "day")
))

def get_numerical_date_from_value(value, fmt=None, min_max_year=None, raise_error=True):
if type(value)!=str:
if raise_error:
raise ValueError(value)
return None
def get_numerical_date_from_value(value, fmt=None, min_max_year=None):
if 'XX' in value:
ambig_date = ambiguous_date_to_date_range(value, fmt, min_max_year)
if ambig_date is None or None in ambig_date:
Expand All @@ -127,45 +123,13 @@ def get_numerical_date_from_value(value, fmt=None, min_max_year=None, raise_erro
except:
return None

def get_numerical_dates(meta_dict, name_col = None, date_col='date', fmt=None, min_max_year=None):
if fmt:
numerical_dates = {}

if isinstance(meta_dict, dict):
for k,m in meta_dict.items():
v = m[date_col]
try:
numerical_dates[k] = get_numerical_date_from_value(
v,
fmt,
min_max_year
)
except ValueError:
print(
"WARNING: %s has an invalid data string: %s"% (k, v),
file=sys.stderr
)
continue
elif isinstance(meta_dict, pd.DataFrame):
strains = meta_dict.index.values
dates = meta_dict[date_col].apply(
lambda date: get_numerical_date_from_value(
date,
fmt,
min_max_year,
raise_error=False
)
).values
numerical_dates = dict(zip(strains, dates))
else:
if isinstance(meta_dict, dict):
numerical_dates = {k:float(v) for k,v in meta_dict.items()}
elif isinstance(meta_dict, pd.DataFrame):
strains = meta_dict.index.values
dates = meta_dict[date_col].astype(float)
numerical_dates = dict(zip(strains, dates))

return numerical_dates
def get_numerical_dates(meta_dict, date_col='date', min_max_year=None):
if isinstance(meta_dict, dict):
return {k:to_numeric_date(m[date_col], min_max_year=min_max_year) for k,m in meta_dict.items()}
if isinstance(meta_dict, pd.DataFrame):
strains = meta_dict.index.values
dates = meta_dict[date_col].apply(lambda date: to_numeric_date(date, min_max_year=min_max_year)).values
return dict(zip(strains, dates))


def to_numeric_date_min(date):
Expand All @@ -176,7 +140,7 @@ def to_numeric_date_max(date):
return to_numeric_date(date, ambiguity_resolver="max")


def to_numeric_date(date, ambiguity_resolver="min"):
def to_numeric_date(date, ambiguity_resolver=None, min_max_year=None):
"""Return numeric date from string, [incomplete] ISO date string, or datetime.date object.
Parameters
Expand All @@ -192,24 +156,27 @@ def to_numeric_date(date, ambiguity_resolver="min"):
Returns
-------
float
float | [float, float] | None
"""
if type(date) is datetime.date:
return numeric_date(date)
if type(date) is str and "." in date:
return float(date)
if type(date) is str:
if ambiguity_resolver not in {"min", "max"}:
if date == '':
return None
if ambiguity_resolver not in {"min", "max", None}:
raise ValueError("Ambiguous date range must be resolved by taking either min or max date.")
ambiguous_date_str = generate_ambiguous_date_str(date)
ambiguous_date_resolved = get_numerical_date_from_value(ambiguous_date_str, "%Y-%m-%d")
ambiguous_date_resolved = get_numerical_date_from_value(ambiguous_date_str, "%Y-%m-%d", min_max_year)
if type(ambiguous_date_resolved) is float:
return ambiguous_date_resolved
if type(ambiguous_date_resolved) is list:
if ambiguity_resolver == "min":
return ambiguous_date_resolved[0]
if ambiguity_resolver == "max":
return ambiguous_date_resolved[1]
return ambiguous_date_resolved
raise ValueError(f"Unparsable date value: {date!r}")


Expand Down
23 changes: 19 additions & 4 deletions tests/functional/filter.t
Original file line number Diff line number Diff line change
Expand Up @@ -113,10 +113,10 @@ Filter using only metadata without sequence input or output and save results as
> --min-length 10500 \
> --output-metadata "$TMP/filtered_metadata.tsv" > /dev/null

Output should include the 7 sequences matching the filters and a header line.
Output should include the 8 sequences matching the filters and a header line.

$ wc -l "$TMP/filtered_metadata.tsv"
\s*8 .* (re)
\s*9 .* (re)
$ rm -f "$TMP/filtered_metadata.tsv"

Filter using only metadata and save results as a list of filtered strains.
Expand All @@ -128,12 +128,27 @@ Filter using only metadata and save results as a list of filtered strains.
> --min-length 10500 \
> --output-strains "$TMP/filtered_strains.txt" > /dev/null

Output should include only the 7 sequences matching the filters (without a header line).
Output should include only the 8 sequences matching the filters (without a header line).

$ wc -l "$TMP/filtered_strains.txt"
\s*7 .* (re)
\s*8 .* (re)
$ rm -f "$TMP/filtered_strains.txt"

Filter using min/max date and save the filtered strains and a filter log to output files.

$ ${AUGUR} filter \
> --metadata filter/metadata.tsv \
> --min-date 2015 \
> --max-date 2017-01-01 \
> --output-strains "$TMP/filtered_strains.txt" \
> --output-log "$TMP/filtered_log.tsv" > /dev/null

Output should include the 1 sequence that does not have a date, plus a header line.

$ wc -l "$TMP/filtered_log.tsv"
\s*2 .* (re)
$ rm -f "$TMP/filtered_metadata.tsv"

Filter using only metadata without a sequence index.
This should work because the requested filters don't rely on sequence information.

Expand Down
6 changes: 4 additions & 2 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,9 +133,11 @@ def test_is_date_ambiguous(self):

def test_to_numeric_date(self):
assert utils.to_numeric_date("2019.1") == 2019.1
assert round(utils.to_numeric_date("2019"), 8) == 2019.00136986
assert round(utils.to_numeric_date("2019-04"), 8) == 2019.24794521
assert round(utils.to_numeric_date("2019", ambiguity_resolver="min"), 8) == 2019.00136986
assert round(utils.to_numeric_date("2019-04", ambiguity_resolver="min"), 8) == 2019.24794521
assert round(utils.to_numeric_date(datetime.date(2019, 4, 11)), 8) == 2019.27534247
dates = utils.to_numeric_date("2019")
assert [round(dates[0], 8), round(dates[1], 8)] == [2019.00136986, 2019.99863014]
with pytest.raises(ValueError):
utils.to_numeric_date(False)
with pytest.raises(ValueError):
Expand Down

0 comments on commit 94efe60

Please sign in to comment.