Skip to content

Commit

Permalink
dataset query: test backends, engines, parsers; add docstring
Browse files Browse the repository at this point in the history
  • Loading branch information
alimanfoo committed Mar 12, 2021
1 parent 8b542f8 commit a41e805
Show file tree
Hide file tree
Showing 3 changed files with 85 additions and 22 deletions.
1 change: 1 addition & 0 deletions ci/requirements/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ dependencies:
- nc-time-axis
- netcdf4
- numba
- numexpr
- numpy
- pandas
- pint
Expand Down
50 changes: 46 additions & 4 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -6988,7 +6988,52 @@ def query(
missing_dims: str = "raise",
**queries_kwargs: Any,
) -> "Dataset":
"""TODO docstring"""
"""Return a new dataset with each array indexed along the specified
dimension(s), where the indexers are given as strings containing
Python expressions to be evaluated against the data variables in the
dataset.
Parameters
----------
queries : dict, optional
A dict with keys matching dimensions and values given by strings
containing Python expressions to be evaluated against the data variables
in the dataset. The expressions will be evaluated using the pandas
eval() function, and can contain any valid Python expressions but cannot
contain any Python statements.
parser : {"pandas", "python"}, default: "pandas"
The parser to use to construct the syntax tree from the expression.
The default of 'pandas' parses code slightly different than standard
Python. Alternatively, you can parse an expression using the 'python'
parser to retain strict Python semantics.
engine: {"python", "numexpr", None}, default: None
The engine used to evaluate the expression. Supported engines are:
- None: tries to use numexpr, falls back to python
- "numexpr": evaluates expressions using numexpr
- "python": performs operations as if you had eval'd in top level Python
missing_dims : {"raise", "warn", "ignore"}, default: "raise"
What to do if dimensions that should be selected from are not present in the
Dataset:
- "raise": raise an exception
- "warn": raise a warning, and ignore the missing dimensions
- "ignore": ignore the missing dimensions
**queries_kwargs : {dim: query, ...}, optional
The keyword arguments form of ``queries``.
One of queries or queries_kwargs must be provided.
Returns
-------
obj : Dataset
A new Dataset with the same contents as this dataset, except each
array and dimension is indexed by the results of the appropriate
queries.
See Also
--------
Dataset.isel
pandas.eval
"""

# allow queries to be given either as a dict or as kwargs
queries = either_dict_or_kwargs(queries, queries_kwargs, "query")
Expand All @@ -6998,16 +7043,13 @@ def query(
if not isinstance(expr, str):
msg = f"expr for dim {dim} must be a string to be evaluated, {type(expr)} given"
raise ValueError(msg)
# TODO check missing dims here, or delegate to isel?

# evaluate the queries to create the indexers
indexers = {
dim: pd.eval(expr, resolvers=[self], parser=parser, engine=engine)
for dim, expr in queries.items()
}

# TODO any validation of indexers? Or just let isel try to handle it?

# apply the selection
return self.isel(indexers, missing_dims=missing_dims)

Expand Down
56 changes: 38 additions & 18 deletions xarray/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -5807,62 +5807,82 @@ def test_astype_attrs(self):
assert not data.astype(float, keep_attrs=False).attrs
assert not data.astype(float, keep_attrs=False).var1.attrs

@pytest.mark.parametrize("parser", ["pandas", "python"])
@pytest.mark.parametrize("engine", ["python", "numexpr", None])
@pytest.mark.parametrize("backend", ["numpy", "dask"])
def test_query(self, backend, engine, parser):
    """Test Dataset.query across array backends, eval engines and parsers.

    Parametrized over:
    - backend: whether the data variables are plain numpy arrays or
      chunked dask arrays
    - engine: the pandas.eval engine ("python", "numexpr", or None for
      the pandas default)
    - parser: the expression parser ("pandas" or "python")
    """

    # set up test data: two 1-d variables on dim "x", one on dim "y",
    # and one 2-d variable on ("x", "y")
    np.random.seed(42)
    a = np.arange(0, 10, 1)
    b = np.random.randint(0, 100, size=10)
    c = np.linspace(0, 1, 20)
    d = np.arange(0, 200).reshape(10, 20)
    if backend == "numpy":
        ds = Dataset(
            {"a": ("x", a), "b": ("x", b), "c": ("y", c), "d": (("x", "y"), d)}
        )
    elif backend == "dask":
        # same data, but chunked so each variable spans multiple blocks
        ds = Dataset(
            {
                "a": ("x", da.from_array(a, chunks=3)),
                "b": ("x", da.from_array(b, chunks=3)),
                "c": ("y", da.from_array(c, chunks=7)),
                "d": (("x", "y"), da.from_array(d, chunks=(3, 7))),
            }
        )

    # query single dim, single variable
    actual = ds.query(x="a > 5", engine=engine, parser=parser)
    expect = ds.isel(x=(a > 5))
    assert_identical(expect, actual)

    # query single dim, single variable, via dict
    actual = ds.query(dict(x="a > 5"), engine=engine, parser=parser)
    expect = ds.isel(dict(x=(a > 5)))
    assert_identical(expect, actual)

    # query single dim, single variable
    actual = ds.query(x="b > 50", engine=engine, parser=parser)
    expect = ds.isel(x=(b > 50))
    assert_identical(expect, actual)

    # query single dim, single variable
    actual = ds.query(y="c < .5", engine=engine, parser=parser)
    expect = ds.isel(y=(c < 0.5))
    assert_identical(expect, actual)

    # query single dim, multiple variables
    actual = ds.query(x="(a > 5) & (b > 50)", engine=engine, parser=parser)
    expect = ds.isel(x=((a > 5) & (b > 50)))
    assert_identical(expect, actual)

    # the "and" keyword is only valid with the pandas parser
    if parser == "pandas":
        actual = ds.query(x="(a > 5) and (b > 50)", engine=engine, parser=parser)
        expect = ds.isel(x=((a > 5) & (b > 50)))
        assert_identical(expect, actual)

    # query multiple dims via kwargs
    actual = ds.query(x="a > 5", y="c < .5", engine=engine, parser=parser)
    expect = ds.isel(x=(a > 5), y=(c < 0.5))
    assert_identical(expect, actual)

    # query multiple dims via dict
    actual = ds.query(dict(x="a > 5", y="c < .5"), engine=engine, parser=parser)
    expect = ds.isel(dict(x=(a > 5), y=(c < 0.5)))
    assert_identical(expect, actual)

    # test error handling
    with pytest.raises(ValueError):
        ds.query("a > 5")  # must be dict or kwargs, not a bare string
    with pytest.raises(IndexError):
        ds.query(y="a > 5")  # wrong length dimension
    with pytest.raises(IndexError):
        ds.query(x="c < .5")  # wrong length dimension
    with pytest.raises(IndexError):
        ds.query(x="d > 100")  # wrong number of dimensions


# Py.test tests
Expand Down

0 comments on commit a41e805

Please sign in to comment.