Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add median() to ndframe #176

Merged
merged 2 commits into from
Apr 13, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 84 additions & 0 deletions eland/ndframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,11 +252,95 @@ def min(self, numeric_only=True):
return self._query_compiler.min(numeric_only=numeric_only)

def var(self, numeric_only=True):
    """
    Return variance for each numeric column

    Parameters
    ----------
    numeric_only : bool, default True
        Forwarded unchanged to the query compiler's ``var``.

    Returns
    -------
    pandas.Series
        The value of the variance for each numeric column

    See Also
    --------
    :pandas_api_docs:`pandas.DataFrame.var`

    Examples
    --------
    >>> df = ed.DataFrame('localhost', 'flights')
    >>> df.var()  # doctest: +SKIP
    AvgTicketPrice        7.096185e+04
    Cancelled             1.119831e-01
    DistanceKilometers    2.096049e+07
    DistanceMiles         8.092892e+06
    FlightDelay           1.880825e-01
    FlightDelayMin        9.359209e+03
    FlightTimeHour        3.112545e+01
    FlightTimeMin         1.120516e+05
    dayOfWeek             3.761135e+00
    dtype: float64
    """
    # Thin wrapper: the aggregation itself is performed by the query compiler.
    return self._query_compiler.var(numeric_only=numeric_only)

def std(self, numeric_only=True):
    """
    Return standard deviation for each numeric column

    Parameters
    ----------
    numeric_only : bool, default True
        Forwarded unchanged to the query compiler's ``std``.

    Returns
    -------
    pandas.Series
        The value of the standard deviation for each numeric column

    See Also
    --------
    :pandas_api_docs:`pandas.DataFrame.std`

    Examples
    --------
    >>> df = ed.DataFrame('localhost', 'flights')
    >>> df.std()  # doctest: +SKIP
    AvgTicketPrice         266.386661
    Cancelled                0.334639
    DistanceKilometers    4578.263193
    DistanceMiles         2844.800855
    FlightDelay              0.433685
    FlightDelayMin          96.743006
    FlightTimeHour           5.579019
    FlightTimeMin          334.741135
    dayOfWeek                1.939365
    dtype: float64
    """
    # Thin wrapper: the aggregation itself is performed by the query compiler.
    return self._query_compiler.std(numeric_only=numeric_only)

def median(self, numeric_only=True):
    """
    Return the median value for each numeric column

    Parameters
    ----------
    numeric_only : bool, default True
        Forwarded unchanged to the query compiler's ``median``.

    Returns
    -------
    pandas.Series
        median value for each numeric column

    See Also
    --------
    :pandas_api_docs:`pandas.DataFrame.median`

    Examples
    --------
    >>> df = ed.DataFrame('localhost', 'flights')
    >>> df.median()  # doctest: +SKIP
    AvgTicketPrice         640.387285
    Cancelled                0.000000
    DistanceKilometers    7612.072403
    DistanceMiles         4729.922470
    FlightDelay              0.000000
    FlightDelayMin           0.000000
    FlightTimeHour           8.383113
    FlightTimeMin          503.148975
    dayOfWeek                3.000000
    dtype: float64
    """
    # Thin wrapper: the aggregation itself is performed by the query compiler.
    return self._query_compiler.median(numeric_only=numeric_only)

def max(self, numeric_only=True):
"""
Return the maximum value for each numeric column
Expand Down
16 changes: 13 additions & 3 deletions eland/operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,11 @@ def std(self, query_compiler, numeric_only=True):
numeric_only=numeric_only,
)

def median(self, query_compiler, numeric_only=True):
    """
    Compute the median through the shared metric-aggregation helper.

    The median is requested as the ``("percentiles", "50.0")`` aggregation,
    i.e. the 50th percentile.

    Parameters
    ----------
    query_compiler
        Passed through as the first argument of ``_metric_aggs``.
    numeric_only : bool, default True
        Passed through to ``_metric_aggs`` as a keyword argument.

    Returns
    -------
    Whatever ``_metric_aggs`` returns for this aggregation.
    """
    return self._metric_aggs(
        query_compiler, ("percentiles", "50.0"), numeric_only=numeric_only
    )

def sum(self, query_compiler, numeric_only=True):
    """
    Compute the sum through the shared metric-aggregation helper.

    Parameters
    ----------
    query_compiler
        Passed through as the first argument of ``_metric_aggs``.
    numeric_only : bool, default True
        Passed through to ``_metric_aggs`` as a keyword argument.

    Returns
    -------
    Whatever ``_metric_aggs`` returns for the ``"sum"`` aggregation.
    """
    return self._metric_aggs(query_compiler, "sum", numeric_only=numeric_only)

Expand Down Expand Up @@ -275,9 +280,14 @@ def _metric_aggs(
)
else:
if isinstance(func, tuple):
results[field] = response["aggregations"][
func[0] + "_" + field
][func[1]]
if func[0] == "percentiles":
results[field] = response["aggregations"][
"percentiles_" + field
]["values"]["50.0"]
else:
results[field] = response["aggregations"][
func[0] + "_" + field
][func[1]]
else:
results[field] = response["aggregations"][field]["value"]

Expand Down
3 changes: 3 additions & 0 deletions eland/query_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -469,6 +469,9 @@ def var(self, numeric_only=None):
def std(self, numeric_only=None):
    """
    Delegate the standard-deviation aggregation to the operations object.

    This query compiler is handed to ``self._operations.std`` as its first
    argument; ``numeric_only`` is forwarded unchanged (default ``None``).
    """
    return self._operations.std(self, numeric_only=numeric_only)

def median(self, numeric_only=None):
    """
    Delegate the median aggregation to the operations object.

    This query compiler is handed to ``self._operations.median`` as its first
    argument; ``numeric_only`` is forwarded unchanged (default ``None``).
    """
    return self._operations.median(self, numeric_only=numeric_only)
def sum(self, numeric_only=None):
    """
    Delegate the sum aggregation to the operations object.

    This query compiler is handed to ``self._operations.sum`` as its first
    argument; ``numeric_only`` is forwarded unchanged (default ``None``).
    """
    return self._operations.sum(self, numeric_only=numeric_only)

Expand Down
8 changes: 4 additions & 4 deletions eland/tests/dataframe/test_metrics_pytest.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,16 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# File called _pytest for PyCharm compatability
# File called _pytest for PyCharm compatibility

from pandas.util.testing import assert_series_equal, assert_almost_equal
from pandas.testing import assert_series_equal

from eland.tests.common import TestData


class TestDataFrameMetrics(TestData):
funcs = ["max", "min", "mean", "sum"]
extended_funcs = ["var", "std"]
extended_funcs = ["var", "std", "median"]

def test_flights_metrics(self):
pd_flights = self.pd_flights()
Expand All @@ -41,7 +41,7 @@ def test_flights_extended_metrics(self):
pd_metric = getattr(pd_flights, func)(numeric_only=True)
ed_metric = getattr(ed_flights, func)(numeric_only=True)

assert_almost_equal(pd_metric, ed_metric, check_less_precise=True)
assert_series_equal(pd_metric, ed_metric, check_less_precise=True)

def test_ecommerce_selected_non_numeric_source_fields(self):
# None of these are numeric
Expand Down