diff --git a/eland/ndframe.py b/eland/ndframe.py
index d1c93722..587ed3ba 100644
--- a/eland/ndframe.py
+++ b/eland/ndframe.py
@@ -252,11 +252,95 @@ def min(self, numeric_only=True):
         return self._query_compiler.min(numeric_only=numeric_only)
 
     def var(self, numeric_only=True):
+        """
+        Return variance for each numeric column
+
+        Returns
+        -------
+        pandas.Series
+            The value of the variance for each numeric column
+
+        See Also
+        --------
+        :pandas_api_docs:`pandas.DataFrame.var`
+
+        Examples
+        --------
+        >>> df = ed.DataFrame('localhost', 'flights')
+        >>> df.var()  # doctest: +SKIP
+        AvgTicketPrice        7.096185e+04
+        Cancelled             1.119831e-01
+        DistanceKilometers    2.096049e+07
+        DistanceMiles         8.092892e+06
+        FlightDelay           1.880825e-01
+        FlightDelayMin        9.359209e+03
+        FlightTimeHour        3.112545e+01
+        FlightTimeMin         1.120516e+05
+        dayOfWeek             3.761135e+00
+        dtype: float64
+        """
         return self._query_compiler.var(numeric_only=numeric_only)
 
     def std(self, numeric_only=True):
+        """
+        Return standard deviation for each numeric column
+
+        Returns
+        -------
+        pandas.Series
+            The value of the standard deviation for each numeric column
+
+        See Also
+        --------
+        :pandas_api_docs:`pandas.DataFrame.std`
+
+        Examples
+        --------
+        >>> df = ed.DataFrame('localhost', 'flights')
+        >>> df.std()  # doctest: +SKIP
+        AvgTicketPrice         266.386661
+        Cancelled                0.334639
+        DistanceKilometers    4578.263193
+        DistanceMiles         2844.800855
+        FlightDelay              0.433685
+        FlightDelayMin          96.743006
+        FlightTimeHour           5.579019
+        FlightTimeMin          334.741135
+        dayOfWeek                1.939365
+        dtype: float64
+        """
         return self._query_compiler.std(numeric_only=numeric_only)
 
+    def median(self, numeric_only=True):
+        """
+        Return the median value for each numeric column
+
+        Returns
+        -------
+        pandas.Series
+            median value for each numeric column
+
+        See Also
+        --------
+        :pandas_api_docs:`pandas.DataFrame.median`
+
+        Examples
+        --------
+        >>> df = ed.DataFrame('localhost', 'flights')
+        >>> df.median()  # doctest: +SKIP
+        AvgTicketPrice         640.387285
+        Cancelled                0.000000
+        DistanceKilometers    7612.072403
+        DistanceMiles         4729.922470
+        FlightDelay              0.000000
+        FlightDelayMin           0.000000
+        FlightTimeHour           8.383113
+        FlightTimeMin          503.148975
+        dayOfWeek                3.000000
+        dtype: float64
+        """
+        return self._query_compiler.median(numeric_only=numeric_only)
+
     def max(self, numeric_only=True):
         """
         Return the maximum value for each numeric column
diff --git a/eland/operations.py b/eland/operations.py
index a650d461..8ae4f96e 100644
--- a/eland/operations.py
+++ b/eland/operations.py
@@ -135,6 +135,11 @@ def std(self, query_compiler, numeric_only=True):
             numeric_only=numeric_only,
         )
 
+    def median(self, query_compiler, numeric_only=True):
+        return self._metric_aggs(
+            query_compiler, ("percentiles", "50.0"), numeric_only=numeric_only
+        )
+
     def sum(self, query_compiler, numeric_only=True):
         return self._metric_aggs(query_compiler, "sum", numeric_only=numeric_only)
 
@@ -275,9 +280,15 @@ def _metric_aggs(
                     )
                 else:
                     if isinstance(func, tuple):
-                        results[field] = response["aggregations"][
-                            func[0] + "_" + field
-                        ][func[1]]
+                        if func[0] == "percentiles":
+                            # percentiles nest results under "values", keyed by percent
+                            results[field] = response["aggregations"][
+                                "percentiles_" + field
+                            ]["values"][func[1]]
+                        else:
+                            results[field] = response["aggregations"][
+                                func[0] + "_" + field
+                            ][func[1]]
                     else:
                         results[field] = response["aggregations"][field]["value"]
 
diff --git a/eland/query_compiler.py b/eland/query_compiler.py
index 011d5dc8..6b8a5f32 100644
--- a/eland/query_compiler.py
+++ b/eland/query_compiler.py
@@ -469,6 +469,9 @@ def var(self, numeric_only=None):
     def std(self, numeric_only=None):
         return self._operations.std(self, numeric_only=numeric_only)
 
+    def median(self, numeric_only=None):
+        return self._operations.median(self, numeric_only=numeric_only)
+
     def sum(self, numeric_only=None):
         return self._operations.sum(self, numeric_only=numeric_only)
 
diff --git a/eland/tests/dataframe/test_metrics_pytest.py b/eland/tests/dataframe/test_metrics_pytest.py
index bb55cc1c..65561e91 100644
--- a/eland/tests/dataframe/test_metrics_pytest.py
+++ b/eland/tests/dataframe/test_metrics_pytest.py
@@ -12,16 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# File called _pytest for PyCharm compatability
+# File called _pytest for PyCharm compatibility
 
-from pandas.util.testing import assert_series_equal, assert_almost_equal
+from pandas.testing import assert_series_equal
 
 from eland.tests.common import TestData
 
 
 class TestDataFrameMetrics(TestData):
     funcs = ["max", "min", "mean", "sum"]
-    extended_funcs = ["var", "std"]
+    extended_funcs = ["var", "std", "median"]
 
     def test_flights_metrics(self):
         pd_flights = self.pd_flights()
@@ -41,7 +41,7 @@ def test_flights_extended_metrics(self):
             pd_metric = getattr(pd_flights, func)(numeric_only=True)
             ed_metric = getattr(ed_flights, func)(numeric_only=True)
 
-            assert_almost_equal(pd_metric, ed_metric, check_less_precise=True)
+            assert_series_equal(pd_metric, ed_metric, check_less_precise=True)
 
     def test_ecommerce_selected_non_numeric_source_fields(self):
         # None of these are numeric