Skip to content

Commit

Permalink
fix(python): order .describe() output by percentiles
Browse files (browse the repository at this point in the history)
  • Loading branch information
cmdlineluser committed Jul 31, 2023
1 parent 5fdb8d8 commit 17ba743
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 28 deletions.
22 changes: 17 additions & 5 deletions py-polars/polars/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3887,10 +3887,10 @@ def describe(
│ mean ┆ 2.266667 ┆ 4.5 ┆ 0.666667 ┆ null ┆ null ┆ null │
│ std ┆ 1.101514 ┆ 0.707107 ┆ 0.57735 ┆ null ┆ null ┆ null │
│ min ┆ 1.0 ┆ 4.0 ┆ 0.0 ┆ b ┆ eur ┆ 2020-01-01 │
│ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │
│ median ┆ 2.8 ┆ 4.5 ┆ 1.0 ┆ null ┆ null ┆ null │
│ 25% ┆ 1.0 ┆ 4.0 ┆ null ┆ null ┆ null ┆ null │
│ 50% ┆ 2.8 ┆ 4.5 ┆ 1.0 ┆ null ┆ null ┆ null │
│ 75% ┆ 3.0 ┆ 5.0 ┆ null ┆ null ┆ null ┆ null │
│ max ┆ 3.0 ┆ 5.0 ┆ 1.0 ┆ c ┆ usd ┆ 2022-01-01 │
└────────────┴──────────┴──────────┴──────────┴──────┴──────┴────────────┘
"""
Expand All @@ -3900,7 +3900,7 @@ def describe(
raise ValueError("Percentiles must all be in the range [0, 1].")

# determine metrics (optional/additional percentiles)
metrics = ["count", "null_count", "mean", "std", "min", "max", "median"]
metrics = ["count", "null_count", "mean", "std", "min", "max", "50%"]
percentile_exprs = []
for p in percentiles or ():
percentile_exprs.append(F.all().quantile(p).prefix(f"{p}:"))
Expand All @@ -3924,6 +3924,17 @@ def describe(
df_metrics[(n * n_cols) : (n + 1) * n_cols] for n in range(0, len(metrics))
]

# sort percentiles, put `max` last
metric_idxs, sorted_metrics = zip(
*sorted(
enumerate(metrics),
key=lambda t: (
t[1] == "max",
int(t[1].rstrip("%")) if t[1].endswith("%") else float("nan"),
),
)
)

# cast by column type (numeric/bool -> float), (other -> string)
summary = dict(zip(self.columns, list(zip(*described))))
num_or_bool = NUMERIC_DTYPES | {Boolean}
Expand All @@ -3932,12 +3943,13 @@ def describe(
None
if (v is None or isinstance(v, dict))
else (float(v) if tp in num_or_bool else str(v))
for v in summary[c]
for idx in metric_idxs
for v in [summary[c][idx]]
]

# return results as a frame
df_summary = self.__class__(summary)
df_summary.insert_at_idx(0, pl.Series("describe", metrics))
df_summary.insert_at_idx(0, pl.Series("describe", sorted_metrics))
return df_summary

def find_idx_by_name(self, name: str) -> int:
Expand Down
19 changes: 14 additions & 5 deletions py-polars/polars/series/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1333,8 +1333,8 @@ def describe(
│ std ┆ 1.581139 │
│ min ┆ 1.0 │
│ max ┆ 5.0 │
│ median ┆ 3.0 │
│ 25% ┆ 2.0 │
│ 50% ┆ 3.0 │
│ 75% ┆ 4.0 │
└────────────┴──────────┘
Expand Down Expand Up @@ -1370,11 +1370,20 @@ def describe(
"mean": s.mean(),
"std": s.std(),
"min": s.min(),
"max": s.max(),
"median": s.median(),
}
if percentiles:
stats.update({f"{p:.0%}": s.quantile(p) for p in percentiles})
pcts = {f"{p:.0%}": s.quantile(p) for p in percentiles}
pcts["50%"] = s.median()
stats.update(
{
k: pcts[k]
for k in sorted(pcts, key=lambda pct: int(pct.rstrip("%")))
}
)
else:
stats["50%"] = s.median()

stats["max"] = s.max()

elif self.is_boolean():
stats = {
Expand All @@ -1395,8 +1404,8 @@ def describe(
"count": str(self.len()),
"null_count": str(self.null_count()),
"min": str(self.dt.min()),
"50%": str(self.dt.median()),
"max": str(self.dt.max()),
"median": str(self.dt.median()),
}
else:
raise TypeError("This type is not supported")
Expand Down
38 changes: 24 additions & 14 deletions py-polars/tests/unit/dataframe/test_df.py
Original file line number Diff line number Diff line change
Expand Up @@ -1155,27 +1155,37 @@ def test_describe() -> None:
"mean",
"std",
"min",
"max",
"median",
"25%",
"50%",
"75%",
"max",
],
"a": [
3.0,
0.0,
2.2666666666666666,
1.1015141094572205,
1.0,
1.0,
2.8,
3.0,
3.0,
],
"a": [3.0, 0.0, 2.2666667, 1.101514, 1.0, 3.0, 2.8, 1.0, 3.0],
"b": [3.0, 1.0, 4.5, 0.7071067811865476, 4.0, 5.0, 4.5, 4.0, 5.0],
"b": [3.0, 1.0, 4.5, 0.7071067811865476, 4.0, 4.0, 4.5, 5.0, 5.0],
"c": [
3.0,
0.0,
0.6666666666666666,
0.5773502588272095,
0.0,
1.0,
1.0,
None,
1.0,
None,
1.0,
],
"d": ["3", "1", None, None, "b", "c", None, None, None],
"d": ["3", "1", None, None, "b", None, None, None, "c"],
"e": ["3", "1", None, None, None, None, None, None, None],
"f": ["3", "0", None, None, "2020-01-01", "2022-01-01", None, None, None],
"f": ["3", "0", None, None, "2020-01-01", None, None, None, "2022-01-01"],
}
)
assert_frame_equal(df.describe(), expected)
Expand All @@ -1196,21 +1206,21 @@ def test_describe() -> None:
"mean",
"std",
"min",
"max",
"median",
"25%",
"50%",
"75%",
"max",
],
"numerical": [
4.0,
1.0,
1.3333333333333333,
0.5773502691896257,
1.0,
2.0,
1.0,
1.0,
2.0,
2.0,
],
"struct": ["4", "1", None, None, None, None, None, None, None],
"list": ["4", "1", None, None, None, None, None, None, None],
Expand All @@ -1223,8 +1233,8 @@ def test_describe() -> None:
("mean", 1.3333333333333333, None, None),
("std", 0.5773502691896257, None, None),
("min", 1.0, None, None),
("50%", 1.0, None, None),
("max", 2.0, None, None),
("median", 1.0, None, None),
]

described = df.describe(percentiles=(0.2, 0.4, 0.6, 0.8))
Expand All @@ -1240,12 +1250,12 @@ def test_describe() -> None:
("mean", 1.3333333333333333, None, None),
("std", 0.5773502691896257, None, None),
("min", 1.0, None, None),
("max", 2.0, None, None),
("median", 1.0, None, None),
("20%", 1.0, None, None),
("40%", 1.0, None, None),
("50%", 1.0, None, None),
("60%", 1.0, None, None),
("80%", 2.0, None, None),
("max", 2.0, None, None),
]


Expand Down
7 changes: 3 additions & 4 deletions py-polars/tests/unit/series/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1155,15 +1155,14 @@ def test_describe() -> None:
date_s = pl.Series([date(2021, 1, 1), date(2021, 1, 2), date(2021, 1, 3)])
empty_s = pl.Series(np.empty(0))

pl.DataFrame
assert dict(num_s.describe().rows()) == { # type: ignore[arg-type]
"count": 3.0,
"max": 3.0,
"mean": 2.0,
"min": 1.0,
"null_count": 0.0,
"std": 1.0,
"median": 2.0,
"50%": 2.0,
"25%": 1.0,
"75%": 3.0,
}
Expand All @@ -1174,7 +1173,7 @@ def test_describe() -> None:
"min": 1.3,
"null_count": 0.0,
"std": 3.8109491381194442,
"median": 4.6,
"50%": 4.6,
"25%": 1.3,
"75%": 8.9,
}
Expand All @@ -1192,7 +1191,7 @@ def test_describe() -> None:
"count": "3",
"max": "2021-01-03",
"min": "2021-01-01",
"median": "2021-01-02",
"50%": "2021-01-02",
"null_count": "0",
}

Expand Down

0 comments on commit 17ba743

Please sign in to comment.