diff --git a/doc/_docstrings/objects.Count.ipynb b/doc/_docstrings/objects.Count.ipynb new file mode 100644 index 0000000000..a509a68619 --- /dev/null +++ b/doc/_docstrings/objects.Count.ipynb @@ -0,0 +1,121 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "89113d6b-70b9-4ebe-9910-10a80eab246e", + "metadata": { + "tags": [ + "hide" + ] + }, + "outputs": [], + "source": [ + "import seaborn.objects as so\n", + "from seaborn import load_dataset\n", + "tips = load_dataset(\"tips\")" + ] + }, + { + "cell_type": "raw", + "id": "daf6ff78-df24-4541-ba72-73fb9eddb50d", + "metadata": {}, + "source": [ + "The transform counts distinct observations of the orientation variable and defines a new variable on the opposite axis:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "390f2fd3-0596-40e3-b262-163b3a90d055", + "metadata": {}, + "outputs": [], + "source": [ + "so.Plot(tips, x=\"day\").add(so.Bar(), so.Count())" + ] + }, + { + "cell_type": "raw", + "id": "813fb4a5-db68-4b51-b236-5b5628ebba47", + "metadata": {}, + "source": [ + "When additional mapping variables are defined, they are also used to define groups:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "76a4ae70-e914-4f54-b979-ce1b79374fc3", + "metadata": {}, + "outputs": [], + "source": [ + "so.Plot(tips, x=\"day\", color=\"sex\").add(so.Bar(), so.Count(), so.Dodge())" + ] + }, + { + "cell_type": "raw", + "id": "2973dee1-5aee-4768-846d-22d220faf170", + "metadata": {}, + "source": [ + "Unlike :class:`Hist`, numeric data are not binned before counting:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6f94c5f0-680e-4d8a-a1c9-70876980dd1c", + "metadata": {}, + "outputs": [], + "source": [ + "so.Plot(tips, x=\"size\").add(so.Bar(), so.Count())" + ] + }, + { + "cell_type": "raw", + "id": "11acd5e6-f477-4eb1-b1d7-72f4582bca45", + "metadata": {}, + "source": [ + "When the `y` variable is defined, the counts are assigned to the `x` 
variable:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "924e0e35-210f-4f65-83b4-4aebe41ad264", + "metadata": {}, + "outputs": [], + "source": [ + "so.Plot(tips, y=\"size\").add(so.Bar(), so.Count())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0229fa39-b6dc-48da-9a25-31e25ed34ebc", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "py310", + "language": "python", + "name": "py310" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/doc/api.rst b/doc/api.rst index 79442157b5..41240f0c34 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -86,6 +86,7 @@ Stat objects Agg Est + Count Hist Perc PolyFit diff --git a/doc/whatsnew/v0.12.1.rst b/doc/whatsnew/v0.12.1.rst index 5db8f8644a..2850eaa94e 100644 --- a/doc/whatsnew/v0.12.1.rst +++ b/doc/whatsnew/v0.12.1.rst @@ -8,6 +8,8 @@ v0.12.1 (Unreleased) - |Feature| Added the :class:`objects.Perc` stat (:pr:`3063`). +- |Feature| Added the :class:`objects.Count` stat (:pr:`3086`). + - |Feature| The :class:`objects.Band` and :class:`objects.Range` marks will now cover the full extent of the data if `min` / `max` variables are not explicitly assigned or added in a transform (:pr:`3056`). - |Enhancement| |Defaults| The :class:`objects.Jitter` move now applies a small amount of jitter by default (:pr:`3066`). 
diff --git a/seaborn/_stats/aggregation.py b/seaborn/_stats/aggregation.py index 0dffba6455..e1edeb2ece 100644 --- a/seaborn/_stats/aggregation.py +++ b/seaborn/_stats/aggregation.py @@ -35,7 +35,7 @@ def __call__( res = ( groupby .agg(data, {var: self.func}) - .dropna() + .dropna(subset=[var]) .reset_index(drop=True) ) return res @@ -86,7 +86,7 @@ def __call__( res = ( groupby .apply(data, self._process, var, engine) - .dropna(subset=["x", "y"]) + .dropna(subset=[var]) .reset_index(drop=True) ) diff --git a/seaborn/_stats/histogram.py b/seaborn/_stats/counting.py similarity index 88% rename from seaborn/_stats/histogram.py rename to seaborn/_stats/counting.py index 59b7b12f2e..84ec2be3c1 100644 --- a/seaborn/_stats/histogram.py +++ b/seaborn/_stats/counting.py @@ -1,11 +1,14 @@ from __future__ import annotations from dataclasses import dataclass from warnings import warn +from typing import ClassVar import numpy as np import pandas as pd +from pandas import DataFrame from seaborn._core.groupby import GroupBy +from seaborn._core.scales import Scale from seaborn._stats.base import Stat from typing import TYPE_CHECKING @@ -13,6 +16,37 @@ from numpy.typing import ArrayLike +@dataclass +class Count(Stat): + """ + Count distinct observations within groups. + + See Also + -------- + Hist : A more fully-featured transform including binning and/or normalization. + + Examples + -------- + .. 
include:: ../docstrings/objects.Count.rst + + """ + group_by_orient: ClassVar[bool] = True + + def __call__( + self, data: DataFrame, groupby: GroupBy, orient: str, scales: dict[str, Scale], + ) -> DataFrame: + """Return per-group row counts on the axis opposite to `orient`.""" + var = {"x": "y", "y": "x"}[orient] # axis that receives the counts + data = data.assign(**{var: data[orient]}) # copy: never mutate the caller's frame + res = ( + groupby + .agg(data, {var: len}) + .dropna(subset=["x", "y"]) + .reset_index(drop=True) + ) + return res + + @dataclass class Hist(Stat): """ @@ -167,10 +201,12 @@ def _normalize(self, data): return data.assign(**{self.stat: hist}) - def __call__(self, data, groupby, orient, scales): + def __call__( + self, data: DataFrame, groupby: GroupBy, orient: str, scales: dict[str, Scale], + ) -> DataFrame: scale_type = scales[orient].__class__.__name__.lower() - grouping_vars = [v for v in data if v in groupby.order] + grouping_vars = [str(v) for v in data if v in groupby.order] if not grouping_vars or self.common_bins is True: bin_kws = self._define_bin_params(data, orient, scale_type) data = groupby.apply(data, self._eval, orient, bin_kws) diff --git a/seaborn/distributions.py b/seaborn/distributions.py index a4526e0edf..71e22ff036 100644 --- a/seaborn/distributions.py +++ b/seaborn/distributions.py @@ -20,7 +20,7 @@ # We have moved univariate histogram computation over to the new Hist class, # but still use the older Histogram for bivariate computation. 
from ._statistics import ECDF, Histogram, KDE -from ._stats.histogram import Hist +from ._stats.counting import Hist from .axisgrid import ( FacetGrid, diff --git a/seaborn/objects.py b/seaborn/objects.py index 8037cc53ab..d7931666fb 100644 --- a/seaborn/objects.py +++ b/seaborn/objects.py @@ -37,7 +37,7 @@ from seaborn._stats.base import Stat # noqa: F401 from seaborn._stats.aggregation import Agg, Est # noqa: F401 -from seaborn._stats.histogram import Hist # noqa: F401 +from seaborn._stats.counting import Count, Hist # noqa: F401 from seaborn._stats.order import Perc # noqa: F401 from seaborn._stats.regression import PolyFit # noqa: F401 diff --git a/tests/_stats/test_histogram.py b/tests/_stats/test_counting.py similarity index 86% rename from tests/_stats/test_histogram.py rename to tests/_stats/test_counting.py index f70865f0f2..44a0ae752b 100644 --- a/tests/_stats/test_histogram.py +++ b/tests/_stats/test_counting.py @@ -6,7 +6,45 @@ from numpy.testing import assert_array_equal from seaborn._core.groupby import GroupBy -from seaborn._stats.histogram import Hist +from seaborn._stats.counting import Hist, Count + + +class TestCount: + + @pytest.fixture + def df(self, rng): + """Random 30-row frame: rounded `x`, normal `y`, and two label columns.""" + n_rows = 30 + return pd.DataFrame(dict( + x=rng.uniform(0, 7, n_rows).round(), + y=rng.normal(size=n_rows), + color=rng.choice(["a", "b", "c"], n_rows), + group=rng.choice(["x", "y"], n_rows), + )) + + def get_groupby(self, df, orient): + """Build a GroupBy over every column except the one opposite `orient`.""" + value_var = {"x": "y", "y": "x"}[orient] + grouping_cols = [col for col in df if col != value_var] + return GroupBy(grouping_cols) + + def test_single_grouper(self, df): + """With only the orient column, counts equal a plain groupby size.""" + orient = "x" + frame = df[["x"]] + grouper = self.get_groupby(frame, orient) + counts = Count()(frame, grouper, orient, {}) + expected = frame.groupby("x").size() + assert_array_equal(counts.sort_values("x")["y"], expected) + + def test_multiple_groupers(self, df): + """An extra grouping column splits the counts like a two-key groupby size.""" + orient = "x" + frame = df[["x", "group"]].sort_values("group") + grouper = self.get_groupby(frame, orient) + counts = Count()(frame, grouper, orient, {}) + expected = frame.groupby(["x", "group"]).size() + 
assert_array_equal(counts.sort_values(["x", "group"])["y"], expected) class TestHist: