From f0a69dac74433b57090dc923736093af36a97920 Mon Sep 17 00:00:00 2001 From: Michael Waskom Date: Sun, 23 Oct 2022 17:00:20 -0400 Subject: [PATCH 1/6] Add KDE stat --- seaborn/_stats/base.py | 13 ++- seaborn/_stats/counting.py | 24 ++--- seaborn/_stats/density.py | 212 +++++++++++++++++++++++++++++++++++++ seaborn/external/kde.py | 3 +- seaborn/objects.py | 1 + 5 files changed, 232 insertions(+), 21 deletions(-) create mode 100644 seaborn/_stats/density.py diff --git a/seaborn/_stats/base.py b/seaborn/_stats/base.py index 6584a8e61a..3ced7b1069 100644 --- a/seaborn/_stats/base.py +++ b/seaborn/_stats/base.py @@ -3,6 +3,7 @@ from collections.abc import Iterable from dataclasses import dataclass from typing import ClassVar, Any +import warnings from typing import TYPE_CHECKING if TYPE_CHECKING: @@ -29,7 +30,7 @@ class Stat: # value on the orient axis, but we would not in the latter case. group_by_orient: ClassVar[bool] = False - def _check_param_one_of(self, param: Any, options: Iterable[Any]) -> None: + def _check_param_one_of(self, param: str, options: Iterable[Any]) -> None: """Raise when parameter value is not one of a specified set.""" value = getattr(self, param) if value not in options: @@ -41,6 +42,16 @@ def _check_param_one_of(self, param: Any, options: Iterable[Any]) -> None: ]) raise ValueError(err) + def _check_grouping_vars(self, param: str, data_vars: list[str]) -> None: + """Warn if vars are named in parameter without being present in the data.""" + param_vars = getattr(self, param) + undefined = set(param_vars) - set(data_vars) + if undefined: + param = f"{self.__class__.__name__}.{param}" + names = ", ".join(f"{x!r}" for x in undefined) + msg = f"Undefined variables(s) passed for {param}: {names}." + warnings.warn(msg, stacklevel=2) + def __call__( self, data: DataFrame, diff --git a/seaborn/_stats/counting.py b/seaborn/_stats/counting.py index 84ec2be3c1..3faac5fb36 100644 --- a/seaborn/_stats/counting.py +++ b/seaborn/_stats/counting.py @@ -1,6 +1,5 @@ from __future__ import annotations from dataclasses import dataclass -from warnings import warn from typing import ClassVar import numpy as np @@ -86,7 +85,6 @@ class Hist(Stat): Notes ----- - The choice of bins for computing and plotting a histogram can exert substantial influence on the insights that one is able to draw from the visualization. If the bins are too large, they may erase important features. @@ -100,7 +98,6 @@ class Hist(Stat): by setting the total number of bins to use, the width of each bin, or the specific locations where the bins should break. - Examples -------- .. include:: ../docstrings/objects.Hist.rst @@ -215,12 +212,8 @@ def __call__( bin_groupby = GroupBy(grouping_vars) else: bin_groupby = GroupBy(self.common_bins) - undefined = set(self.common_bins) - set(grouping_vars) - if undefined: - param = f"{self.__class__.__name__}.common_bins" - names = ", ".join(f"{x!r}" for x in undefined) - msg = f"Undefined variables(s) passed to `{param}`: {names}." - warn(msg) + self._check_grouping_vars("common_bins", grouping_vars) + data = bin_groupby.apply( data, self._get_bins_and_eval, orient, groupby, scale_type, ) @@ -229,16 +222,11 @@ def __call__( data = self._normalize(data) else: if self.common_norm is False: - norm_grouper = grouping_vars + norm_groupby = GroupBy(grouping_vars) else: - norm_grouper = self.common_norm - undefined = set(self.common_norm) - set(grouping_vars) - if undefined: - param = f"{self.__class__.__name__}.common_norm" - names = ", ".join(f"{x!r}" for x in undefined) - msg = f"Undefined variables(s) passed to `{param}`: {names}." - warn(msg) - data = GroupBy(norm_grouper).apply(data, self._normalize) + norm_groupby = GroupBy(self.common_norm) + self._check_grouping_vars("common_norm", grouping_vars) + data = norm_groupby.apply(data, self._normalize) other = {"x": "y", "y": "x"}[orient] return data.assign(**{other: data[self.stat]}) diff --git a/seaborn/_stats/density.py b/seaborn/_stats/density.py new file mode 100644 index 0000000000..28c451f493 --- /dev/null +++ b/seaborn/_stats/density.py @@ -0,0 +1,212 @@ +from __future__ import annotations +from dataclasses import dataclass +from typing import Any, Callable + +import numpy as np +from numpy import ndarray +import pandas as pd +from pandas import DataFrame +try: + from scipy.stats import gaussian_kde + _no_scipy = False +except ImportError: + from .external.kde import gaussian_kde + _no_scipy = True + +from seaborn._core.groupby import GroupBy +from seaborn._core.scales import Scale +from seaborn._stats.base import Stat + + +@dataclass +class KDE(Stat): + """ + Compute a univariate kernel density estimate. + + Parameters + ---------- + bw_adjust : float + Factor that multiplicatively scales the value chosen using + ``bw_method``. Increasing will make the curve smoother. See Notes. + bw_method : string, scalar, or callable + Method for determining the smoothing bandwidth to use; passed to + :class:`scipy.stats.gaussian_kde`. + common_norm : bool or list of variables + If `True`, normalize so that the sum of all curves sums to 1. + If `False`, normalize each curve independently. If a list, defines + variable(s) to group by and normalize within. + common_grid : bool or list of variables + If `True`, all curves will share the same evaluation grid. + If `False`, each evaluation grid is independent. If a list, defines + variable(s) to group by and share a grid within. + cut : float + Factor, multiplied by the smoothing bandwidth, that determines how far + the evaluation grid extends past the extreme datapoints. When set to 0, + the curve is trunacted at the data limits. + gridsize : int or None + Number of points in the evaluation grid. If None, the + density is evaluated at the original datapoints. + cumulative : bool + If True, estimate a cumulative distribution function. Requires scipy. + + Notes + ----- + The *bandwidth*, or standard deviation of the smoothing kernel, is an + important parameter. Misspecification of the bandwidth can produce a + distorted representation of the data. Much like the choice of bin width in a + histogram, an over-smoothed curve can erase true features of a distribution, + while an under-smoothed curve can create false features out of random + variability. The rule-of-thumb that sets the default bandwidth works best + when the true distribution is smooth, unimodal, and roughly bell-shaped. It + is always a good idea to check the default behavior by using `bw_adjust` to + increase or decrease the amount of smoothing. + + Because the smoothing is performed with a Gaussian kernel, the estimated + density curve can extend to values that do not make sense for a particular + dataset. For example, the curve may be drawn over negative values when + smoothing data that are naturally positive. The `cut` parameter can be used + to control the extent of the curve, but datasets that have many observations + close to a natural boundary may be better served by a different transform. + + Similar considerations apply when a dataset is naturally discrete or "spiky" + (containing many repeated observations of the same value). KDEs will always + produce a smooth curve, which would be misleading in these situations. + + The units on the density axis are a common source of confusion. While kernel + density estimation produces a probability distribution, the height of the curve + at each point gives a density, not a probability. A probability can be obtained + only by integrating the density across a range. The curve is normalized so + that the integral over all possible values is 1, meaning that the scale of + the density axis depends on the data values. + + Examples + -------- + .. include:: ../docstrings/objects.KDE.rst + + """ + bw_adjust: float = 1 + bw_method: str | float | Callable[[gaussian_kde], float] = "scott" + common_norm: bool | list[str] = True + common_grid: bool | list[str] = True + cut: float = 3 + gridsize: int | None = 200 + cumulative: bool = False + + def __post_init__(self): + + if self.cumulative and _no_scipy: + raise RuntimeError("Cumulative KDE evaluation requires scipy") + + def _check_var_list_or_boolean(self, param: str, grouping_vars: Any) -> None: + """Do input checks on grouping parameters.""" + value = getattr(self, param) + if not ( + isinstance(value, bool) + or (isinstance(value, list) and all(isinstance(v, str) for v in value)) + ): + param_name = f"{self.__class__.__name__}.{param}" + raise TypeError(f"{param_name} must be a boolean or list of strings.") + self._check_grouping_vars(param, grouping_vars) + + def _fit(self, data: DataFrame, orient: str) -> gaussian_kde: + """Fit and return a KDE object.""" + # TODO need to handle singular data + + fit_kws: dict[str, Any] = {"bw_method": self.bw_method} + if "weight" in data: + fit_kws["weights"] = data["weight"] + kde = gaussian_kde(data[orient], **fit_kws) + kde.set_bandwidth(kde.factor * self.bw_adjust) + + return kde + + def _get_support(self, data: DataFrame, orient: str) -> ndarray: + """Define the grid that the KDE will be evaluated on.""" + if self.gridsize is None: + return data[orient].to_numpy() + + kde = self._fit(data, orient) + bw = np.sqrt(kde.covariance.squeeze()) + gridmin = data[orient].min() - bw * self.cut + gridmax = data[orient].max() + bw * self.cut + return np.linspace(gridmin, gridmax, self.gridsize) + + def _fit_and_evaluate( + self, data: DataFrame, orient: str, support: ndarray + ) -> DataFrame: + """Transform single group by fitting a KDE and evaluating on a support grid.""" + empty = pd.DataFrame(columns=[orient, "density", "weight"], dtype=float) + if len(data) < 2: + return empty + try: + kde = self._fit(data, orient) + except np.linalg.LinAlgError: + return empty + if self.cumulative: + s_0 = support[0] + density = np.array([kde.integrate_box_1d(s_0, s_i) for s_i in support]) + else: + density = kde(support) + weight = data["weight"].sum() + return pd.DataFrame({orient: support, "density": density, "weight": weight}) + + def _transform( + self, data: DataFrame, orient: str, grouping_vars: list[str] + ) -> DataFrame: + """Transform multiple groups by fitting KDEs and evaluating.""" + empty = pd.DataFrame(columns=[*data.columns, "density"], dtype=float) + if len(data) < 2: + return empty + try: + support = self._get_support(data, orient) + except np.linalg.LinAlgError: + return empty + grouping_vars = [x for x in grouping_vars if data[x].nunique() > 1] + if not grouping_vars: + return self._fit_and_evaluate(data, orient, support) + groupby = GroupBy(grouping_vars) + return groupby.apply(data, self._fit_and_evaluate, orient, support) + + def __call__( + self, data: DataFrame, groupby: GroupBy, orient: str, scales: dict[str, Scale], + ) -> DataFrame: + + if "weight" not in data: + data = data.assign(weight=1) + + # Transform each group separately + grouping_vars = [str(v) for v in data if v in groupby.order] + if not grouping_vars or self.common_grid is True: + res = self._transform(data, orient, grouping_vars) + else: + if self.common_grid is False: + grid_vars = grouping_vars + else: + self._check_var_list_or_boolean("common_grid", grouping_vars) + grid_vars = [v for v in self.common_grid if v in grouping_vars] + + res = ( + GroupBy(grid_vars) + .apply(data, self._transform, orient, grouping_vars) + ) + + # Normalize, potentially within groups + if not grouping_vars or self.common_norm is True: + res = res.assign(group_weight=data["weight"].sum()) + else: + if self.common_norm is False: + norm_vars = grouping_vars + else: + self._check_var_list_or_boolean("common_norm", grouping_vars) + norm_vars = [v for v in self.common_norm if v in grouping_vars] + + res = res.join( + data.groupby(norm_vars)["weight"].sum().rename("group_weight"), + on=norm_vars, + ) + + value_var = {"x": "y", "y": "x"}[orient] + res["density"] *= res.eval("weight / group_weight") + res[value_var] = res["density"] + + return res diff --git a/seaborn/external/kde.py b/seaborn/external/kde.py index 34bb1ad984..6add4e1912 100644 --- a/seaborn/external/kde.py +++ b/seaborn/external/kde.py @@ -71,8 +71,7 @@ import numpy as np from numpy import (asarray, atleast_2d, reshape, zeros, newaxis, dot, exp, pi, - sqrt, ravel, power, atleast_1d, squeeze, sum, transpose, - ones, cov) + sqrt, power, atleast_1d, sum, ones, cov) from numpy import linalg diff --git a/seaborn/objects.py b/seaborn/objects.py index d7931666fb..27e4c38155 100644 --- a/seaborn/objects.py +++ b/seaborn/objects.py @@ -38,6 +38,7 @@ from seaborn._stats.base import Stat # noqa: F401 from seaborn._stats.aggregation import Agg, Est # noqa: F401 from seaborn._stats.counting import Count, Hist # noqa: F401 +from seaborn._stats.density import KDE # noqa: F401 from seaborn._stats.order import Perc # noqa: F401 from seaborn._stats.regression import PolyFit # noqa: F401 From 5e1b50c30e0984f90af7d702980fad7b02f6af01 Mon Sep 17 00:00:00 2001 From: Michael Waskom Date: Sun, 23 Oct 2022 22:55:32 -0400 Subject: [PATCH 2/6] Add some tests for KDE --- doc/api.rst | 1 + seaborn/_stats/density.py | 15 ++-- tests/_stats/test_density.py | 148 +++++++++++++++++++++++++++++++++++ 3 files changed, 158 insertions(+), 6 deletions(-) create mode 100644 tests/_stats/test_density.py diff --git a/doc/api.rst b/doc/api.rst index 41240f0c34..c82bdcaa4a 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -88,6 +88,7 @@ Stat objects Est Count Hist + KDE Perc PolyFit diff --git a/seaborn/_stats/density.py b/seaborn/_stats/density.py index 28c451f493..2a5e2610eb 100644 --- a/seaborn/_stats/density.py +++ b/seaborn/_stats/density.py @@ -39,13 +39,13 @@ class KDE(Stat): If `True`, all curves will share the same evaluation grid. If `False`, each evaluation grid is independent. If a list, defines variable(s) to group by and share a grid within. + gridsize : int or None + Number of points in the evaluation grid. If None, the + density is evaluated at the original datapoints. cut : float Factor, multiplied by the smoothing bandwidth, that determines how far the evaluation grid extends past the extreme datapoints. When set to 0, the curve is trunacted at the data limits. - gridsize : int or None - Number of points in the evaluation grid. If None, the - density is evaluated at the original datapoints. cumulative : bool If True, estimate a cumulative distribution function. Requires scipy. @@ -88,8 +88,8 @@ class KDE(Stat): bw_method: str | float | Callable[[gaussian_kde], float] = "scott" common_norm: bool | list[str] = True common_grid: bool | list[str] = True - cut: float = 3 gridsize: int | None = 200 + cut: float = 3 cumulative: bool = False def __post_init__(self): @@ -135,20 +135,22 @@ def _fit_and_evaluate( self, data: DataFrame, orient: str, support: ndarray ) -> DataFrame: """Transform single group by fitting a KDE and evaluating on a support grid.""" - empty = pd.DataFrame(columns=[orient, "density", "weight"], dtype=float) + empty = pd.DataFrame(columns=[orient, "weight", "density"], dtype=float) if len(data) < 2: return empty try: kde = self._fit(data, orient) except np.linalg.LinAlgError: return empty + if self.cumulative: s_0 = support[0] density = np.array([kde.integrate_box_1d(s_0, s_i) for s_i in support]) else: density = kde(support) + weight = data["weight"].sum() - return pd.DataFrame({orient: support, "density": density, "weight": weight}) + return pd.DataFrame({orient: support, "weight": weight, "density": density}) def _transform( self, data: DataFrame, orient: str, grouping_vars: list[str] @@ -161,6 +163,7 @@ def _transform( support = self._get_support(data, orient) except np.linalg.LinAlgError: return empty + grouping_vars = [x for x in grouping_vars if data[x].nunique() > 1] if not grouping_vars: return self._fit_and_evaluate(data, orient, support) diff --git a/tests/_stats/test_density.py b/tests/_stats/test_density.py new file mode 100644 index 0000000000..dc2d217c1d --- /dev/null +++ b/tests/_stats/test_density.py @@ -0,0 +1,148 @@ +import numpy as np +import pandas as pd + +import pytest +from numpy.testing import assert_array_equal, assert_array_almost_equal + +from seaborn._core.groupby import GroupBy +from seaborn._stats.density import KDE, _no_scipy + + +class TestKDE: + + @pytest.fixture + def df(self, rng): + + n = 100 + return pd.DataFrame(dict( + x=rng.uniform(0, 7, n).round(), + y=rng.normal(size=n), + color=rng.choice(["a", "b", "c"], n), + alpha=rng.choice(["x", "y"], n), + )) + + def get_groupby(self, df, orient): + + cols = [c for c in df if c != orient] + return GroupBy([*cols, "group"]) + + def integrate(self, y, x): + y = np.asarray(y) + x = np.asarray(x) + dx = np.diff(x) + return (dx * y[:-1] + dx * y[1:]).sum() / 2 + + @pytest.mark.parametrize("ori", ["x", "y"]) + def test_columns(self, df, ori): + + df = df[[ori, "alpha"]] + gb = self.get_groupby(df, ori) + res = KDE()(df, gb, ori, {}) + other = {"x": "y", "y": "x"}[ori] + expected = [ori, "alpha", "weight", "density", "group_weight", other] + assert list(res.columns) == expected + + @pytest.mark.parametrize("gridsize", [20, 30, None]) + def test_gridsize(self, df, gridsize): + + ori = "y" + df = df[[ori]] + gb = self.get_groupby(df, ori) + res = KDE(gridsize=gridsize)(df, gb, ori, {}) + if gridsize is None: + assert_array_equal(res[ori], df[ori]) + else: + assert len(res) == gridsize + + @pytest.mark.parametrize("cut", [1, 2]) + def test_cut(self, df, cut): + + ori = "y" + df = df[[ori]] + gb = self.get_groupby(df, ori) + res = KDE(cut=cut, bw_method=1)(df, gb, ori, {}) + + vals = df[ori] + bw = vals.std() + assert res[ori].min() == pytest.approx(vals.min() - bw * cut, abs=1e-2) + assert res[ori].max() == pytest.approx(vals.max() + bw * cut, abs=1e-2) + + @pytest.mark.parametrize("common_grid", [True, False]) + def test_common_grid(self, df, common_grid): + + ori = "y" + df = df[[ori, "alpha"]] + gb = self.get_groupby(df, ori) + res = KDE(common_grid=common_grid)(df, gb, ori, {}) + + vals = df["alpha"].unique() + a = res.loc[res["alpha"] == vals[0], ori].to_numpy() + b = res.loc[res["alpha"] == vals[1], ori].to_numpy() + if common_grid: + assert_array_equal(a, b) + else: + assert np.not_equal(a, b).all() + + @pytest.mark.parametrize("common_norm", [True, False]) + def test_common_norm(self, df, common_norm): + + ori = "y" + df = df[[ori, "alpha"]] + gb = self.get_groupby(df, ori) + res = KDE(common_norm=common_norm)(df, gb, ori, {}) + + areas = ( + res.groupby("alpha") + .apply(lambda x: self.integrate(x["density"], x[ori])) + ) + + if common_norm: + assert areas.sum() == pytest.approx(1, abs=1e-3) + else: + assert_array_almost_equal(areas, [1, 1], decimal=3) + + def test_bw_adjust(self, df): + + ori = "y" + df = df[[ori]] + gb = self.get_groupby(df, ori) + res1 = KDE(bw_adjust=0.5)(df, gb, ori, {}) + res2 = KDE(bw_adjust=2.0)(df, gb, ori, {}) + + mad1 = res1["density"].diff().abs().mean() + mad2 = res2["density"].diff().abs().mean() + assert mad1 > mad2 + + def test_bw_method_scalar(self, df): + + ori = "y" + df = df[[ori]] + gb = self.get_groupby(df, ori) + res1 = KDE(bw_method=0.5)(df, gb, ori, {}) + res2 = KDE(bw_method=2.0)(df, gb, ori, {}) + + mad1 = res1["density"].diff().abs().mean() + mad2 = res2["density"].diff().abs().mean() + assert mad1 > mad2 + + @pytest.mark.skipif(_no_scipy, reason="KDE.cumulative requires scipy") + @pytest.mark.parametrize("common_norm", [True, False]) + def test_cumulative(self, df, common_norm): + + ori = "y" + df = df[[ori, "alpha"]] + gb = self.get_groupby(df, ori) + + res = KDE(cumulative=True, common_norm=common_norm)(df, gb, ori, {}) + + for _, group_res in res.groupby("alpha"): + assert (group_res["density"].diff().dropna() >= 0).all() + if not common_norm: + assert group_res["density"].max() == pytest.approx(1, abs=1e-3) + + def test_cumulative_requires_scipy(self): + + if _no_scipy: + err = "Cumulative KDE evaluation requires scipy" + with pytest.raises(RuntimeError, match=err): + KDE(cumulative=True) From 0009e7c20ba8c82518bf1c9322cd4679cce33a9d Mon Sep 17 00:00:00 2001 From: Michael Waskom Date: Mon, 24 Oct 2022 08:02:36 -0400 Subject: [PATCH 3/6] Additional testing coverage --- Makefile | 2 +- seaborn/_stats/base.py | 8 +++-- seaborn/_stats/density.py | 51 +++++++++++++++--------------- tests/_stats/test_counting.py | 4 +-- tests/_stats/test_density.py | 58 +++++++++++++++++++++++++++++++++-- 5 files changed, 89 insertions(+), 34 deletions(-) diff --git a/Makefile b/Makefile index 0d2b7247a1..86b406ee55 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ export SHELL := /bin/bash test: - pytest -n auto --cov=seaborn --cov=tests --cov-config=.coveragerc tests + pytest -n auto --cov=seaborn --cov=tests --cov-config=setup.cfg tests lint: flake8 seaborn diff --git a/seaborn/_stats/base.py b/seaborn/_stats/base.py index 3ced7b1069..b80b228165 100644 --- a/seaborn/_stats/base.py +++ b/seaborn/_stats/base.py @@ -42,15 +42,17 @@ def _check_param_one_of(self, param: str, options: Iterable[Any]) -> None: ]) raise ValueError(err) - def _check_grouping_vars(self, param: str, data_vars: list[str]) -> None: + def _check_grouping_vars( + self, param: str, data_vars: list[str], stacklevel: int = 2, + ) -> None: """Warn if vars are named in parameter without being present in the data.""" param_vars = getattr(self, param) undefined = set(param_vars) - set(data_vars) if undefined: param = f"{self.__class__.__name__}.{param}" names = ", ".join(f"{x!r}" for x in undefined) - msg = f"Undefined variables(s) passed for {param}: {names}." - warnings.warn(msg, stacklevel=2) + msg = f"Undefined variable(s) passed for {param}: {names}." + warnings.warn(msg, stacklevel=stacklevel) def __call__( self, diff --git a/seaborn/_stats/density.py b/seaborn/_stats/density.py index 2a5e2610eb..733601c2e9 100644 --- a/seaborn/_stats/density.py +++ b/seaborn/_stats/density.py @@ -29,8 +29,8 @@ class KDE(Stat): Factor that multiplicatively scales the value chosen using ``bw_method``. Increasing will make the curve smoother. See Notes. bw_method : string, scalar, or callable - Method for determining the smoothing bandwidth to use; passed to - :class:`scipy.stats.gaussian_kde`. + Method for determining the smoothing bandwidth to use. Passed directly + to :class:`scipy.stats.gaussian_kde`; see there for options. common_norm : bool or list of variables If `True`, normalize so that the sum of all curves sums to 1. If `False`, normalize each curve independently. If a list, defines @@ -40,37 +40,34 @@ class KDE(Stat): If `False`, each evaluation grid is independent. If a list, defines variable(s) to group by and share a grid within. gridsize : int or None - Number of points in the evaluation grid. If None, the - density is evaluated at the original datapoints. + Number of points in the evaluation grid. If None, the density is + evaluated at the original datapoints. cut : float - Factor, multiplied by the smoothing bandwidth, that determines how far + Factor, multiplied by the kernel bandwidth, that determines how far the evaluation grid extends past the extreme datapoints. When set to 0, - the curve is trunacted at the data limits. + the curve is truncated at the data limits. cumulative : bool If True, estimate a cumulative distribution function. Requires scipy. Notes ----- The *bandwidth*, or standard deviation of the smoothing kernel, is an - important parameter. Misspecification of the bandwidth can produce a - distorted representation of the data. Much like the choice of bin width in a - histogram, an over-smoothed curve can erase true features of a distribution, - while an under-smoothed curve can create false features out of random - variability. The rule-of-thumb that sets the default bandwidth works best - when the true distribution is smooth, unimodal, and roughly bell-shaped. It - is always a good idea to check the default behavior by using `bw_adjust` to - increase or decrease the amount of smoothing. + important parameter. Much like histogram bin width, using the wrong + bandwidth can produce a distorted representation. Over-smoothing can erase + true features, while under-smoothing can create false ones. The default + uses a rule-of-thumb that works best for distributions that are roughly + bell-shaped. It is a good idea to check the default by varying `bw_adjust`. Because the smoothing is performed with a Gaussian kernel, the estimated - density curve can extend to values that do not make sense for a particular - dataset. For example, the curve may be drawn over negative values when - smoothing data that are naturally positive. The `cut` parameter can be used - to control the extent of the curve, but datasets that have many observations - close to a natural boundary may be better served by a different transform. + density curve can extend to values that may not make sense. For example, the + curve may be drawn over negative values when data that are naturally + positive. The `cut` parameter can be used to control the evaluation range, + but datasets that have many observations close to a natural boundary may be + better served by a different method. - Similar considerations apply when a dataset is naturally discrete or "spiky" + Similar distortions may arise when a dataset is naturally discrete or "spiky" (containing many repeated observations of the same value). KDEs will always - produce a smooth curve, which would be misleading in these situations. + produce a smooth curve, which could be misleading. The units on the density axis are a common source of confusion. While kernel density estimation produces a probability distribution, the height of the curve @@ -79,6 +76,8 @@ class KDE(Stat): that the integral over all possible values is 1, meaning that the scale of the density axis depends on the data values. + If scipy is installed, its cython-accelerated implementation will be used. + Examples -------- .. include:: ../docstrings/objects.KDE.rst @@ -106,7 +105,7 @@ def _check_var_list_or_boolean(self, param: str, grouping_vars: Any) -> None: ): param_name = f"{self.__class__.__name__}.{param}" raise TypeError(f"{param_name} must be a boolean or list of strings.") - self._check_grouping_vars(param, grouping_vars) + self._check_grouping_vars(param, grouping_vars, stacklevel=3) def _fit(self, data: DataFrame, orient: str) -> gaussian_kde: """Fit and return a KDE object.""" @@ -176,6 +175,7 @@ def __call__( if "weight" not in data: data = data.assign(weight=1) + data = data.dropna(subset=[orient, "weight"]) # Transform each group separately grouping_vars = [str(v) for v in data if v in groupby.order] @@ -208,8 +208,7 @@ def __call__( on=norm_vars, ) - value_var = {"x": "y", "y": "x"}[orient] res["density"] *= res.eval("weight / group_weight") - res[value_var] = res["density"] - - return res + value = {"x": "y", "y": "x"}[orient] + res[value] = res["density"] + return res.drop(["weight", "group_weight"], axis=1) diff --git a/tests/_stats/test_counting.py b/tests/_stats/test_counting.py index 44a0ae752b..7656654492 100644 --- a/tests/_stats/test_counting.py +++ b/tests/_stats/test_counting.py @@ -206,7 +206,7 @@ def test_common_norm_subset(self, long_df, triple_args): def test_common_norm_warning(self, long_df, triple_args): h = Hist(common_norm=["b"]) - with pytest.warns(UserWarning, match="Undefined variable(s)"): + with pytest.warns(UserWarning, match=r"Undefined variable\(s\)"): h(long_df, *triple_args) def test_common_bins_default(self, long_df, triple_args): @@ -239,7 +239,7 @@ def test_common_bins_subset(self, long_df, triple_args): def test_common_bins_warning(self, long_df, triple_args): h = Hist(common_bins=["b"]) - with pytest.warns(UserWarning, match="Undefined variable(s)"): + with pytest.warns(UserWarning, match=r"Undefined variable\(s\)"): h(long_df, *triple_args) def test_histogram_single(self, long_df, single_args): diff --git a/tests/_stats/test_density.py b/tests/_stats/test_density.py index dc2d217c1d..ead182acdf 100644 --- a/tests/_stats/test_density.py +++ b/tests/_stats/test_density.py @@ -39,7 +39,7 @@ def test_columns(self, df, ori): gb = self.get_groupby(df, ori) res = KDE()(df, gb, ori, {}) other = {"x": "y", "y": "x"}[ori] - expected = [ori, "alpha", "weight", "density", "group_weight", other] + expected = [ori, "alpha", "density", other] assert list(res.columns) == expected @pytest.mark.parametrize("gridsize", [20, 30, None]) @@ -101,6 +101,37 @@ def test_common_norm(self, df, common_norm): else: assert_array_almost_equal(areas, [1, 1], decimal=3) + def test_common_norm_variables(self, df): + + ori = "y" + df = df[[ori, "alpha", "color"]] + gb = self.get_groupby(df, ori) + res = KDE(common_norm=["alpha"])(df, gb, ori, {}) + + def integrate_by_color_and_sum(x): + return ( + x.groupby("color") + .apply(lambda y: self.integrate(y["density"], y[ori])) + .sum() + ) + + areas = res.groupby("alpha").apply(integrate_by_color_and_sum) + assert_array_almost_equal(areas, [1, 1], decimal=3) + + @pytest.mark.parametrize("param", ["norm", "grid"]) + def test_common_input_checks(self, df, param): + + ori = "y" + df = df[[ori, "alpha"]] + gb = self.get_groupby(df, ori) + msg = rf"Undefined variable\(s\) passed for KDE.common_{param}" + with pytest.warns(UserWarning, match=msg): + KDE(**{f"common_{param}": ["color", "alpha"]})(df, gb, ori, {}) + + msg = f"KDE.common_{param} must be a boolean or list of strings" + with pytest.raises(TypeError, match=msg): + KDE(**{f"common_{param}": "alpha"})(df, gb, ori, {}) + def test_bw_adjust(self, df): ori = "y" @@ -132,7 +163,6 @@ def test_cumulative(self, df, common_norm): ori = "y" df = df[[ori, "alpha"]] gb = self.get_groupby(df, ori) - res = KDE(cumulative=True, common_norm=common_norm)(df, gb, ori, {}) for _, group_res in res.groupby("alpha"): @@ -146,3 +176,27 @@ def test_cumulative_requires_scipy(self): err = "Cumulative KDE evaluation requires scipy" with pytest.raises(RuntimeError, match=err): KDE(cumulative=True) + + @pytest.mark.parametrize("vals", [[], [1], [1] * 5, [1929245168.06679] * 18]) + def test_singular(self, df, vals): + + df1 = pd.DataFrame({"y": vals, "alpha": ["z"] * len(vals)}) + gb = self.get_groupby(df1, "y") + res = KDE()(df1, gb, "y", {}) + assert res.empty + + df2 = pd.concat([df[["y", "alpha"]], df1], ignore_index=True) + gb = self.get_groupby(df2, "y") + res = KDE()(df2, gb, "y", {}) + assert set(res["alpha"]) == set(df["alpha"]) + + @pytest.mark.parametrize("col", ["y", "weight"]) + def test_missing(self, df, col): + + val, ori = "xy" + df["weight"] = 1 + df = df[[ori, "weight"]] + df.loc[:4, col] = np.nan + gb = self.get_groupby(df, ori) + res = KDE()(df, gb, ori, {}) + assert self.integrate(res[val], res[ori]) == pytest.approx(1, abs=1e-3) From 13b1f55580eb7dcd5610802a54962ca239cca3fa Mon Sep 17 00:00:00 2001 From: Michael Waskom Date: Mon, 24 Oct 2022 09:08:20 -0400 Subject: [PATCH 4/6] Fix import --- seaborn/_stats/density.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/seaborn/_stats/density.py b/seaborn/_stats/density.py index 733601c2e9..4af8596aee 100644 --- a/seaborn/_stats/density.py +++ b/seaborn/_stats/density.py @@ -10,7 +10,7 @@ from scipy.stats import gaussian_kde _no_scipy = False except ImportError: - from .external.kde import gaussian_kde + from seaborn.external.kde import gaussian_kde _no_scipy = True from seaborn._core.groupby import GroupBy From b09d2acd6421bc0d2eed34f627f50383daba1b5a Mon Sep 17 00:00:00 2001 From: Michael Waskom Date: Mon, 24 Oct 2022 18:43:26 -0400 Subject: [PATCH 5/6] Add KDE API examples --- doc/_docstrings/objects.KDE.ipynb | 270 ++++++++++++++++++++++++++++++ seaborn/_stats/density.py | 4 +- 2 files changed, 272 insertions(+), 2 deletions(-) create mode 100644 doc/_docstrings/objects.KDE.ipynb diff --git a/doc/_docstrings/objects.KDE.ipynb b/doc/_docstrings/objects.KDE.ipynb new file mode 100644 index 0000000000..851d882257 --- /dev/null +++ b/doc/_docstrings/objects.KDE.ipynb @@ -0,0 +1,270 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "dcc1ae12-bba4-4de9-af8d-543b3d65b42b", + "metadata": { + "tags": [ + "hide" + ] + }, + "outputs": [], + "source": [ + "import seaborn.objects as so\n", + "from seaborn import load_dataset\n", + "penguins = load_dataset(\"penguins\")" + ] + }, + { + "cell_type": "raw", + "id": "1042b991-1471-43bd-934c-43caae3cb2fa", + "metadata": {}, + "source": [ + "This stat estimates transforms observations into a smooth function representing the estimated density:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2406e2aa-7f0f-4a51-af59-4cef827d28d8", + "metadata": {}, + "outputs": [], + "source": [ + "p = so.Plot(penguins, x=\"flipper_length_mm\")\n", + "p.add(so.Area(), so.KDE())" + ] + }, + { + "cell_type": "raw", + "id": "44515f21-683b-420f-967b-4c7568c907d7", + "metadata": {}, + "source": [ + "Adjust the smoothing bandwidth to see more or fewer details:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d4e6ba5b-4dd2-4210-8cf0-de057dc71e2a", + "metadata": {}, + "outputs": [], + "source": [ + "p.add(so.Area(), so.KDE(bw_adjust=0.25))" + ] + }, + { + "cell_type": "raw", + "id": "fd665fe1-a5e4-4742-adc9-e40615d57d08", + "metadata": {}, + "source": [ + "The curve will extend beyond observed values in the dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4cda1cb8-f663-4f94-aa24-6f1727a41031", + "metadata": {}, + "outputs": [], + "source": [ + "p2 = p.add(so.Bars(alpha=.3), so.Hist(\"density\"))\n", + "p2.add(so.Line(), so.KDE())" + ] + }, + { + "cell_type": "raw", + "id": "75235825-d522-4562-aacc-9b7413eabf5d", + "metadata": {}, + "source": [ + "Control the range of the density curve relative to the observations using `cut`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a7a9275e-9889-437d-bdc5-18653d2c92ef", + "metadata": {}, + "outputs": [], + "source": [ + "p2.add(so.Line(), so.KDE(cut=0))" + ] + }, + { + "cell_type": "raw", + "id": "6a885eeb-81ba-47c6-8402-1bef40544fd1", + "metadata": {}, + "source": [ + "When observations are assigned to the `y` variable, the density will be shown for `x`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "38b3a0fb-54ff-493a-bd64-f83a12365723", + "metadata": {}, + "outputs": [], + "source": [ + "so.Plot(penguins, y=\"flipper_length_mm\").add(so.Area(), so.KDE())" + ] + }, + { + "cell_type": "raw", + "id": "59996340-168e-479f-a0c6-c7e1fcab0fb0", + "metadata": {}, + "source": [ + "Use `gridsize` to increase or decrease the resolution of the grid where the density is evaluated:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "23715820-7df9-40ba-9e74-f11564704dd0", + "metadata": {}, + "outputs": [], + "source": [ + "p.add(so.Dots(), so.KDE(gridsize=100))" + ] + }, + { + "cell_type": "raw", + "id": "4c9b6492-98c8-45ab-9f53-681cde2f767a", + "metadata": {}, + "source": [ + "Or pass `None` to evaluate the density at the original datapoints:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4e1b6810-5c28-43aa-aa61-652521299b51", + "metadata": {}, + "outputs": [], + "source": [ + "p.add(so.Dots(), so.KDE(gridsize=None))" + ] + }, + { + "cell_type": "raw", + "id": "0970a56b-0cba-4c40-bb1b-b8e71739df5c", + "metadata": {}, + "source": [ + "Other variables will define groups for the estimation:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f0ce0b6-5742-4bc0-9ac3-abedde923684", + "metadata": {}, + "outputs": [], + "source": [ + "p.add(so.Area(), so.KDE(), color=\"species\")" + ] + }, + { + "cell_type": "raw", + "id": "22204fcd-4b25-46e5-a170-02b1419c23d5", + "metadata": {}, + "source": [ + "By default, the density is normalized across all groups (i.e., the joint density is shown); pass `common_norm=False` to show conditional densities:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6ad56958-dc45-4632-94d1-23039ad3ec58", + "metadata": {}, + "outputs": [], + "source": [ + "p.add(so.Area(), so.KDE(common_norm=False), color=\"species\")" + ] + }, + { + "cell_type": "raw", + "id": "b1627197-85d1-4476-b4ae-3e93044ee988", + "metadata": {}, + "source": [ + "Or pass a list of variables to condition on:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "58f63734-5afd-4d90-bbfb-fc39c8d1981f", + "metadata": {}, + "outputs": [], + "source": [ + "(\n", + " p.facet(\"sex\")\n", + " .add(so.Area(), so.KDE(common_norm=[\"col\"]), color=\"species\")\n", + ")" + ] + }, + { + "cell_type": "raw", + "id": "2b7e018e-1374-4939-909c-e95f5ffd086e", + "metadata": {}, + "source": [ + "This stat can be combined with other transforms, such as :class:`Stack` (when `common_grid=True`):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "96e5b2d0-c7e2-47df-91f1-7f9ec0bb08a9", + "metadata": {}, + "outputs": [], + "source": [ + "p.add(so.Area(), so.KDE(), so.Stack(), color=\"sex\")" + ] + }, + { + "cell_type": "raw", + "id": "8500ff86-0b1f-4831-954b-08b6df690387", + "metadata": {}, + "source": [ + "Set `cumulative=True` to integrate the density:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "26bb736e-7cfd-421e-b80d-42fa450e88c0", + "metadata": {}, + "outputs": [], + "source": [ + "p.add(so.Line(), so.KDE(cumulative=True))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e8bfd9d2-ad60-4971-aa7f-71a285f44a20", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "py310", + "language": "python", + "name": "py310" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/seaborn/_stats/density.py b/seaborn/_stats/density.py index 4af8596aee..e461387651 100644 --- a/seaborn/_stats/density.py +++ b/seaborn/_stats/density.py @@ -27,12 +27,12 @@ class KDE(Stat): ---------- bw_adjust : float Factor that multiplicatively scales the value chosen using - ``bw_method``. Increasing will make the curve smoother. See Notes. + `bw_method`. Increasing will make the curve smoother. See Notes. bw_method : string, scalar, or callable Method for determining the smoothing bandwidth to use. Passed directly to :class:`scipy.stats.gaussian_kde`; see there for options. common_norm : bool or list of variables - If `True`, normalize so that the sum of all curves sums to 1. + If `True`, normalize so that the areas of all curves sums to 1. If `False`, normalize each curve independently. If a list, defines variable(s) to group by and normalize within. common_grid : bool or list of variables From 21b2cee52ab66e2c2f396a2ccbf22c634cdc4303 Mon Sep 17 00:00:00 2001 From: Michael Waskom Date: Mon, 24 Oct 2022 18:45:01 -0400 Subject: [PATCH 6/6] Update release notes --- doc/whatsnew/v0.12.2.rst | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 doc/whatsnew/v0.12.2.rst diff --git a/doc/whatsnew/v0.12.2.rst b/doc/whatsnew/v0.12.2.rst new file mode 100644 index 0000000000..dc29ab7402 --- /dev/null +++ b/doc/whatsnew/v0.12.2.rst @@ -0,0 +1,5 @@ + +v0.12.2 (Unreleased) +-------------------- + +- Added the :class:`objects.KDE` stat (:pr:`3111`).