From 317eddb7390c9b3b836108b6ffa65110b6163c33 Mon Sep 17 00:00:00 2001 From: Xinrong Meng Date: Thu, 12 Sep 2024 10:57:31 -0700 Subject: [PATCH] [SPARK-49606][PS][DOCS] Improve documentation of Pandas on Spark plotting API ### What changes were proposed in this pull request? Improve documentation of Pandas on Spark plotting API following pandas 2.2 (stable), see https://pandas.pydata.org/docs/reference/frame.html. ### Why are the changes needed? Better documentation and parity with pandas. ### Does this PR introduce _any_ user-facing change? Doc changes only. ### How was this patch tested? Existing tests. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #48083 from xinrong-meng/doc_impr. Authored-by: Xinrong Meng Signed-off-by: Dongjoon Hyun --- python/pyspark/pandas/plot/core.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/python/pyspark/pandas/plot/core.py b/python/pyspark/pandas/plot/core.py index 453b17834020e..067c7db664dee 100644 --- a/python/pyspark/pandas/plot/core.py +++ b/python/pyspark/pandas/plot/core.py @@ -540,7 +540,7 @@ def line(self, x=None, y=None, **kwargs): """ Plot DataFrame/Series as lines. - This function is useful to plot lines using Series's values + This function is useful to plot lines using DataFrame’s values as coordinates. Parameters @@ -606,6 +606,12 @@ def bar(self, x=None, y=None, **kwds): """ Vertical bar plot. + A bar plot is a plot that presents categorical data with rectangular + bars with lengths proportional to the values that they represent. A + bar plot shows comparisons among discrete categories. One axis of the + plot shows the specific categories being compared, and the other axis + represents a measured value. + Parameters ---------- x : label or position, optional @@ -797,7 +803,17 @@ def barh(self, x=None, y=None, **kwargs): def box(self, **kwds): """ - Make a box plot of the Series columns. + Make a box plot of the DataFrame columns. 
+ + A box plot is a method for graphically depicting groups of numerical data through + their quartiles. The box extends from the Q1 to Q3 quartile values of the data, + with a line at the median (Q2). The whiskers extend from the edges of the box to show + the range of the data. The position of the whiskers is set by default to + 1.5*IQR (IQR = Q3 - Q1) from the edges of the box. Outlier points are those past + the end of the whiskers. + + A consideration when using this chart is that the box and the whiskers can overlap, + which is very common when plotting small sets of data. Parameters ---------- @@ -851,9 +867,11 @@ def box(self, **kwds): def hist(self, bins=10, **kwds): """ Draw one histogram of the DataFrame’s columns. + A `histogram`_ is a representation of the distribution of data. This function calls :meth:`plotting.backend.plot`, on each series in the DataFrame, resulting in one histogram per column. + This is useful when the DataFrame’s Series are in a similar scale. .. _histogram: https://en.wikipedia.org/wiki/Histogram @@ -902,6 +920,10 @@ def kde(self, bw_method=None, ind=None, **kwargs): """ Generate Kernel Density Estimate plot using Gaussian kernels. + In statistics, kernel density estimation (KDE) is a non-parametric way to + estimate the probability density function (PDF) of a random variable. This + function uses Gaussian kernels and includes automatic bandwidth determination. + Parameters ---------- bw_method : scalar