fix: improve description and correct plot for ‘auto’ correlation (#1119)

* fix: correct plot and formatting for ‘auto’ correlation
* fix: create example ‘auto’ correlation notebook
* fix: add example for 'auto' correlation in python script
ydataai · Nov 22, 2022 · 2617b92 · 2617b92
1 parent d19affe
commit 2617b92
Show file tree

Hide file tree

Showing 2 changed files with 49 additions and 7 deletions.
diff --git a/examples/features/correlation_auto_example.py b/examples/features/correlation_auto_example.py
@@ -0,0 +1,36 @@
+from pathlib import Path
+import pandas as pd
+
+from pandas_profiling import ProfileReport
+from pandas_profiling.utils.cache import cache_zipped_file
+
+"""
+The "Auto" correlation is an interpretable pairwise column metric of the following mapping:
+
+- Variable_type-Variable_type : Method, **Range** 
+- Categorical-Categorical     : Cramer's V, **[0,1]**
+- Numerical-Categorical       : Cramer's V, **[0,1]** (using a discretized numerical column)
+- Numerical-Numerical         : Spearman's Rho, **[-1,1]**
+
+"""
+
+# Download the UCI Bank Marketing Dataset, as seen in examples/bank_marketing_data/banking_data.py
+file_name = cache_zipped_file(
+    "bank-full.csv",
+    "https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank.zip",
+)
+
+df = pd.read_csv(file_name, sep=";")
+
+profile = ProfileReport(
+    df, title="Profile Report of the UCI Bank Marketing Dataset", explorative=True
+)
+
+
+# The simplest way to change the number of bins is to set it directly in your script or notebook.
+# This changes the granularity of the association measure for Numerical-Categorical column pairs.
+profile.config.correlations["auto"].n_bins = 8
+
+
+# The 'auto' correlation matrix is displayed with the other correlation matrices in the report.
+profile.to_file(Path("uci_bank_marketing_report.html"))
diff --git a/src/pandas_profiling/report/structure/correlations.py b/src/pandas_profiling/report/structure/correlations.py
@@ -55,20 +55,26 @@ def get_correlation_items(config: Settings, summary: dict) -> Optional[Renderabl
     The empirical estimators used for Cramér's V have been proved to be biased, even for large samples.
     We use a bias-corrected measure that has been proposed by Bergsma in 2013 that can be found <a href='http://stats.lse.ac.uk/bergsma/pdf/cramerV3.pdf'>here</a>."""
 
-    auto_description = """The auto setting is an easily interpretable pairwise column metric of the following mapping:
-                        vartype-vartype         : method, 
-                        categorical-categorical : Cramer's V, 
-                        numerical-categorical   : Cramer's V (using a discretized numerical column), 
-                        numerical-numerical     : Spearman's ρ. 
-                        This configuration uses the best suitable for each pair of columns."""
+    auto_description = """
+                            The auto setting is an interpretable pairwise 
+                                column metric of the following mapping:
+                        <ul>
+                            <li>  Variable_type-Variable_type : Method, <strong> Range </strong> <br /> </li> 
+                            <li> Categorical-Categorical     : Cramer's V, <strong> [0,1] </strong> <br /> </li> 
+                            <li> Numerical-Categorical       : Cramer's V, <strong> [0,1] </strong> (using a discretized numerical column) <br /> </li> 
+                            <li> Numerical-Numerical         : Spearman's ρ, <strong> [-1,1] </strong> <br /> </li> 
+                        </ul>
+                        The number of bins used in the discretization for the Numerical-Categorical column pair can be changed
+                        using config.correlations["auto"].n_bins. The number of bins affects the granularity of the association you wish to measure. <br><br>
+                        This configuration uses the recommended metric for each pair of columns."""
 
     key_to_data = {
         "pearson": (-1, "Pearson's r", pearson_description),
         "spearman": (-1, "Spearman's ρ", spearman_description),
         "kendall": (-1, "Kendall's τ", kendall_description),
         "phi_k": (0, "Phik (φk)", phi_k_description),
         "cramers": (0, "Cramér's V (φc)", cramers_description),
-        "auto": (0, "Auto", auto_description),
+        "auto": (-1, "Auto", auto_description),
     }
 
     image_format = config.plot.image_format