diff --git a/examples/features/correlation_auto_example.py b/examples/features/correlation_auto_example.py
new file mode 100644
index 000000000..f4e53d777
--- /dev/null
+++ b/examples/features/correlation_auto_example.py
@@ -0,0 +1,36 @@
+"""Example: tune the number of bins used by the "auto" correlation of a profile report.
+
+The "Auto" correlation is an interpretable pairwise column metric of the following mapping:
+
+- Variable_type-Variable_type : Method, **Range**
+- Categorical-Categorical : Cramér's V, **[0,1]**
+- Numerical-Categorical : Cramér's V, **[0,1]** (using a discretized numerical column)
+- Numerical-Numerical : Spearman's Rho, **[-1,1]**
+"""
+from pathlib import Path
+
+import pandas as pd
+
+from pandas_profiling import ProfileReport
+from pandas_profiling.utils.cache import cache_zipped_file
+
+# Download the UCI Bank Marketing Dataset, as seen in examples/bank_marketing_data/banking_data.py
+file_name = cache_zipped_file(
+    "bank-full.csv",
+    "https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank.zip",
+)
+
+df = pd.read_csv(file_name, sep=";")
+
+profile = ProfileReport(
+    df, title="Profile Report of the UCI Bank Marketing Dataset", explorative=True
+)
+
+
+# The simplest way to change the number of bins is either through your script or notebook.
+# This changes the granularity of the association measure for Numerical-Categorical column pairs.
+profile.config.correlations["auto"].n_bins = 8
+
+
+# The 'auto' correlation matrix is displayed with the other correlation matrices in the report.
+profile.to_file(Path("uci_bank_marketing_report.html")) diff --git a/src/pandas_profiling/report/structure/correlations.py b/src/pandas_profiling/report/structure/correlations.py index 25e764ae2..279e10c43 100644 --- a/src/pandas_profiling/report/structure/correlations.py +++ b/src/pandas_profiling/report/structure/correlations.py @@ -55,12 +55,18 @@ def get_correlation_items(config: Settings, summary: dict) -> Optional[Renderabl The empirical estimators used for Cramér's V have been proved to be biased, even for large samples. We use a bias-corrected measure that has been proposed by Bergsma in 2013 that can be found here.""" - auto_description = """The auto setting is an easily interpretable pairwise column metric of the following mapping: - vartype-vartype : method, - categorical-categorical : Cramer's V, - numerical-categorical : Cramer's V (using a discretized numerical column), - numerical-numerical : Spearman's ρ. - This configuration uses the best suitable for each pair of columns.""" + auto_description = """ + The auto setting is an interpretable pairwise + column metric of the following mapping: + + The number of bins used in the discretization for the Numerical-Categorical column pair can be changed + using config.correlations["auto"].n_bins. The number of bins affects the granularity of the association you wish to measure.

+ This configuration uses the recommended metric for each pair of columns.""" key_to_data = { "pearson": (-1, "Pearson's r", pearson_description), @@ -68,7 +74,7 @@ def get_correlation_items(config: Settings, summary: dict) -> Optional[Renderabl "kendall": (-1, "Kendall's τ", kendall_description), "phi_k": (0, "Phik (φk)", phi_k_description), "cramers": (0, "Cramér's V (φc)", cramers_description), - "auto": (0, "Auto", auto_description), + "auto": (-1, "Auto", auto_description), } image_format = config.plot.image_format