Skip to content

Commit

Permalink
fix: remove computation from missing data plots (#1294)
Browse files Browse the repository at this point in the history
  • Loading branch information
alexbarros authored Mar 27, 2023
1 parent 27ef702 commit eb80c72
Show file tree
Hide file tree
Showing 4 changed files with 109 additions and 61 deletions.
28 changes: 25 additions & 3 deletions src/ydata_profiling/model/pandas/missing_pandas.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import numpy as np
import pandas as pd

from ydata_profiling.config import Settings
Expand All @@ -11,14 +12,35 @@

@missing_bar.register
def pandas_missing_bar(config: Settings, df: pd.DataFrame) -> str:
return plot_missing_bar(config, df)
notnull_counts = len(df) - df.isnull().sum()
return plot_missing_bar(
config,
notnull_counts=notnull_counts,
nrows=len(df),
columns=list(df.columns),
)


@missing_matrix.register
def pandas_missing_matrix(config: Settings, df: pd.DataFrame) -> str:
return plot_missing_matrix(config, df)
return plot_missing_matrix(
config,
columns=list(df.columns),
notnull=df.notnull().values,
nrows=len(df),
)


@missing_heatmap.register
def pandas_missing_heatmap(config: Settings, df: pd.DataFrame) -> str:
return plot_missing_heatmap(config, df)
# Remove completely filled or completely empty variables.
columns = [i for i, n in enumerate(np.var(df.isnull(), axis="rows")) if n > 0]
df = df.iloc[:, columns]

# Create and mask the correlation matrix. Construct the base heatmap.
corr_mat = df.isnull().corr()
mask = np.zeros_like(corr_mat)
mask[np.triu_indices_from(mask)] = True
return plot_missing_heatmap(
config, corr_mat=corr_mat, mask=mask, columns=list(df.columns)
)
28 changes: 22 additions & 6 deletions src/ydata_profiling/model/spark/missing_spark.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from typing import Any, List, Optional

import numpy as np
from pyspark.sql import DataFrame

from ydata_profiling.config import Settings
Expand Down Expand Up @@ -67,18 +68,33 @@ def spark_missing_bar(config: Settings, df: DataFrame) -> str:
)

return plot_missing_bar(
config,
MissingnoBarSparkPatch(
df=data_nan_counts, columns=df.columns, original_df_size=df.count()
),
config, notnull_counts=data_nan_counts, columns=df.columns, nrows=df.count()
)


@missing_matrix.register
def spark_missing_matrix(config: Settings, df: DataFrame) -> str:
return plot_missing_matrix(config, MissingnoBarSparkPatch(df))
df = MissingnoBarSparkPatch(df, columns=df.columns, original_df_size=df.count())
return plot_missing_matrix(
config,
columns=df.columns,
notnull=df.notnull().values,
nrows=len(df),
)


@missing_heatmap.register
def spark_missing_heatmap(config: Settings, df: DataFrame) -> str:
return plot_missing_heatmap(config, MissingnoBarSparkPatch(df))
df = MissingnoBarSparkPatch(df, columns=df.columns, original_df_size=df.count())

# Remove completely filled or completely empty variables.
columns = [i for i, n in enumerate(np.var(df.isnull(), axis="rows")) if n > 0]
df = df.iloc[:, columns]

# Create and mask the correlation matrix. Construct the base heatmap.
corr_mat = df.isnull().corr()
mask = np.zeros_like(corr_mat)
mask[np.triu_indices_from(mask)] = True
return plot_missing_heatmap(
config, corr_mat=corr_mat, mask=mask, columns=list(df.columns)
)
63 changes: 40 additions & 23 deletions src/ydata_profiling/visualisation/missing.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Plotting functions for the missing values diagrams"""
import pandas as pd
from typing import Any, List

from matplotlib import pyplot as plt

from ydata_profiling.config import Settings
Expand All @@ -12,22 +13,22 @@
from ydata_profiling.visualisation.utils import hex_to_rgb, plot_360_n0sc0pe


def get_font_size(data: pd.DataFrame) -> float:
def get_font_size(columns: List[str]) -> float:
"""Calculate font size based on number of columns
Args:
data: DataFrame
columns: List of column names.
Returns:
Font size for missing values plots.
"""
max_label_length = max(len(label) for label in data.columns)
max_label_length = max(len(label) for label in columns)

if len(data.columns) < 20:
if len(columns) < 20:
font_size = 13.0
elif 20 <= len(data.columns) < 40:
elif 20 <= len(columns) < 40:
font_size = 12.0
elif 40 <= len(data.columns) < 60:
elif 40 <= len(columns) < 60:
font_size = 10.0
else:
font_size = 8.0
Expand All @@ -37,21 +38,27 @@ def get_font_size(data: pd.DataFrame) -> float:


@manage_matplotlib_context()
def plot_missing_matrix(config: Settings, data: pd.DataFrame) -> str:
def plot_missing_matrix(
config: Settings, notnull: Any, columns: List[str], nrows: int
) -> str:
"""Generate missing values matrix plot
Args:
config: report Settings object
data: Pandas DataFrame to generate missing values matrix from.
notnull: Missing data indicator matrix.
columns: List of column names.
nrows: Number of rows in the dataframe.
Returns:
The resulting missing values matrix encoded as a string.
"""

missing_matrix(
data,
notnull=notnull,
height=nrows,
columns=columns,
figsize=(10, 4),
fontsize=get_font_size(data) / 20 * 16,
fontsize=get_font_size(columns) / 20 * 16,
color=hex_to_rgb(config.html.style.primary_colors[0]),
labels=config.plot.missing.force_labels,
)
Expand All @@ -60,20 +67,25 @@ def plot_missing_matrix(config: Settings, data: pd.DataFrame) -> str:


@manage_matplotlib_context()
def plot_missing_bar(config: Settings, data: pd.DataFrame) -> str:
def plot_missing_bar(
config: Settings, notnull_counts: list, nrows: int, columns: List[str]
) -> str:
"""Generate missing values bar plot.
Args:
config: report Settings object
data: Pandas DataFrame to generate missing values bar plot from.
notnull_counts: Number of nonnull values per column.
nrows: Number of rows in the dataframe.
columns: List of column names.
Returns:
The resulting missing values bar plot encoded as a string.
"""
missing_bar(
data,
notnull_counts=notnull_counts,
nrows=nrows,
figsize=(10, 5),
fontsize=get_font_size(data),
fontsize=get_font_size(columns),
color=hex_to_rgb(config.html.style.primary_colors[0]),
labels=config.plot.missing.force_labels,
)
Expand All @@ -85,35 +97,40 @@ def plot_missing_bar(config: Settings, data: pd.DataFrame) -> str:


@manage_matplotlib_context()
def plot_missing_heatmap(config: Settings, data: pd.DataFrame) -> str:
def plot_missing_heatmap(
config: Settings, corr_mat: Any, mask: Any, columns: List[str]
) -> str:
"""Generate missing values heatmap plot.
Args:
config: report Settings object
data: Pandas DataFrame to generate missing values heatmap plot from.
corr_mat: Correlation matrix.
mask: Upper-triangle mask.
columns: List of column names.
Returns:
The resulting missing values heatmap plot encoded as a string.
"""

height = 4
if len(data.columns) > 10:
height += int((len(data.columns) - 10) / 5)
if len(columns) > 10:
height += int((len(columns) - 10) / 5)
height = min(height, 10)

font_size = get_font_size(data)
if len(data.columns) > 40:
font_size = get_font_size(columns)
if len(columns) > 40:
font_size /= 1.4

missing_heatmap(
data,
corr_mat=corr_mat,
mask=mask,
figsize=(10, height),
fontsize=font_size,
cmap=config.plot.missing.cmap,
labels=config.plot.missing.force_labels,
)

if len(data.columns) > 40:
if len(columns) > 40:
plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.3)
else:
plt.subplots_adjust(left=0.2, right=0.9, top=0.8, bottom=0.3)
Expand Down
51 changes: 22 additions & 29 deletions src/ydata_profiling/visualisation/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -761,7 +761,8 @@ def _set_visibility(


def missing_bar(
data: pd.DataFrame,
notnull_counts: pd.Series,
nrows: int,
figsize: Tuple[float, float] = (25, 10),
fontsize: float = 16,
labels: bool = True,
Expand All @@ -774,7 +775,8 @@ def missing_bar(
Inspired by https://github.com/ResidentMario/missingno
Args:
data: The input DataFrame.
notnull_counts: Number of nonnull values per column.
nrows: Number of rows in the dataframe.
figsize: The size of the figure to display.
fontsize: The figure's font size. This defaults to 16.
labels: Whether or not to display the column names. Would need to be turned off on particularly large
Expand All @@ -784,12 +786,10 @@ def missing_bar(
Returns:
The plot axis.
"""
null_counts = len(data) - data.isnull().sum()
values = null_counts.values
null_counts = null_counts / len(data)
percentage = notnull_counts / nrows

if len(values) <= 50:
ax0 = null_counts.plot.bar(figsize=figsize, fontsize=fontsize, color=color)
if len(notnull_counts) <= 50:
ax0 = percentage.plot.bar(figsize=figsize, fontsize=fontsize, color=color)
ax0.set_xticklabels(
ax0.get_xticklabels(),
ha="right",
Expand All @@ -801,17 +801,17 @@ def missing_bar(
ax1.set_xticks(ax0.get_xticks())
ax1.set_xlim(ax0.get_xlim())
ax1.set_xticklabels(
values, ha="left", fontsize=fontsize, rotation=label_rotation
notnull_counts, ha="left", fontsize=fontsize, rotation=label_rotation
)
else:
ax0 = null_counts.plot.barh(figsize=figsize, fontsize=fontsize, color=color)
ax0 = percentage.plot.barh(figsize=figsize, fontsize=fontsize, color=color)
ylabels = ax0.get_yticklabels() if labels else []
ax0.set_yticklabels(ylabels, fontsize=fontsize)

ax1 = ax0.twinx()
ax1.set_yticks(ax0.get_yticks())
ax1.set_ylim(ax0.get_ylim())
ax1.set_yticklabels(values, fontsize=fontsize)
ax1.set_yticklabels(notnull_counts, fontsize=fontsize)

for ax in [ax0, ax1]:
ax = _set_visibility(ax)
Expand All @@ -820,7 +820,9 @@ def missing_bar(


def missing_matrix(
data: pd.DataFrame,
notnull: Any,
columns: List[str],
height: int,
figsize: Tuple[float, float] = (25, 10),
color: Tuple[float, ...] = (0.41, 0.41, 0.41),
fontsize: float = 16,
Expand All @@ -833,7 +835,9 @@ def missing_matrix(
Inspired by https://github.com/ResidentMario/missingno
Args:
data: The input DataFrame.
notnull: Missing data indicator matrix.
columns: List of column names.
height: Number of rows in the dataframe.
figsize: The size of the figure to display.
fontsize: The figure's font size. Defaults to 16.
labels: Whether or not to display the column names when there is more than 50 columns.
Expand All @@ -842,9 +846,7 @@ def missing_matrix(
Returns:
The plot axis.
"""
height, width = data.shape

notnull = data.notnull().values
width = len(columns)
missing_grid = np.zeros((height, width, 3), dtype=np.float32)

missing_grid[notnull] = color
Expand All @@ -860,9 +862,7 @@ def missing_matrix(

ha = "left"
ax.set_xticks(list(range(0, width)))
ax.set_xticklabels(
list(data.columns), rotation=label_rotation, ha=ha, fontsize=fontsize
)
ax.set_xticklabels(columns, rotation=label_rotation, ha=ha, fontsize=fontsize)
ax.set_yticks([0, height - 1])
ax.set_yticklabels([1, height], fontsize=fontsize)

Expand All @@ -878,7 +878,8 @@ def missing_matrix(


def missing_heatmap(
data: pd.DataFrame,
corr_mat: Any,
mask: Any,
figsize: Tuple[float, float] = (20, 12),
fontsize: float = 16,
labels: bool = True,
Expand All @@ -895,7 +896,8 @@ def missing_heatmap(
Inspired by https://github.com/ResidentMario/missingno
Args:
data: The input DataFrame.
corr_mat: correlation matrix.
mask: Upper-triangle mask.
figsize: The size of the figure to display. Defaults to (20, 12).
fontsize: The figure's font size.
labels: Whether or not to label each matrix entry with its correlation (default is True).
Expand All @@ -906,15 +908,6 @@ def missing_heatmap(
The plot axis.
"""
_, ax = plt.subplots(1, 1, figsize=figsize)

# Remove completely filled or completely empty variables.
columns = [i for i, n in enumerate(np.var(data.isnull(), axis="rows")) if n > 0]
data = data.iloc[:, columns]

# Create and mask the correlation matrix. Construct the base heatmap.
corr_mat = data.isnull().corr()
mask = np.zeros_like(corr_mat)
mask[np.triu_indices_from(mask)] = True
norm_args = {"vmin": -1, "vmax": 1} if normalized_cmap else {}

if labels:
Expand Down

0 comments on commit eb80c72

Please sign in to comment.