def compute_gap_stats(series: pd.Series) -> dict:
    """Compute interval statistics and locate the gaps of a time series.

    The expected sampling period is estimated from the full index of
    ``series``. Intervals are then measured between consecutive index entries
    whose value is not null, and every interval longer than the period is
    reported as a gap.

    Args:
        series (pd.Series): time series data to analyse.

    Returns:
        A dict with the keys ``period``, ``min``, ``max``, ``mean`` and
        ``std`` (each divided by the base frequency), ``series`` (the input,
        unchanged), ``gaps`` (a list of two-element arrays holding the index
        values on either side of each gap) and, only when the index is a
        ``pd.DatetimeIndex``, ``frequency``.
    """
    is_datetime = isinstance(series.index, pd.DatetimeIndex)

    # Index labels of the non-null observations, addressed by position 0..n-1.
    # Built straight from the index instead of going through
    # ``reset_index()[name]``: that lookup returns the *values* column when
    # the series itself is named "index", and raises when the series and its
    # index share a name.
    gap = pd.Series(series.dropna().index)

    if is_datetime:
        period, frequency = get_period_and_frequency(series.index)
        period = pd.Timedelta(f"{period} {frequency}")
        base_frequency = pd.Timedelta(f"1 {frequency}")
    else:
        period = np.abs(np.diff(series.index)).mean()
        base_frequency = 1

    diff = gap.diff()
    # The first difference is always missing, so every anchor position is
    # >= 1 and ``i - 1`` never wraps around to the end of the series.
    anchors = gap[diff > period].index
    gaps = [gap.iloc[[i - 1, i]].values for i in anchors]

    stats = {
        "period": period / base_frequency,
        "min": diff.min() / base_frequency,
        "max": diff.max() / base_frequency,
        "mean": diff.mean() / base_frequency,
        "std": diff.std() / base_frequency,
        "series": series,
        "gaps": gaps,
    }
    if is_datetime:
        stats["frequency"] = frequency
    return stats
def get_period_and_frequency(index: pd.DatetimeIndex) -> Tuple[float, str]:
    """Return the mean sampling period of ``index`` and the unit it is given in.

    The period is the mean absolute difference between consecutive index
    entries. The unit is the coarsest of days, seconds, microseconds and
    nanoseconds whose component of that mean difference is non-zero.

    Args:
        index: the datetime index to analyse.

    Returns:
        A ``(period, frequency)`` tuple: ``period`` is the mean difference
        expressed as a float number of ``frequency`` units, and ``frequency``
        is one of "Days", "Seconds", "Microseconds" or "Nanoseconds".
    """
    delta = pd.Timedelta(abs(np.diff(index)).mean())

    if delta.days > 0:
        frequency, unit = "Days", pd.Timedelta(days=1)
    elif delta.seconds > 0:
        frequency, unit = "Seconds", pd.Timedelta(seconds=1)
    elif delta.microseconds > 0:
        frequency, unit = "Microseconds", pd.Timedelta(microseconds=1)
    else:
        frequency, unit = "Nanoseconds", pd.Timedelta(nanoseconds=1)

    # Always divide the Timedelta itself by the unit. Dividing the integer
    # ``delta.nanoseconds`` by a Timedelta (as the nanosecond branch used to)
    # is not a supported operation and raises TypeError.
    return delta / unit, frequency
def _render_gap_tab(config: Settings, summary: dict) -> Container:
    """Build the "Gap analysis" tab for a time series variable.

    The tab is a grid container holding a table of interval statistics and
    the gap plot.

    Args:
        config: report settings; ``config.report.precision``,
            ``config.html.style`` and ``config.plot.image_format`` are read.
        summary: variable summary; must contain ``varid`` and a ``gap_stats``
            mapping with the keys ``period``, ``min``, ``max``, ``mean``,
            ``std``, ``series`` and ``gaps`` (``frequency`` is optional).

    Returns:
        The container to be added to the variable's tabs.
    """
    gap_summary = summary["gap_stats"]
    precision = config.report.precision

    def _numeric_row(label: str, key: str) -> dict:
        # One table row whose value is a formatted number from gap_stats.
        return {
            "name": label,
            "value": fmt_numeric(gap_summary[key], precision=precision),
        }

    rows = [_numeric_row("period", "period")]
    # The frequency entry is optional in gap_stats and is already a string,
    # so it is shown verbatim.
    if "frequency" in gap_summary:
        rows.append({"name": "frequency", "value": gap_summary["frequency"]})
    rows.extend(
        _numeric_row(label, key)
        for label, key in (
            ("min interval", "min"),
            ("max interval", "max"),
            ("mean interval", "mean"),
            ("interval std", "std"),
        )
    )

    gap_table = Table(
        rows,
        name="Intervals statistics",
        style=config.html.style,
    )

    gap_plot = Image(
        plot_timeseries_gap_analysis(
            config, gap_summary["series"], gap_summary["gaps"]
        ),
        image_format=config.plot.image_format,
        alt="Gap plot",
        name="",
        anchor_id=f"{summary['varid']}_gap_plot",
    )

    return Container(
        [gap_table, gap_plot],
        image_format=config.plot.image_format,
        sequence_type="grid",
        name="Gap analysis",
        anchor_id=f"{summary['varid']}_gap_analysis",
    )
@manage_matplotlib_context()
def plot_timeseries_gap_analysis(
    config: Settings,
    series: Union[pd.Series, List[pd.Series]],
    gaps: Union[pd.Series, List[pd.Series]],
    figsize: tuple = (6, 3),
) -> matplotlib.figure.Figure:
    """Plot one or several time series as lines and shade their gaps.

    Args:
        config: report settings, used to pick the colours and passed to
            ``plot_360_n0sc0pe``; ``config.html.style._labels`` is read when
            ``series`` is a list.
        series: the series to plot, or a list of series to overlay.
        gaps: for a single series, the x-ranges to shade; for a list of
            series, one such collection per series, in the same order.
        figsize: The size of the figure (width, height) in inches,
            default (6, 3).

    Returns:
        The value returned by ``plot_360_n0sc0pe(config)``.
        NOTE(review): the annotation says ``matplotlib.figure.Figure`` -
        confirm it matches what ``plot_360_n0sc0pe`` actually returns.
    """
    fig = plt.figure(figsize=figsize)
    ax = fig.add_subplot(111)

    colors = create_comparison_color_list(config)
    if isinstance(series, list):
        # Shade every gap over the value range shared by all series so the
        # bands have the same height whichever series they belong to.
        lower = min(s.min() for s in series)
        upper = max(s.max() for s in series)
        labels = config.html.style._labels
        for serie, serie_gaps, color, label in zip(series, gaps, colors, labels):
            serie.plot(
                ax=ax,
                label=label,
                color=color,
                alpha=0.65,
            )
            _format_ts_date_axis(serie, ax)
            ax.yaxis.set_major_locator(MaxNLocator(integer=True))
            for gap in serie_gaps:
                ax.fill_between(x=gap, y1=lower, y2=upper, color=color, alpha=0.25)
    else:
        series.plot(ax=ax)
        _format_ts_date_axis(series, ax)
        ax.yaxis.set_major_locator(MaxNLocator(integer=True))

        # The value range does not depend on the gap: compute it once
        # instead of rescanning the series for every shaded band.
        lower, upper = series.min(), series.max()
        for gap in gaps:
            ax.fill_between(x=gap, y1=lower, y2=upper, color=colors[0], alpha=0.25)

    return plot_360_n0sc0pe(config)