Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: first version of the gap analysis tab for ts #1410

Merged
merged 4 commits into from
Aug 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions src/ydata_profiling/model/pandas/describe_timeseries_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from statsmodels.tsa.stattools import adfuller

from ydata_profiling.config import Settings
from ydata_profiling.model.pandas.utils_pandas import get_period_and_frequency
from ydata_profiling.model.summary_algorithms import (
describe_numeric_1d,
describe_timeseries_1d,
Expand Down Expand Up @@ -141,6 +142,49 @@ def get_fft_peaks(
return threshold, orig_peaks, peaks


def compute_gap_stats(series: pd.Series) -> dict:
    """Compute interval statistics and locate the gaps of a time series.

    The expected sampling period is derived from the full index of ``series``
    (rows holding missing values included). Intervals are then measured
    between consecutive index entries whose value is not null, and every
    interval strictly longer than the period is reported as a gap.

    Args:
        series (pd.Series): time series data to analyse.

    Returns:
        A dict with the keys:
            - "period": expected period, expressed in base-frequency units.
            - "min", "max", "mean", "std": statistics of the intervals between
              consecutive non-null observations, in base-frequency units.
            - "series": the input series, unchanged.
            - "gaps": list of two-element arrays ``[start, end]`` holding the
              index values that delimit each gap.
            - "frequency": name of the time unit; only present when the index
              is a ``pd.DatetimeIndex``.
    """
    # Take the index values directly. Going through ``reset_index()[name]``
    # selects the *values* column when the series itself is named "index" and
    # raises when the series and its index share the same name.
    positions = pd.Series(series.dropna().index)

    is_datetime = isinstance(series.index, pd.DatetimeIndex)
    if is_datetime:
        period, frequency = get_period_and_frequency(series.index)
        # Rebuild timedeltas so that intervals can be compared with the period
        # and normalised to a plain number of ``frequency`` units.
        period = pd.Timedelta(f"{period} {frequency}")
        base_frequency = pd.Timedelta(f"1 {frequency}")
    else:
        period = np.abs(np.diff(series.index)).mean()
        base_frequency = 1

    diff = positions.diff()
    # ``diff`` is NaN at position 0, so every anchor ``i`` is >= 1 and
    # ``i - 1`` always refers to the observation right before the gap.
    anchors = positions[diff > period].index
    gaps = [positions.iloc[[i - 1, i]].values for i in anchors]

    stats = {
        "period": period / base_frequency,
        "min": diff.min() / base_frequency,
        "max": diff.max() / base_frequency,
        "mean": diff.mean() / base_frequency,
        "std": diff.std() / base_frequency,
        "series": series,
        "gaps": gaps,
    }
    if is_datetime:
        stats["frequency"] = frequency
    return stats


@describe_timeseries_1d.register
@series_hashable
@series_handle_nulls
Expand All @@ -164,5 +208,6 @@ def pandas_describe_timeseries_1d(
stats["stationary"] = is_stationary and not stats["seasonal"]
stats["addfuller"] = p_value
stats["series"] = series
stats["gap_stats"] = compute_gap_stats(series)

return config, series, stats
6 changes: 2 additions & 4 deletions src/ydata_profiling/model/pandas/timeseries_index_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from pandas.api.types import is_numeric_dtype

from ydata_profiling.config import Settings
from ydata_profiling.model.pandas.utils_pandas import get_period_and_frequency
from ydata_profiling.model.timeseries_index import get_time_index_description


Expand All @@ -21,10 +22,7 @@ def pandas_get_time_index_description(
start = df.index.min()
end = df.index.max()
if isinstance(df.index, pd.DatetimeIndex):
freq = df.index.inferred_freq
delta = abs(np.diff(df.index)).mean()
delta = delta.astype(f"timedelta64[{df.index.inferred_freq}]")
period = delta.astype(float)
period, freq = get_period_and_frequency(df.index)
else:
freq = None
period = abs(np.diff(df.index)).mean()
Expand Down
21 changes: 21 additions & 0 deletions src/ydata_profiling/model/pandas/utils_pandas.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
from typing import Tuple

import numpy as np
import pandas as pd


def weighted_median(data: np.ndarray, weights: np.ndarray) -> int:
Expand All @@ -25,3 +28,21 @@ def weighted_median(data: np.ndarray, weights: np.ndarray) -> int:
else:
w_median = s_data[idx + 1]
return w_median


def get_period_and_frequency(index: pd.DatetimeIndex) -> Tuple[float, str]:
    """Return the mean sampling step of a datetime index and its time unit.

    The mean absolute difference between consecutive index entries is
    expressed in the coarsest unit (days, seconds, microseconds, nanoseconds)
    whose ``Timedelta`` component is non-zero.

    Args:
        index (pd.DatetimeIndex): the datetime index to analyse.

    Returns:
        A ``(period, frequency)`` tuple where ``period`` is the mean step as a
        float number of ``frequency`` units and ``frequency`` is one of
        "Days", "Seconds", "Microseconds" or "Nanoseconds".
    """
    delta = pd.Timedelta(abs(np.diff(index)).mean())

    # Ordered from coarsest to finest; the first non-zero component wins.
    candidates = (
        ("Days", delta.days, pd.Timedelta(days=1)),
        ("Seconds", delta.seconds, pd.Timedelta(seconds=1)),
        ("Microseconds", delta.microseconds, pd.Timedelta(microseconds=1)),
    )
    for frequency, component, unit in candidates:
        if component > 0:
            return delta / unit, frequency

    # Divide the Timedelta itself: dividing the integer ``delta.nanoseconds``
    # by a Timedelta is not a supported operation and raises TypeError.
    return delta / pd.Timedelta(nanoseconds=1), "Nanoseconds"
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,80 @@
VariableInfo,
)
from ydata_profiling.report.structure.variables.render_common import render_common
from ydata_profiling.visualisation.plot import histogram, mini_ts_plot, plot_acf_pacf
from ydata_profiling.visualisation.plot import (
histogram,
mini_ts_plot,
plot_acf_pacf,
plot_timeseries_gap_analysis,
)


def _render_gap_tab(config: Settings, summary: dict) -> Container:
    """Build the "Gap analysis" tab for a time-series variable.

    The tab is a grid container holding a table of interval statistics and a
    plot of the series with its gaps highlighted.

    Args:
        config: report Settings; ``report.precision``, ``html.style`` and
            ``plot.image_format`` are read here.
        summary: variable summary. Must contain "varid" and a "gap_stats"
            mapping with the keys "period", "min", "max", "mean", "std",
            "series" and "gaps", plus optionally "frequency".

    Returns:
        The Container rendering the gap analysis tab.
    """
    gap_summary = summary["gap_stats"]
    precision = config.report.precision

    def numeric_row(label: str, key: str) -> dict:
        # One table row whose value is formatted with the report precision.
        return {
            "name": label,
            "value": fmt_numeric(gap_summary[key], precision=precision),
        }

    rows = [numeric_row("period", "period")]
    # The frequency entry is optional in the gap statistics.
    if "frequency" in gap_summary:
        rows.append({"name": "frequency", "value": gap_summary["frequency"]})
    rows.extend(
        numeric_row(label, key)
        for label, key in (
            ("min interval", "min"),
            ("max interval", "max"),
            ("mean interval", "mean"),
            ("interval std", "std"),
        )
    )

    gap_table = Table(
        rows,
        name="Intervals statistics",
        style=config.html.style,
    )

    gap_plot = Image(
        plot_timeseries_gap_analysis(
            config, gap_summary["series"], gap_summary["gaps"]
        ),
        image_format=config.plot.image_format,
        alt="Gap plot",
        name="",
        anchor_id=f"{summary['varid']}_gap_plot",
    )
    return Container(
        [gap_table, gap_plot],
        image_format=config.plot.image_format,
        sequence_type="grid",
        name="Gap analysis",
        anchor_id=f"{summary['varid']}_gap_analysis",
    )


def render_timeseries(config: Settings, summary: dict) -> dict:
Expand Down Expand Up @@ -289,8 +362,10 @@ def render_timeseries(config: Settings, summary: dict) -> dict:
anchor_id=f"{varid}_ts_plot",
)

ts_gap = _render_gap_tab(config, summary)

template_variables["bottom"] = Container(
[statistics, hist, ts_plot, fq, evs, acf_pacf],
[statistics, hist, ts_plot, ts_gap, fq, evs, acf_pacf],
sequence_type="tabs",
anchor_id=f"{varid}bottom",
)
Expand Down
48 changes: 47 additions & 1 deletion src/ydata_profiling/visualisation/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from matplotlib.colors import Colormap, LinearSegmentedColormap, ListedColormap, rgb2hex
from matplotlib.dates import AutoDateLocator, ConciseDateFormatter
from matplotlib.patches import Patch
from matplotlib.ticker import FuncFormatter
from matplotlib.ticker import FuncFormatter, MaxNLocator
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from typeguard import typechecked
from wordcloud import WordCloud
Expand Down Expand Up @@ -557,6 +557,52 @@ def _format_ts_date_axis(
return axis


@manage_matplotlib_context()
def plot_timeseries_gap_analysis(
    config: Settings,
    series: Union[pd.Series, List[pd.Series]],
    gaps: Union[pd.Series, List[pd.Series]],
    figsize: tuple = (6, 3),
) -> matplotlib.figure.Figure:
    """Draw a time series as a line plot with its gaps shaded.

    Args:
        config: report Settings.
        series: the series to plot, or a list of series to overlay on the
            same axes.
        gaps: the gaps of ``series``, each one an ``x`` span handed to
            ``fill_between``; when ``series`` is a list, one collection of
            gaps per series.
        figsize: The size of the figure (width, height) in inches,
            default (6, 3).
    Returns:
        The result of ``plot_360_n0sc0pe(config)`` for the drawn figure.
    """
    figure = plt.figure(figsize=figsize)
    axis = figure.add_subplot(111)
    palette = create_comparison_color_list(config)

    if not isinstance(series, list):
        # Single series: default line style, gaps shaded with the first colour.
        series.plot(ax=axis)
        _format_ts_date_axis(series, axis)
        axis.yaxis.set_major_locator(MaxNLocator(integer=True))

        lower, upper = series.min(), series.max()
        for span in gaps:
            axis.fill_between(
                x=span, y1=lower, y2=upper, color=palette[0], alpha=0.25
            )
        return plot_360_n0sc0pe(config)

    # Several series: the shaded bands share one vertical extent that covers
    # every series, and each series gets its own colour and label.
    lower = min(s.min() for s in series)
    upper = max(s.max() for s in series)
    names = config.html.style._labels
    for current, current_gaps, shade, name in zip(series, gaps, palette, names):
        current.plot(
            ax=axis,
            label=name,
            color=shade,
            alpha=0.65,
        )
        _format_ts_date_axis(current, axis)
        axis.yaxis.set_major_locator(MaxNLocator(integer=True))
        for span in current_gaps:
            axis.fill_between(x=span, y1=lower, y2=upper, color=shade, alpha=0.25)

    return plot_360_n0sc0pe(config)


@manage_matplotlib_context()
def plot_overview_timeseries(
config: Settings,
Expand Down