Skip to content

Commit

Permalink
Fix problem with get_anomalies_prediction_interval (#1296)
Browse files Browse the repository at this point in the history
  • Loading branch information
ostreech1997 committed Jun 28, 2023
1 parent 3f4ce91 commit 72c8aea
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 13 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Fix problem with segment name "target" in `StackingEnsemble` ([#1262](https://github.com/tinkoff-ai/etna/pull/1262))
- Fix `BasePipeline.forecast` when prediction intervals are estimated on history data with presence of NaNs ([#1291](https://github.com/tinkoff-ai/etna/pull/1291))
- Teach `BaseMixin.set_params` to work with nested `list` and `tuple` ([#1201](https://github.com/tinkoff-ai/etna/pull/1201))
- Fix `get_anomalies_prediction_interval` to work when segments have different start dates ([#1296](https://github.com/tinkoff-ai/etna/pull/1296))

## [2.0.0] - 2023-04-11
### Added
Expand Down
50 changes: 37 additions & 13 deletions etna/analysis/outliers/prediction_interval_outliers.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,16 @@
from typing import Type
from typing import Union

import numpy as np
import pandas as pd

from etna.datasets import TSDataset

if TYPE_CHECKING:
from etna.datasets import TSDataset
from etna.models import ProphetModel
from etna.models import SARIMAXModel


def create_ts_by_column(ts: "TSDataset", column: str) -> "TSDataset":
def create_ts_by_column(ts: TSDataset, column: str) -> TSDataset:
"""Create TSDataset based on original ts with selecting only column in each segment and setting it to target.
Parameters
Expand All @@ -29,16 +29,40 @@ def create_ts_by_column(ts: "TSDataset", column: str) -> "TSDataset":
result: TSDataset
dataset with selected column.
"""
from etna.datasets import TSDataset

new_df = ts[:, :, [column]]
new_columns_tuples = [(x[0], "target") for x in new_df.columns.tolist()]
new_df.columns = pd.MultiIndex.from_tuples(new_columns_tuples, names=new_df.columns.names)
return TSDataset(new_df, freq=ts.freq)


def _select_segments_subset(ts: TSDataset, segments: List[str]) -> TSDataset:
    """Build a new TSDataset that contains only the requested segments.

    The data is taken from ``ts.raw_df``; rows of the selection that contain
    NaN values are dropped. If ``ts`` has exogenous data, it is restricted to
    the remaining timestamps and to the same segments.

    Parameters
    ----------
    ts:
        dataset with timeseries data
    segments:
        list with names of the segments to keep

    Returns
    -------
    result: TSDataset
        dataset with the selected segments, created with ``known_future`` and ``freq`` of ``ts``.
    """
    # One column selector is shared by the main and the exogenous frames.
    segment_columns = pd.IndexSlice[segments, :]

    selected_df = ts.raw_df.loc[:, segment_columns].copy().dropna()

    selected_exog = ts.df_exog
    if selected_exog is not None:
        # Keep exogenous values only for timestamps that survived ``dropna``.
        selected_exog = selected_exog.loc[selected_df.index, segment_columns].copy()

    return TSDataset(
        df=selected_df,
        df_exog=selected_exog,
        known_future=ts.known_future,
        freq=ts.freq,
    )


def get_anomalies_prediction_interval(
ts: "TSDataset",
ts: TSDataset,
model: Union[Type["ProphetModel"], Type["SARIMAXModel"]],
interval_width: float = 0.95,
in_column: str = "target",
Expand Down Expand Up @@ -78,18 +102,18 @@ def get_anomalies_prediction_interval(
else:
ts_inner = create_ts_by_column(ts, in_column)
outliers_per_segment = {}
time_points = np.array(ts.index.values)
model_instance = model(**model_params)
model_instance.fit(ts_inner)
lower_p, upper_p = [(1 - interval_width) / 2, (1 + interval_width) / 2]
prediction_interval = model_instance.predict(
deepcopy(ts_inner), prediction_interval=True, quantiles=[lower_p, upper_p]
)
for segment in ts_inner.segments:
predicted_segment_slice = prediction_interval[:, segment, :][segment]
actual_segment_slice = ts_inner[:, segment, :][segment]
ts_segment = _select_segments_subset(ts=ts_inner, segments=[segment])
prediction_interval = model_instance.predict(
deepcopy(ts_segment), prediction_interval=True, quantiles=[lower_p, upper_p]
)
actual_segment_slice = ts_segment[:, segment, :][segment]
predicted_segment_slice = prediction_interval[actual_segment_slice.index, segment, :][segment]
anomalies_mask = (actual_segment_slice["target"] > predicted_segment_slice[f"target_{upper_p:.4g}"]) | (
actual_segment_slice["target"] < predicted_segment_slice[f"target_{lower_p:.4g}"]
)
outliers_per_segment[segment] = list(time_points[anomalies_mask])
outliers_per_segment[segment] = list(predicted_segment_slice[anomalies_mask].index.values)
return outliers_per_segment
Original file line number Diff line number Diff line change
Expand Up @@ -66,3 +66,14 @@ def test_get_anomalies_prediction_interval_values(outliers_tsds, model, interval
)
== true_anomalies
)


@pytest.mark.parametrize(
    "model, interval_width, in_column",
    [
        (ProphetModel, 0.95, "target"),
        (SARIMAXModel, 0.999, "target"),
    ],
)
def test_get_anomalies_prediction_interval_imbalanced_tsdf(imbalanced_tsdf, model, interval_width, in_column):
    """Smoke test: the call must complete without raising on the ``imbalanced_tsdf`` fixture."""
    call_kwargs = {
        "model": model,
        "interval_width": interval_width,
        "in_column": in_column,
    }
    # No assertion on the result: only successful completion is checked.
    get_anomalies_prediction_interval(imbalanced_tsdf, **call_kwargs)

1 comment on commit 72c8aea

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.