diff --git a/CHANGELOG.md b/CHANGELOG.md index bc1df8018..44498c97c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Remove upper limitation on version of numba ([#1321](https://github.com/tinkoff-ai/etna/pull/1321)) - Optimize `TSDataset.describe` and `TSDataset.info` by vectorization ([#1344](https://github.com/tinkoff-ai/etna/pull/1344)) - Add documentation warning about using dill during loading ([#1346](https://github.com/tinkoff-ai/etna/pull/1346)) +- Vectorize metric computation ([#1347](https://github.com/tinkoff-ai/etna/pull/1347)) ### Fixed - Pipeline ensembles fail in `etna forecast` CLI ([#1331](https://github.com/tinkoff-ai/etna/pull/1331)) diff --git a/etna/metrics/__init__.py b/etna/metrics/__init__.py index 231931a49..839ec70e2 100644 --- a/etna/metrics/__init__.py +++ b/etna/metrics/__init__.py @@ -1,13 +1,12 @@ -from sklearn.metrics import mean_absolute_error as mae -from sklearn.metrics import mean_squared_error as mse -from sklearn.metrics import mean_squared_log_error as msle -from sklearn.metrics import median_absolute_error as medae -from sklearn.metrics import r2_score - from etna.metrics.base import Metric from etna.metrics.base import MetricAggregationMode +from etna.metrics.functional_metrics import mae from etna.metrics.functional_metrics import mape from etna.metrics.functional_metrics import max_deviation +from etna.metrics.functional_metrics import medae +from etna.metrics.functional_metrics import mse +from etna.metrics.functional_metrics import msle +from etna.metrics.functional_metrics import r2_score from etna.metrics.functional_metrics import rmse from etna.metrics.functional_metrics import sign from etna.metrics.functional_metrics import smape diff --git a/etna/metrics/base.py b/etna/metrics/base.py index 7d9b19df2..db93f4c2a 100644 --- a/etna/metrics/base.py +++ b/etna/metrics/base.py @@ -1,17 +1,19 @@ from abc import ABC from abc 
import abstractmethod from enum import Enum -from typing import Callable from typing import Dict from typing import Optional from typing import Union import numpy as np import pandas as pd +from typing_extensions import Protocol +from typing_extensions import assert_never from etna.core import BaseMixin from etna.datasets.tsdataset import TSDataset from etna.loggers import tslogger +from etna.metrics.functional_metrics import ArrayLike class MetricAggregationMode(str, Enum): @@ -27,6 +29,31 @@ def _missing_(cls, value): ) +class MetricFunctionSignature(str, Enum): + """Enum for different metric function signatures.""" + + #: function should expect arrays of y_pred and y_true with length ``n_timestamps`` and return scalar + array_to_scalar = "array_to_scalar" + + #: function should expect matrices of y_pred and y_true with shape ``(n_timestamps, n_segments)`` + #: and return vector of length ``n_segments`` + matrix_to_array = "matrix_to_array" + + @classmethod + def _missing_(cls, value): + raise NotImplementedError( + f"{value} is not a valid {cls.__name__}. Only {', '.join([repr(m.value) for m in cls])} signatures allowed" + ) + + +class MetricFunction(Protocol): + """Protocol for ``metric_fn`` parameter.""" + + @abstractmethod + def __call__(self, y_true: ArrayLike, y_pred: ArrayLike) -> ArrayLike: + pass + + class AbstractMetric(ABC): """Abstract class for metric.""" @@ -74,7 +101,13 @@ class Metric(AbstractMetric, BaseMixin): dataset and aggregates it according to mode. """ - def __init__(self, metric_fn: Callable[..., float], mode: str = MetricAggregationMode.per_segment, **kwargs): + def __init__( + self, + metric_fn: MetricFunction, + mode: str = MetricAggregationMode.per_segment, + metric_fn_signature: str = "array_to_scalar", + **kwargs, + ): """ Init Metric. 
@@ -89,21 +122,29 @@ def __init__(self, metric_fn: Callable[..., float], mode: str = MetricAggregatio * if "per-segment" -- does not aggregate metrics + metric_fn_signature: + type of signature of ``metric_fn`` (see :py:class:`~etna.metrics.base.MetricFunctionSignature`) kwargs: functional metric's params Raises ------ NotImplementedError: - it non existent mode is used + If non-existent ``mode`` is used. + NotImplementedError: + If non-existent ``metric_fn_signature`` is used. """ - self.metric_fn = metric_fn - self.kwargs = kwargs - if MetricAggregationMode(mode) == MetricAggregationMode.macro: + if MetricAggregationMode(mode) is MetricAggregationMode.macro: self._aggregate_metrics = self._macro_average - elif MetricAggregationMode(mode) == MetricAggregationMode.per_segment: + elif MetricAggregationMode(mode) is MetricAggregationMode.per_segment: self._aggregate_metrics = self._per_segment_average + + self._metric_fn_signature = MetricFunctionSignature(metric_fn_signature) + + self.metric_fn = metric_fn + self.kwargs = kwargs self.mode = mode + self.metric_fn_signature = metric_fn_signature @property def name(self) -> str: @@ -276,13 +317,21 @@ def __call__(self, y_true: TSDataset, y_pred: TSDataset) -> Union[float, Dict[st df_true = y_true[:, :, "target"].sort_index(axis=1) df_pred = y_pred[:, :, "target"].sort_index(axis=1) - metrics_per_segment = {} segments = df_true.columns.get_level_values("segment").unique() - for i, segment in enumerate(segments): - cur_y_true = df_true.iloc[:, i] - cur_y_pred = df_pred.iloc[:, i] - metrics_per_segment[segment] = self.metric_fn(y_true=cur_y_true, y_pred=cur_y_pred, **self.kwargs) + metrics_per_segment: Dict[str, float] + if self._metric_fn_signature is MetricFunctionSignature.array_to_scalar: + metrics_per_segment = {} + for i, segment in enumerate(segments): + cur_y_true = df_true.iloc[:, i].values + cur_y_pred = df_pred.iloc[:, i].values + metrics_per_segment[segment] = self.metric_fn(y_true=cur_y_true, 
y_pred=cur_y_pred, **self.kwargs) # type: ignore + elif self._metric_fn_signature is MetricFunctionSignature.matrix_to_array: + values = self.metric_fn(y_true=df_true.values, y_pred=df_pred.values, **self.kwargs) + metrics_per_segment = dict(zip(segments, values)) # type: ignore + else: + assert_never(self._metric_fn_signature) + metrics = self._aggregate_metrics(metrics_per_segment) return metrics diff --git a/etna/metrics/functional_metrics.py b/etna/metrics/functional_metrics.py index 30a6be5c4..404f1a757 100644 --- a/etna/metrics/functional_metrics.py +++ b/etna/metrics/functional_metrics.py @@ -1,14 +1,47 @@ +from enum import Enum from functools import partial -from typing import List +from typing import Optional +from typing import Sequence from typing import Union import numpy as np +from sklearn.metrics import mean_absolute_error as mae from sklearn.metrics import mean_squared_error as mse +from sklearn.metrics import mean_squared_log_error as msle +from sklearn.metrics import median_absolute_error as medae +from sklearn.metrics import r2_score +from typing_extensions import assert_never -ArrayLike = List[Union[float, List[float]]] +ArrayLike = Union[float, Sequence[float], Sequence[Sequence[float]]] -def mape(y_true: ArrayLike, y_pred: ArrayLike, eps: float = 1e-15) -> float: +class FunctionalMetricMultioutput(str, Enum): + """Enum for different functional metric multioutput modes.""" + + #: Compute one scalar value taking into account all outputs. + joint = "joint" + + #: Compute one value per each output. + raw_values = "raw_values" + + @classmethod + def _missing_(cls, value): + raise NotImplementedError( + f"{value} is not a valid {cls.__name__}. 
Only {', '.join([repr(m.value) for m in cls])} options allowed" + ) + + +def _get_axis_by_multioutput(multioutput: str) -> Optional[int]: + multioutput_enum = FunctionalMetricMultioutput(multioutput) + if multioutput_enum is FunctionalMetricMultioutput.joint: + return None + elif multioutput_enum is FunctionalMetricMultioutput.raw_values: + return 0 + else: + assert_never(multioutput_enum) + + +def mape(y_true: ArrayLike, y_pred: ArrayLike, eps: float = 1e-15, multioutput: str = "joint") -> ArrayLike: """Mean absolute percentage error. `Wikipedia entry on the Mean absolute percentage error @@ -26,14 +59,19 @@ def mape(y_true: ArrayLike, y_pred: ArrayLike, eps: float = 1e-15) -> float: Estimated target values. - eps: float=1e-15 + eps: MAPE is undefined for ``y_true[i]==0`` for any ``i``, so all zeros ``y_true[i]`` are clipped to ``max(eps, abs(y_true))``. + multioutput: + Defines aggregating of multiple output values + (see :py:class:`~etna.metrics.functional_metrics.FunctionalMetricMultioutput`). + Returns ------- - float - A non-negative floating point value (the best value is 0.0). + : + A non-negative floating point value (the best value is 0.0), or an array of floating point values, + one for each individual target. """ y_true_array, y_pred_array = np.asarray(y_true), np.asarray(y_pred) @@ -42,10 +80,12 @@ def mape(y_true: ArrayLike, y_pred: ArrayLike, eps: float = 1e-15) -> float: y_true_array = y_true_array.clip(eps) - return np.mean(np.abs((y_true_array - y_pred_array) / y_true_array)) * 100 + axis = _get_axis_by_multioutput(multioutput) + + return np.mean(np.abs((y_true_array - y_pred_array) / y_true_array), axis=axis) * 100 -def smape(y_true: ArrayLike, y_pred: ArrayLike, eps: float = 1e-15) -> float: +def smape(y_true: ArrayLike, y_pred: ArrayLike, eps: float = 1e-15, multioutput: str = "joint") -> ArrayLike: """Symmetric mean absolute percentage error. 
`Wikipedia entry on the Symmetric mean absolute percentage error @@ -70,22 +110,29 @@ def smape(y_true: ArrayLike, y_pred: ArrayLike, eps: float = 1e-15) -> float: SMAPE is undefined for ``y_true[i] + y_pred[i] == 0`` for any ``i``, so all zeros ``y_true[i] + y_pred[i]`` are clipped to ``max(eps, abs(y_true) + abs(y_pred))``. + multioutput: + Defines aggregating of multiple output values + (see :py:class:`~etna.metrics.functional_metrics.FunctionalMetricMultioutput`). + Returns ------- - float - A non-negative floating point value (the best value is 0.0). + : + A non-negative floating point value (the best value is 0.0), or an array of floating point values, + one for each individual target. """ y_true_array, y_pred_array = np.asarray(y_true), np.asarray(y_pred) if len(y_true_array.shape) != len(y_pred_array.shape): raise ValueError("Shapes of the labels must be the same") + axis = _get_axis_by_multioutput(multioutput) + return 100 * np.mean( - 2 * np.abs(y_pred_array - y_true_array) / (np.abs(y_true_array) + np.abs(y_pred_array)).clip(eps) + 2 * np.abs(y_pred_array - y_true_array) / (np.abs(y_true_array) + np.abs(y_pred_array)).clip(eps), axis=axis ) -def sign(y_true: ArrayLike, y_pred: ArrayLike) -> float: +def sign(y_true: ArrayLike, y_pred: ArrayLike, multioutput: str = "joint") -> ArrayLike: """Sign error metric. .. math:: @@ -103,20 +150,27 @@ def sign(y_true: ArrayLike, y_pred: ArrayLike) -> float: Estimated target values. + multioutput: + Defines aggregating of multiple output values + (see :py:class:`~etna.metrics.functional_metrics.FunctionalMetricMultioutput`). + Returns ------- - float - A floating point value (the best value is 0.0). + : + A floating point value, or an array of floating point values, + one for each individual target. 
""" y_true_array, y_pred_array = np.asarray(y_true), np.asarray(y_pred) if len(y_true_array.shape) != len(y_pred_array.shape): raise ValueError("Shapes of the labels must be the same") - return np.mean(np.sign(y_true_array - y_pred_array)) + axis = _get_axis_by_multioutput(multioutput) + return np.mean(np.sign(y_true_array - y_pred_array), axis=axis) -def max_deviation(y_true: ArrayLike, y_pred: ArrayLike) -> float: + +def max_deviation(y_true: ArrayLike, y_pred: ArrayLike, multioutput: str = "joint") -> ArrayLike: """Max Deviation metric. Parameters @@ -131,25 +185,31 @@ def max_deviation(y_true: ArrayLike, y_pred: ArrayLike) -> float: Estimated target values. + multioutput: + Defines aggregating of multiple output values + (see :py:class:`~etna.metrics.functional_metrics.FunctionalMetricMultioutput`). + Returns ------- - float - A floating point value (the best value is 0.0). + : + A non-negative floating point value (the best value is 0.0), or an array of floating point values, + one for each individual target. """ y_true_array, y_pred_array = np.asarray(y_true), np.asarray(y_pred) if len(y_true_array.shape) != len(y_pred_array.shape): raise ValueError("Shapes of the labels must be the same") - prefix_error_sum = np.cumsum(y_pred_array - y_true_array) + axis = _get_axis_by_multioutput(multioutput) - return max(np.abs(prefix_error_sum)) + prefix_error_sum = np.cumsum(y_pred_array - y_true_array, axis=axis) + return np.max(np.abs(prefix_error_sum), axis=axis) rmse = partial(mse, squared=False) -def wape(y_true: ArrayLike, y_pred: ArrayLike) -> float: +def wape(y_true: ArrayLike, y_pred: ArrayLike, multioutput: str = "joint") -> ArrayLike: """Weighted average percentage Error metric. .. math:: @@ -167,14 +227,24 @@ def wape(y_true: ArrayLike, y_pred: ArrayLike) -> float: Estimated target values. + multioutput: + Defines aggregating of multiple output values + (see :py:class:`~etna.metrics.functional_metrics.FunctionalMetricMultioutput`). 
+ Returns ------- - float - A floating point value (the best value is 0.0). + : + A non-negative floating point value (the best value is 0.0), or an array of floating point values, + one for each individual target. """ y_true_array, y_pred_array = np.asarray(y_true), np.asarray(y_pred) if len(y_true_array.shape) != len(y_pred_array.shape): raise ValueError("Shapes of the labels must be the same") - return np.sum(np.abs(y_true_array - y_pred_array)) / np.sum(np.abs(y_true_array)) + axis = _get_axis_by_multioutput(multioutput) + + return np.sum(np.abs(y_true_array - y_pred_array), axis=axis) / np.sum(np.abs(y_true_array), axis=axis) # type: ignore + + +__all__ = ["mae", "mse", "msle", "medae", "r2_score", "mape", "smape", "sign", "max_deviation", "rmse", "wape"] diff --git a/etna/metrics/intervals_metrics.py b/etna/metrics/intervals_metrics.py index da4bb7a34..2f6bac480 100644 --- a/etna/metrics/intervals_metrics.py +++ b/etna/metrics/intervals_metrics.py @@ -8,9 +8,10 @@ from etna.datasets import TSDataset from etna.metrics.base import Metric from etna.metrics.base import MetricAggregationMode +from etna.metrics.functional_metrics import ArrayLike -def dummy(): +def dummy(y_true: ArrayLike, y_pred: ArrayLike) -> ArrayLike: return np.nan @@ -73,13 +74,17 @@ def __call__(self, y_true: TSDataset, y_pred: TSDataset) -> Union[float, Dict[st self._validate_nans(y_true=y_true, y_pred=y_pred) self._validate_tsdataset_quantiles(ts=y_pred, quantiles=self.quantiles) - segments = set(y_true.df.columns.get_level_values("segment")) - metrics_per_segment = {} - for segment in segments: - upper_quantile_flag = y_true[:, segment, "target"] <= y_pred[:, segment, f"target_{self.quantiles[1]:.4g}"] - lower_quantile_flag = y_true[:, segment, "target"] >= y_pred[:, segment, f"target_{self.quantiles[0]:.4g}"] + df_true = y_true[:, :, "target"].sort_index(axis=1) + df_pred_lower = y_pred[:, :, f"target_{self.quantiles[0]:.4g}"].sort_index(axis=1) + df_pred_upper = y_pred[:, :, 
f"target_{self.quantiles[1]:.4g}"].sort_index(axis=1) + + segments = df_true.columns.get_level_values("segment").unique() + + upper_quantile_flag = df_true.values <= df_pred_upper.values + lower_quantile_flag = df_true.values >= df_pred_lower.values + values = np.mean(upper_quantile_flag * lower_quantile_flag, axis=0) + metrics_per_segment = dict(zip(segments, values)) - metrics_per_segment[segment] = np.mean(upper_quantile_flag * lower_quantile_flag) metrics = self._aggregate_metrics(metrics_per_segment) return metrics @@ -140,13 +145,14 @@ def __call__(self, y_true: TSDataset, y_pred: TSDataset) -> Union[float, Dict[st self._validate_nans(y_true=y_true, y_pred=y_pred) self._validate_tsdataset_quantiles(ts=y_pred, quantiles=self.quantiles) - segments = set(y_true.df.columns.get_level_values("segment")) - metrics_per_segment = {} - for segment in segments: - upper_quantile = y_pred[:, segment, f"target_{self.quantiles[1]:.4g}"] - lower_quantile = y_pred[:, segment, f"target_{self.quantiles[0]:.4g}"] + df_true = y_true[:, :, "target"].sort_index(axis=1) + df_pred_lower = y_pred[:, :, f"target_{self.quantiles[0]:.4g}"].sort_index(axis=1) + df_pred_upper = y_pred[:, :, f"target_{self.quantiles[1]:.4g}"].sort_index(axis=1) + + segments = df_true.columns.get_level_values("segment").unique() - metrics_per_segment[segment] = np.abs(lower_quantile - upper_quantile).mean() + values = np.mean(np.abs(df_pred_upper.values - df_pred_lower.values), axis=0) + metrics_per_segment = dict(zip(segments, values)) metrics = self._aggregate_metrics(metrics_per_segment) return metrics diff --git a/etna/metrics/metrics.py b/etna/metrics/metrics.py index 0abe726cf..35be92791 100644 --- a/etna/metrics/metrics.py +++ b/etna/metrics/metrics.py @@ -1,16 +1,18 @@ -from etna.metrics import mae -from etna.metrics import mape -from etna.metrics import max_deviation -from etna.metrics import medae -from etna.metrics import mse -from etna.metrics import msle -from etna.metrics import r2_score -from 
etna.metrics import rmse -from etna.metrics import sign -from etna.metrics import smape -from etna.metrics import wape +from functools import partial + from etna.metrics.base import Metric from etna.metrics.base import MetricAggregationMode +from etna.metrics.functional_metrics import mae +from etna.metrics.functional_metrics import mape +from etna.metrics.functional_metrics import max_deviation +from etna.metrics.functional_metrics import medae +from etna.metrics.functional_metrics import mse +from etna.metrics.functional_metrics import msle +from etna.metrics.functional_metrics import r2_score +from etna.metrics.functional_metrics import rmse +from etna.metrics.functional_metrics import sign +from etna.metrics.functional_metrics import smape +from etna.metrics.functional_metrics import wape class MAE(Metric): @@ -34,7 +36,8 @@ def __init__(self, mode: str = MetricAggregationMode.per_segment, **kwargs): kwargs: metric's computation arguments """ - super().__init__(mode=mode, metric_fn=mae, **kwargs) + mae_per_output = partial(mae, multioutput="raw_values") + super().__init__(mode=mode, metric_fn=mae_per_output, metric_fn_signature="matrix_to_array", **kwargs) @property def greater_is_better(self) -> bool: @@ -63,7 +66,8 @@ def __init__(self, mode: str = MetricAggregationMode.per_segment, **kwargs): kwargs: metric's computation arguments """ - super().__init__(mode=mode, metric_fn=mse, **kwargs) + mse_per_output = partial(mse, multioutput="raw_values") + super().__init__(mode=mode, metric_fn=mse_per_output, metric_fn_signature="matrix_to_array", **kwargs) @property def greater_is_better(self) -> bool: @@ -92,7 +96,8 @@ def __init__(self, mode: str = MetricAggregationMode.per_segment, **kwargs): kwargs: metric's computation arguments """ - super().__init__(mode=mode, metric_fn=rmse, **kwargs) + rmse_per_output = partial(rmse, multioutput="raw_values") + super().__init__(mode=mode, metric_fn=rmse_per_output, metric_fn_signature="matrix_to_array", **kwargs) @property 
def greater_is_better(self) -> bool: @@ -120,7 +125,8 @@ def __init__(self, mode: str = MetricAggregationMode.per_segment, **kwargs): kwargs: metric's computation arguments """ - super().__init__(mode=mode, metric_fn=r2_score, **kwargs) + r2_per_output = partial(r2_score, multioutput="raw_values") + super().__init__(mode=mode, metric_fn=r2_per_output, metric_fn_signature="matrix_to_array", **kwargs) @property def greater_is_better(self) -> bool: @@ -149,7 +155,8 @@ def __init__(self, mode: str = MetricAggregationMode.per_segment, **kwargs): kwargs: metric's computation arguments """ - super().__init__(mode=mode, metric_fn=mape, **kwargs) + mape_per_output = partial(mape, multioutput="raw_values") + super().__init__(mode=mode, metric_fn=mape_per_output, metric_fn_signature="matrix_to_array", **kwargs) @property def greater_is_better(self) -> bool: @@ -178,7 +185,8 @@ def __init__(self, mode: str = MetricAggregationMode.per_segment, **kwargs): kwargs: metric's computation arguments """ - super().__init__(mode=mode, metric_fn=smape, **kwargs) + smape_per_output = partial(smape, multioutput="raw_values") + super().__init__(mode=mode, metric_fn=smape_per_output, metric_fn_signature="matrix_to_array", **kwargs) @property def greater_is_better(self) -> bool: @@ -207,7 +215,8 @@ def __init__(self, mode: str = MetricAggregationMode.per_segment, **kwargs): kwargs: metric's computation arguments """ - super().__init__(mode=mode, metric_fn=medae, **kwargs) + medae_per_output = partial(medae, multioutput="raw_values") + super().__init__(mode=mode, metric_fn=medae_per_output, metric_fn_signature="matrix_to_array", **kwargs) @property def greater_is_better(self) -> bool: @@ -237,7 +246,8 @@ def __init__(self, mode: str = MetricAggregationMode.per_segment, **kwargs): metric's computation arguments """ - super().__init__(mode=mode, metric_fn=msle, **kwargs) + msle_per_output = partial(msle, multioutput="raw_values") + super().__init__(mode=mode, metric_fn=msle_per_output, 
metric_fn_signature="matrix_to_array", **kwargs) @property def greater_is_better(self) -> bool: @@ -266,7 +276,8 @@ def __init__(self, mode: str = MetricAggregationMode.per_segment, **kwargs): kwargs: metric's computation arguments """ - super().__init__(mode=mode, metric_fn=sign, **kwargs) + sign_per_output = partial(sign, multioutput="raw_values") + super().__init__(mode=mode, metric_fn=sign_per_output, metric_fn_signature="matrix_to_array", **kwargs) @property def greater_is_better(self) -> None: @@ -295,7 +306,8 @@ def __init__(self, mode: str = MetricAggregationMode.per_segment, **kwargs): kwargs: metric's computation arguments """ - super().__init__(mode=mode, metric_fn=max_deviation, **kwargs) + max_deviation_per_output = partial(max_deviation, multioutput="raw_values") + super().__init__(mode=mode, metric_fn=max_deviation_per_output, metric_fn_signature="matrix_to_array", **kwargs) @property def greater_is_better(self) -> bool: @@ -323,7 +335,8 @@ def __init__(self, mode: str = MetricAggregationMode.per_segment, **kwargs): kwargs: metric's computation arguments """ - super().__init__(mode=mode, metric_fn=wape, **kwargs) + wape_per_output = partial(wape, multioutput="raw_values") + super().__init__(mode=mode, metric_fn=wape_per_output, metric_fn_signature="matrix_to_array", **kwargs) @property def greater_is_better(self) -> bool: diff --git a/etna/pipeline/base.py b/etna/pipeline/base.py index d6d232b7d..aa074aa86 100644 --- a/etna/pipeline/base.py +++ b/etna/pipeline/base.py @@ -27,6 +27,7 @@ from etna.loggers import tslogger from etna.metrics import Metric from etna.metrics import MetricAggregationMode +from etna.metrics.functional_metrics import ArrayLike Timestamp = Union[str, pd.Timestamp] @@ -301,7 +302,7 @@ def __init__(self, mode: str = MetricAggregationMode.per_segment, **kwargs): super().__init__(mode=mode, metric_fn=self._compute_metric, **kwargs) @staticmethod - def _compute_metric(y_true: np.ndarray, y_pred: np.ndarray) -> float: + def 
_compute_metric(y_true: ArrayLike, y_pred: ArrayLike) -> float: return 0.0 @property diff --git a/tests/test_metrics/test_functional_metrics.py b/tests/test_metrics/test_functional_metrics.py index a27d8fd27..f9198cf11 100644 --- a/tests/test_metrics/test_functional_metrics.py +++ b/tests/test_metrics/test_functional_metrics.py @@ -1,3 +1,4 @@ +import numpy.testing as npt import pytest from etna.metrics import mae @@ -20,12 +21,12 @@ def right_mae_value(): @pytest.fixture() def y_true_1d(): - return [1, 1] + return [1, 3] @pytest.fixture() def y_pred_1d(): - return [2, 2] + return [2, 4] @pytest.mark.parametrize( @@ -34,17 +35,17 @@ def y_pred_1d(): (mae, 1), (mse, 1), (rmse, 1), - (mape, 100), - (smape, 66.6666666667), + (mape, 66 + 2 / 3), + (smape, 47.6190476), (medae, 1), (r2_score, 0), (sign, -1), (max_deviation, 2), - (wape, 1), + (wape, 1 / 2), ), ) def test_all_1d_metrics(metric, right_metrics_value, y_true_1d, y_pred_1d): - assert round(metric(y_true_1d, y_pred_1d), 10) == right_metrics_value + npt.assert_almost_equal(metric(y_true_1d, y_pred_1d), right_metrics_value) def test_mle_metric_exception(y_true_1d, y_pred_1d): @@ -54,14 +55,29 @@ def test_mle_metric_exception(y_true_1d, y_pred_1d): msle(y_true_1d, y_pred_1d) +@pytest.mark.parametrize( + "metric", + ( + mape, + smape, + sign, + max_deviation, + wape, + ), +) +def test_all_wrong_mode(metric, y_true_1d, y_pred_1d): + with pytest.raises(NotImplementedError): + metric(y_true_1d, y_pred_1d, multioutput="unknown") + + @pytest.fixture() def y_true_2d(): - return [[1, 1], [1, 1]] + return [[10, 1], [11, 2]] @pytest.fixture() def y_pred_2d(): - return [[2, 2], [2, 2]] + return [[11, 2], [10, 1]] @pytest.mark.parametrize( @@ -70,14 +86,33 @@ def y_pred_2d(): (mae, 1), (mse, 1), (rmse, 1), - (mape, 100), - (smape, 66.6666666667), + (mape, 42 + 3 / 11), + (smape, 38.0952380), (medae, 1), - (r2_score, 0.0), - (sign, -1), - (max_deviation, 4), - (wape, 1), + (r2_score, -3), + (sign, 0), + (max_deviation, 2), + 
(wape, 1 / 6), + ), +) +def test_all_2d_metrics_joint(metric, right_metrics_value, y_true_2d, y_pred_2d): + npt.assert_almost_equal(metric(y_true_2d, y_pred_2d), right_metrics_value) + + +@pytest.mark.parametrize( + "metric, params, right_metrics_value", + ( + (mae, {"multioutput": "raw_values"}, [1, 1]), + (mse, {"multioutput": "raw_values"}, [1, 1]), + (rmse, {"multioutput": "raw_values"}, [1, 1]), + (mape, {"multioutput": "raw_values"}, [9.5454545, 75]), + (smape, {"multioutput": "raw_values"}, [9.5238095, 66 + 2 / 3]), + (medae, {"multioutput": "raw_values"}, [1, 1]), + (r2_score, {"multioutput": "raw_values"}, [-3, -3]), + (sign, {"multioutput": "raw_values"}, [0, 0]), + (max_deviation, {"multioutput": "raw_values"}, [1, 1]), + (wape, {"multioutput": "raw_values"}, [0.0952381, 2 / 3]), ), ) -def test_all_2d_metrics(metric, right_metrics_value, y_true_2d, y_pred_2d): - assert round(metric(y_true_2d, y_pred_2d), 10) == right_metrics_value +def test_all_2d_metrics_per_output(metric, params, right_metrics_value, y_true_2d, y_pred_2d): + npt.assert_almost_equal(metric(y_true_2d, y_pred_2d, **params), right_metrics_value) diff --git a/tests/test_metrics/test_intervals_metrics.py b/tests/test_metrics/test_intervals_metrics.py index b69149dfd..9595e0d53 100644 --- a/tests/test_metrics/test_intervals_metrics.py +++ b/tests/test_metrics/test_intervals_metrics.py @@ -18,7 +18,7 @@ def tsdataset_with_zero_width_quantiles(example_df): @pytest.fixture -def tsdataset_with_differnt_width_and_shifted_quantiles(example_df): +def tsdataset_with_different_width_and_shifted_quantiles(example_df): ts_train = TSDataset.to_dataset(example_df) ts_train = TSDataset(ts_train, freq="H") @@ -45,8 +45,8 @@ def test_width_metric_with_zero_width_quantiles(tsdataset_with_zero_width_quanti assert width_metric[segment] == expected_metric -def test_width_metric_with_differnt_width_and_shifted_quantiles(tsdataset_with_differnt_width_and_shifted_quantiles): - ts_train, ts_test = 
tsdataset_with_differnt_width_and_shifted_quantiles +def test_width_metric_with_different_width_and_shifted_quantiles(tsdataset_with_different_width_and_shifted_quantiles): + ts_train, ts_test = tsdataset_with_different_width_and_shifted_quantiles expected_metric = {"segment_1": 1.0, "segment_2": 0.0} width_metric = Width(mode="per-segment")(ts_train, ts_test) @@ -55,8 +55,10 @@ def test_width_metric_with_differnt_width_and_shifted_quantiles(tsdataset_with_d assert width_metric[segment] == expected_metric[segment] -def test_coverage_metric_with_differnt_width_and_shifted_quantiles(tsdataset_with_differnt_width_and_shifted_quantiles): - ts_train, ts_test = tsdataset_with_differnt_width_and_shifted_quantiles +def test_coverage_metric_with_different_width_and_shifted_quantiles( + tsdataset_with_different_width_and_shifted_quantiles, +): + ts_train, ts_test = tsdataset_with_different_width_and_shifted_quantiles expected_metric = {"segment_1": 0.0, "segment_2": 1.0} coverage_metric = Coverage(mode="per-segment")(ts_train, ts_test) diff --git a/tests/test_metrics/test_metrics.py b/tests/test_metrics/test_metrics.py index e65b4196c..1d02d5b98 100644 --- a/tests/test_metrics/test_metrics.py +++ b/tests/test_metrics/test_metrics.py @@ -1,4 +1,5 @@ from copy import deepcopy +from functools import partial import numpy as np import pandas as pd @@ -16,6 +17,7 @@ from etna.metrics import sign from etna.metrics import smape from etna.metrics import wape +from etna.metrics.base import Metric from etna.metrics.base import MetricAggregationMode from etna.metrics.metrics import MAE from etna.metrics.metrics import MAPE @@ -112,7 +114,7 @@ def test_metrics_per_segment(metric_class, train_test_dfs): "metric_class", (MAE, MSE, RMSE, MedAE, MSLE, MAPE, SMAPE, R2, Sign, MaxDeviation, DummyMetric, WAPE) ) def test_metrics_invalid_aggregation(metric_class): - """Check metrics behavior in case of invalid aggregation mode""" + """Check metrics behavior in case of invalid aggregation 
mode""" with pytest.raises(NotImplementedError): _ = metric_class(mode="a") @@ -210,6 +212,59 @@ def test_metrics_values(metric_class, metric_fn, train_test_dfs): assert value == true_metric_value + + +def _create_metric_class(metric_fn, metric_fn_signature, greater_is_better): + def make_init(metric_fn, metric_fn_signature): + def init(self, mode): + Metric.__init__(self=self, mode=mode, metric_fn=metric_fn, metric_fn_signature=metric_fn_signature) + + return init + + new_class = type( + "NewMetric", + (Metric,), + { + "__init__": make_init(metric_fn=metric_fn, metric_fn_signature=metric_fn_signature), + "greater_is_better": lambda: greater_is_better, + }, + ) + + return new_class + + +@pytest.mark.parametrize( + "metric_fn, matrix_to_array_params, greater_is_better", + ( + (mae, {"multioutput": "raw_values"}, False), + (mse, {"multioutput": "raw_values"}, False), + (rmse, {"multioutput": "raw_values"}, False), + (mape, {"multioutput": "raw_values"}, False), + (smape, {"multioutput": "raw_values"}, False), + (medae, {"multioutput": "raw_values"}, False), + (r2_score, {"multioutput": "raw_values"}, True), + (sign, {"multioutput": "raw_values"}, None), + (max_deviation, {"multioutput": "raw_values"}, False), + (wape, {"multioutput": "raw_values"}, False), + ), +) +def test_metrics_equivalence_of_signatures(metric_fn, matrix_to_array_params, greater_is_better, train_test_dfs): + forecast_df, true_df = train_test_dfs + + metric_1_class = _create_metric_class( + metric_fn=metric_fn, metric_fn_signature="array_to_scalar", greater_is_better=greater_is_better + ) + metric_1 = metric_1_class(mode="per-segment") + metric_fn_matrix_to_array = partial(metric_fn, **matrix_to_array_params) + metric_2_class = _create_metric_class( + metric_fn=metric_fn_matrix_to_array, metric_fn_signature="matrix_to_array", greater_is_better=greater_is_better + ) + metric_2 = metric_2_class(mode="per-segment") + + metric_1_values = metric_1(y_pred=forecast_df, y_true=true_df) + 
metric_2_values = metric_2(y_pred=forecast_df, y_true=true_df) + + assert metric_1_values == metric_2_values + + @pytest.mark.parametrize( "metric_class", (MAE, MSE, RMSE, MedAE, MSLE, MAPE, SMAPE, R2, Sign, MaxDeviation, DummyMetric, WAPE) )