From 57fd814b333c843d2df2f99477297ca3a2390d9e Mon Sep 17 00:00:00 2001 From: Piotr Czarnas Date: Fri, 11 Oct 2024 18:20:13 +0200 Subject: [PATCH] All anomaly detection rules migrated to use a library function. --- ..._differencing_percentile_moving_average.py | 56 +++++++------------ ...ncing_percentile_moving_average_30_days.py | 56 +++++++------------ .../percentile/anomaly_partition_row_count.py | 29 ++++------ .../anomaly_stationary_count_values.py | 29 ++++------ .../anomaly_stationary_percent_values.py | 31 ++++------ .../percentile/anomaly_timeliness_delay.py | 15 ++--- 6 files changed, 81 insertions(+), 135 deletions(-) diff --git a/home/rules/percentile/anomaly_differencing_percentile_moving_average.py b/home/rules/percentile/anomaly_differencing_percentile_moving_average.py index 8675cdf31d..621259bd98 100644 --- a/home/rules/percentile/anomaly_differencing_percentile_moving_average.py +++ b/home/rules/percentile/anomaly_differencing_percentile_moving_average.py @@ -19,6 +19,7 @@ import numpy as np import scipy import scipy.stats +from lib.anomalies.anomaly_detection import detect_upper_bound_anomaly, detect_lower_bound_anomaly # rule specific parameters object, contains values received from the quality check threshold configuration @@ -109,31 +110,23 @@ def evaluate_rule(rule_parameters: RuleExecutionRunParameters) -> RuleExecutionR # using a 0-based calculation (scale from 0) upper_median_multiples_array = [(difference / differences_median_float - 1.0) for difference in differences_list if difference >= differences_median_float] - upper_multiples = np.array(upper_median_multiples_array, dtype=float) - upper_multiples_median = np.median(upper_multiples) - upper_multiples_std = scipy.stats.tstd(upper_multiples) + threshold_upper_multiple = detect_upper_bound_anomaly(values_above_median=upper_median_multiples_array, + degrees_of_freedom=degrees_of_freedom, tail=tail) - if float(upper_multiples_std) == 0: - threshold_upper = differences_median_float - else: - # Assumption: the historical data follows t-student distribution - upper_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=upper_multiples_median, scale=upper_multiples_std) - threshold_upper_multiple = float(upper_readout_distribution.ppf(1 - tail)) + if threshold_upper_multiple is not None: threshold_upper = (threshold_upper_multiple + 1.0) * differences_median_float + else: + threshold_upper = rule_parameters.actual_value lower_median_multiples_array = [(-1.0 / (difference / differences_median_float)) for difference in differences_list if difference <= differences_median_float if difference != 0] - lower_multiples = np.array(lower_median_multiples_array, dtype=float) - lower_multiples_median = np.median(lower_multiples) - lower_multiples_std = scipy.stats.tstd(lower_multiples) + threshold_lower_multiple = detect_lower_bound_anomaly(values_below_median=lower_median_multiples_array, + degrees_of_freedom=degrees_of_freedom, tail=tail) - if float(lower_multiples_std) == 0: - threshold_lower = differences_median_float - else: - # Assumption: the historical data follows t-student distribution - lower_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=lower_multiples_median, scale=lower_multiples_std) - threshold_lower_multiple = float(lower_readout_distribution.ppf(tail)) + if threshold_lower_multiple is not None: threshold_lower = differences_median_float * (-1.0 / threshold_lower_multiple) + else: + threshold_lower = rule_parameters.actual_value passed = threshold_lower <= actual_difference <= threshold_upper @@ -145,28 +138,21 @@ def evaluate_rule(rule_parameters: RuleExecutionRunParameters) -> RuleExecutionR else: # using unrestricted method for both positive and negative values upper_half_filtered = [difference for difference in differences_list if difference >= differences_median_float] - upper_half = np.array(upper_half_filtered, dtype=float) - upper_half_median = np.median(upper_half) - upper_half_std = scipy.stats.tstd(upper_half) + threshold_upper_result = detect_upper_bound_anomaly(values_above_median=upper_half_filtered, + degrees_of_freedom=degrees_of_freedom, tail=tail) - if float(upper_half_std) == 0: - threshold_upper = differences_median_float + if threshold_upper_result is not None: + threshold_upper = threshold_upper_result else: - # Assumption: the historical data follows t-student distribution - upper_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=upper_half_median, scale=upper_half_std) - threshold_upper = float(upper_readout_distribution.ppf(1 - tail)) + threshold_upper = rule_parameters.actual_value lower_half_list = [difference for difference in differences_list if difference <= differences_median_float] - lower_half = np.array(lower_half_list, dtype=float) - lower_half_median = np.median(lower_half) - lower_half_std = scipy.stats.tstd(lower_half) - - if float(lower_half_std) == 0: - threshold_lower = differences_median_float + threshold_lower_result = detect_lower_bound_anomaly(values_below_median=lower_half_list, + degrees_of_freedom=degrees_of_freedom, tail=tail) + if threshold_lower_result is not None: + threshold_lower = threshold_lower_result else: - # Assumption: the historical data follows t-student distribution - lower_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=lower_half_median, scale=lower_half_std) - threshold_lower = float(lower_readout_distribution.ppf(tail)) + threshold_lower = rule_parameters.actual_value passed = threshold_lower <= actual_difference <= threshold_upper diff --git a/home/rules/percentile/anomaly_differencing_percentile_moving_average_30_days.py b/home/rules/percentile/anomaly_differencing_percentile_moving_average_30_days.py index 9dd6ec8324..a5ca4702b2 100644 --- a/home/rules/percentile/anomaly_differencing_percentile_moving_average_30_days.py +++ b/home/rules/percentile/anomaly_differencing_percentile_moving_average_30_days.py @@ -19,6 +19,7 @@ import numpy as np import scipy import scipy.stats +from lib.anomalies.anomaly_detection import detect_upper_bound_anomaly, detect_lower_bound_anomaly # rule specific parameters object, contains values received from the quality check threshold configuration @@ -109,31 +110,23 @@ def evaluate_rule(rule_parameters: RuleExecutionRunParameters) -> RuleExecutionR # using a 0-based calculation (scale from 0) upper_median_multiples_array = [(difference / differences_median_float - 1.0) for difference in differences_list if difference >= differences_median_float] - upper_multiples = np.array(upper_median_multiples_array, dtype=float) - upper_multiples_median = np.median(upper_multiples) - upper_multiples_std = scipy.stats.tstd(upper_multiples) + threshold_upper_multiple = detect_upper_bound_anomaly(values_above_median=upper_median_multiples_array, + degrees_of_freedom=degrees_of_freedom, tail=tail) - if float(upper_multiples_std) == 0: - threshold_upper = differences_median_float - else: - # Assumption: the historical data follows t-student distribution - upper_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=upper_multiples_median, scale=upper_multiples_std) - threshold_upper_multiple = float(upper_readout_distribution.ppf(1 - tail)) + if threshold_upper_multiple is not None: threshold_upper = (threshold_upper_multiple + 1.0) * differences_median_float + else: + threshold_upper = rule_parameters.actual_value lower_median_multiples_array = [(-1.0 / (difference / differences_median_float)) for difference in differences_list if difference <= differences_median_float if difference != 0] - lower_multiples = np.array(lower_median_multiples_array, dtype=float) - lower_multiples_median = np.median(lower_multiples) - lower_multiples_std = scipy.stats.tstd(lower_multiples) + threshold_lower_multiple = detect_lower_bound_anomaly(values_below_median=lower_median_multiples_array, + degrees_of_freedom=degrees_of_freedom, tail=tail) - if float(lower_multiples_std) == 0: - threshold_lower = differences_median_float - else: - # Assumption: the historical data follows t-student distribution - lower_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=lower_multiples_median, scale=lower_multiples_std) - threshold_lower_multiple = float(lower_readout_distribution.ppf(tail)) + if threshold_lower_multiple is not None: threshold_lower = differences_median_float * (-1.0 / threshold_lower_multiple) + else: + threshold_lower = rule_parameters.actual_value passed = threshold_lower <= actual_difference <= threshold_upper @@ -145,28 +138,21 @@ def evaluate_rule(rule_parameters: RuleExecutionRunParameters) -> RuleExecutionR else: # using unrestricted method for both positive and negative values upper_half_filtered = [difference for difference in differences_list if difference >= differences_median_float] - upper_half = np.array(upper_half_filtered, dtype=float) - upper_half_median = np.median(upper_half) - upper_half_std = scipy.stats.tstd(upper_half) + threshold_upper_result = detect_upper_bound_anomaly(values_above_median=upper_half_filtered, + degrees_of_freedom=degrees_of_freedom, tail=tail) - if float(upper_half_std) == 0: - threshold_upper = differences_median_float + if threshold_upper_result is not None: + threshold_upper = threshold_upper_result else: - # Assumption: the historical data follows t-student distribution - upper_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=upper_half_median, scale=upper_half_std) - threshold_upper = float(upper_readout_distribution.ppf(1 - tail)) + threshold_upper = rule_parameters.actual_value lower_half_list = [difference for difference in differences_list if difference <= differences_median_float] - lower_half = np.array(lower_half_list, dtype=float) - lower_half_median = np.median(lower_half) - lower_half_std = scipy.stats.tstd(lower_half) - - if float(lower_half_std) == 0: - threshold_lower = differences_median_float + threshold_lower_result = detect_lower_bound_anomaly(values_below_median=lower_half_list, + degrees_of_freedom=degrees_of_freedom, tail=tail) + if threshold_lower_result is not None: + threshold_lower = threshold_lower_result else: - # Assumption: the historical data follows t-student distribution - lower_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=lower_half_median, scale=lower_half_std) - threshold_lower = float(lower_readout_distribution.ppf(tail)) + threshold_lower = rule_parameters.actual_value passed = threshold_lower <= actual_difference <= threshold_upper diff --git a/home/rules/percentile/anomaly_partition_row_count.py b/home/rules/percentile/anomaly_partition_row_count.py index f3d157b3bc..3ae2117b88 100644 --- a/home/rules/percentile/anomaly_partition_row_count.py +++ b/home/rules/percentile/anomaly_partition_row_count.py @@ -19,6 +19,7 @@ import numpy as np import scipy import scipy.stats +from lib.anomalies.anomaly_detection import detect_upper_bound_anomaly, detect_lower_bound_anomaly # rule specific parameters object, contains values received from the quality check threshold configuration @@ -103,31 +104,23 @@ def evaluate_rule(rule_parameters: RuleExecutionRunParameters) -> RuleExecutionR upper_median_multiples_array = [(readout / filtered_median_float - 1.0) for readout in extracted if readout >= filtered_median_float] - upper_multiples = np.array(upper_median_multiples_array, dtype=float) - upper_multiples_median = np.median(upper_multiples) - upper_multiples_std = scipy.stats.tstd(upper_multiples) + threshold_upper_multiple = detect_upper_bound_anomaly(values_above_median=upper_median_multiples_array, + degrees_of_freedom=degrees_of_freedom, tail=tail) - if float(upper_multiples_std) == 0: - threshold_upper = filtered_median_float - else: - # Assumption: the historical data follows t-student distribution - upper_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=upper_multiples_median, scale=upper_multiples_std) - threshold_upper_multiple = float(upper_readout_distribution.ppf(1 - tail)) + if threshold_upper_multiple is not None: threshold_upper = (threshold_upper_multiple + 1.0) * filtered_median_float + else: + threshold_upper = rule_parameters.actual_value lower_median_multiples_array = [(-1.0 / (readout / filtered_median_float)) for readout in extracted if readout <= filtered_median_float if readout != 0] - lower_multiples = np.array(lower_median_multiples_array, dtype=float) - lower_multiples_median = np.median(lower_multiples) - lower_multiples_std = scipy.stats.tstd(lower_multiples) + threshold_lower_multiple = detect_lower_bound_anomaly(values_below_median=lower_median_multiples_array, + degrees_of_freedom=degrees_of_freedom, tail=tail) - if float(lower_multiples_std) == 0: - threshold_lower = filtered_median_float - else: - # Assumption: the historical data follows t-student distribution - lower_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=lower_multiples_median, scale=lower_multiples_std) - threshold_lower_multiple = float(lower_readout_distribution.ppf(tail)) + if threshold_lower_multiple is not None: threshold_lower = filtered_median_float * (-1.0 / threshold_lower_multiple) + else: + threshold_lower = rule_parameters.actual_value passed = threshold_lower <= rule_parameters.actual_value <= threshold_upper diff --git a/home/rules/percentile/anomaly_stationary_count_values.py b/home/rules/percentile/anomaly_stationary_count_values.py index 5369bbea6d..9ae1572c1d 100644 --- a/home/rules/percentile/anomaly_stationary_count_values.py +++ b/home/rules/percentile/anomaly_stationary_count_values.py @@ -19,6 +19,7 @@ import numpy as np import scipy import scipy.stats +from lib.anomalies.anomaly_detection import detect_upper_bound_anomaly, detect_lower_bound_anomaly # rule specific parameters object, contains values received from the quality check threshold configuration @@ -105,31 +106,23 @@ def evaluate_rule(rule_parameters: RuleExecutionRunParameters) -> RuleExecutionR tail = rule_parameters.parameters.anomaly_percent / 100.0 upper_median_multiples_array = [(readout / filtered_median_float - 1.0) for readout in extracted if readout >= filtered_median_float] - upper_multiples = np.array(upper_median_multiples_array, dtype=float) - upper_multiples_median = np.median(upper_multiples) - upper_multiples_std = scipy.stats.tstd(upper_multiples) + threshold_upper_multiple = detect_upper_bound_anomaly(values_above_median=upper_median_multiples_array, + degrees_of_freedom=degrees_of_freedom, tail=tail) - if float(upper_multiples_std) == 0: - threshold_upper = filtered_median_float - else: - # Assumption: the historical data follows t-student distribution - upper_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=upper_multiples_median, scale=upper_multiples_std) - threshold_upper_multiple = float(upper_readout_distribution.ppf(1 - tail)) + if threshold_upper_multiple is not None: threshold_upper = (threshold_upper_multiple + 1.0) * filtered_median_float + else: + threshold_upper = rule_parameters.actual_value lower_median_multiples_array = [(-1.0 / (readout / filtered_median_float)) for readout in extracted if readout <= filtered_median_float if readout != 0] - lower_multiples = np.array(lower_median_multiples_array, dtype=float) - lower_multiples_median = np.median(lower_multiples) - lower_multiples_std = scipy.stats.tstd(lower_multiples) + threshold_lower_multiple = detect_lower_bound_anomaly(values_below_median=lower_median_multiples_array, + degrees_of_freedom=degrees_of_freedom, tail=tail) - if float(lower_multiples_std) == 0: - threshold_lower = filtered_median_float - else: - # Assumption: the historical data follows t-student distribution - lower_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=lower_multiples_median, scale=lower_multiples_std) - threshold_lower_multiple = float(lower_readout_distribution.ppf(tail)) + if threshold_lower_multiple is not None: threshold_lower = filtered_median_float * (-1.0 / threshold_lower_multiple) + else: + threshold_lower = rule_parameters.actual_value passed = threshold_lower <= rule_parameters.actual_value <= threshold_upper diff --git a/home/rules/percentile/anomaly_stationary_percent_values.py b/home/rules/percentile/anomaly_stationary_percent_values.py index ad133350b9..a0477d0500 100644 --- a/home/rules/percentile/anomaly_stationary_percent_values.py +++ b/home/rules/percentile/anomaly_stationary_percent_values.py @@ -19,6 +19,7 @@ import numpy as np import scipy import scipy.stats +from lib.anomalies.anomaly_detection import detect_upper_bound_anomaly, detect_lower_bound_anomaly # rule specific parameters object, contains values received from the quality check threshold configuration @@ -111,36 +112,26 @@ def evaluate_rule(rule_parameters: RuleExecutionRunParameters) -> RuleExecutionR else: upper_median_multiples_array = [1.0 / (1.0 - readout / 100.0) for readout in extracted if readout >= filtered_median_float] - upper_multiples = np.array(upper_median_multiples_array, dtype=float) - upper_multiples_median = np.median(upper_multiples) - upper_multiples_std = scipy.stats.tstd(upper_multiples) + threshold_upper_multiple = detect_upper_bound_anomaly(values_above_median=upper_median_multiples_array, + degrees_of_freedom=degrees_of_freedom, tail=tail) - if float(upper_multiples_std) == 0.0: - threshold_upper = filtered_median_float - else: - # Assumption: the historical data follows t-student distribution - upper_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=upper_multiples_median, - scale=upper_multiples_std) - threshold_upper_multiple = float(upper_readout_distribution.ppf(1 - tail)) + if threshold_upper_multiple is not None: threshold_upper = 100.0 - 100.0 * (1.0 / threshold_upper_multiple) + else: + threshold_upper = rule_parameters.actual_value if 0.0 in all_extracted: threshold_lower = 0.0 else: lower_median_multiples_array = [(-1.0 / (readout / filtered_median_float)) for readout in extracted if readout <= filtered_median_float] - lower_multiples = np.array(lower_median_multiples_array, dtype=float) - lower_multiples_median = np.median(lower_multiples) - lower_multiples_std = scipy.stats.tstd(lower_multiples) + threshold_lower_multiple = detect_lower_bound_anomaly(values_below_median=lower_median_multiples_array, + degrees_of_freedom=degrees_of_freedom, tail=tail) - if float(lower_multiples_std) == 0.0: - threshold_lower = filtered_median_float - else: - # Assumption: the historical data follows t-student distribution - lower_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=lower_multiples_median, - scale=lower_multiples_std) - threshold_lower_multiple = float(lower_readout_distribution.ppf(tail)) + if threshold_lower_multiple is not None: threshold_lower = filtered_median_float * (-1.0 / threshold_lower_multiple) + else: + threshold_lower = rule_parameters.actual_value passed = threshold_lower <= rule_parameters.actual_value <= threshold_upper diff --git a/home/rules/percentile/anomaly_timeliness_delay.py b/home/rules/percentile/anomaly_timeliness_delay.py index 4fa66b6179..7eb230b41e 100644 --- a/home/rules/percentile/anomaly_timeliness_delay.py +++ b/home/rules/percentile/anomaly_timeliness_delay.py @@ -19,6 +19,7 @@ import numpy as np import scipy import scipy.stats +from lib.anomalies.anomaly_detection import detect_upper_bound_anomaly, detect_lower_bound_anomaly # rule specific parameters object, contains values received from the quality check threshold configuration @@ -102,17 +103,13 @@ def evaluate_rule(rule_parameters: RuleExecutionRunParameters) -> RuleExecutionR tail = rule_parameters.parameters.anomaly_percent / 100.0 upper_median_multiples_array = [(readout / filtered_median_float - 1.0) for readout in extracted if readout >= filtered_median_float] - upper_multiples = np.array(upper_median_multiples_array, dtype=float) - upper_multiples_median = np.median(upper_multiples) - upper_multiples_std = scipy.stats.tstd(upper_multiples) + threshold_upper_multiple = detect_upper_bound_anomaly(values_above_median=upper_median_multiples_array, + degrees_of_freedom=degrees_of_freedom, tail=tail) - if float(upper_multiples_std) == 0: - threshold_upper = filtered_median_float - else: - # Assumption: the historical data follows t-student distribution - upper_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=upper_multiples_median, scale=upper_multiples_std) - threshold_upper_multiple = float(upper_readout_distribution.ppf(1 - tail)) + if threshold_upper_multiple is not None: threshold_upper = (threshold_upper_multiple + 1.0) * filtered_median_float + else: + threshold_upper = rule_parameters.actual_value threshold_lower = 0.0 # always, our target is to have a delay of 0.0 days