This repository has been archived by the owner on Mar 13, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* bump: dependencies * companion: for #6789 * companion: for #7038 * companion: for #6961 * companion: for #6984 * companion: for #7007 * companion: for #7103 * companion: for #6825 * companion: for #7128 * companion: for #7131 * companion: for #7138 * bump: dependencies * companion: for #7148 * companion: for #7180 * companion: for #7232 * companion: for #7237 * fix: compile * update: format * companion: for #7058 * companion: for #7136 * companion: for #7155 * update: format * companion: for #7161 * companion: for #7176 * bump: dependencies * companion: for #6685 * companion: for #7111 * companion: for #7214 * companion: for #7215 * companion: for #5715 * companion: for #6685 part2 * update: runtime * update: darwinia-cli * companion: for #7039 * companion: for #6215 * companion: for #6948 * rename * todo: new service with #6999 * bump: dependencies
- Loading branch information
1 parent
fc3c3f3
commit 4e3ecee
Showing
82 changed files
with
8,388 additions
and
3,499 deletions.
There are no files selected for viewing
239 changes: 239 additions & 0 deletions
239
.maintain/monitoring/alerting-rules/alerting-rule-tests.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,239 @@ | ||
# Header of the alerting-rule test file: the rules under test are read from | ||
# standard input (/dev/stdin) rather than from a fixed path. | ||
rule_files: | ||
- /dev/stdin | ||
|
||
# Evaluation interval used for the rules under test: one minute. | ||
evaluation_interval: 1m | ||
|
||
# One test group. `interval: 1m` is presumably the spacing between consecutive | ||
# samples of the synthetic input series below - confirm against the test runner. | ||
tests: | ||
- interval: 1m | ||
input_series: | ||
# Peer count: starts at 3, then 2, then 1 (expansion in the trailing comment). | ||
- series: 'darwinia_sub_libp2p_peers_count{ | ||
job="darwinia", | ||
pod="darwinia-abcdef01234-abcdef", | ||
instance="darwinia-abcdef01234-abcdef", | ||
}' | ||
values: '3 2+0x4 1+0x9' # 3 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 | ||
|
||
# Scheduled transaction validations: a +1 segment, a +2 segment, then 10043. | ||
- series: 'darwinia_sub_txpool_validations_scheduled{ | ||
job="darwinia", | ||
pod="darwinia-abcdef01234-abcdef", | ||
instance="darwinia-abcdef01234-abcdef", | ||
}' | ||
values: '11+1x10 22+2x30 10043x5' | ||
|
||
# Finished transaction validations: a +1 segment from 0, then 42. | ||
# NOTE(review): `42x5` is presumably shorthand for `42+0x5` - confirm. | ||
- series: 'darwinia_sub_txpool_validations_finished{ | ||
job="darwinia", | ||
pod="darwinia-abcdef01234-abcdef", | ||
instance="darwinia-abcdef01234-abcdef", | ||
}' | ||
values: '0+1x42 42x5' | ||
|
||
# Best block height: increases up to 4 and then stays at 4. | ||
- series: 'darwinia_block_height{ | ||
status="best", job="darwinia", | ||
pod="darwinia-abcdef01234-abcdef", | ||
instance="darwinia-abcdef01234-abcdef", | ||
}' | ||
values: '1+1x3 4+0x13' # 1 2 3 4 4 4 4 4 4 4 4 4 ... | ||
|
||
# Finalized block height: same values as the best block series above. | ||
- series: 'darwinia_block_height{ | ||
status="finalized", | ||
job="darwinia", | ||
pod="darwinia-abcdef01234-abcdef", | ||
instance="darwinia-abcdef01234-abcdef", | ||
}' | ||
values: '1+1x3 4+0x13' # 1 2 3 4 4 4 4 4 4 4 4 4 ... | ||
|
||
# Expected alerts, checked at specific evaluation times against the input series. | ||
alert_rule_test: | ||
|
||
###################################################################### | ||
# Block production | ||
###################################################################### | ||
|
||
# At 6m exp_alerts is left empty, i.e. no BlockProductionSlow alert is expected. | ||
- eval_time: 6m | ||
alertname: BlockProductionSlow | ||
exp_alerts: | ||
# At 7m only the warning-severity alert is expected. | ||
- eval_time: 7m | ||
alertname: BlockProductionSlow | ||
exp_alerts: | ||
- exp_labels: | ||
severity: warning | ||
pod: darwinia-abcdef01234-abcdef | ||
instance: darwinia-abcdef01234-abcdef | ||
job: darwinia | ||
status: best | ||
exp_annotations: | ||
message: "Best block on instance | ||
darwinia-abcdef01234-abcdef increases by less than 1 per | ||
minute for more than 3 minutes." | ||
|
||
# At 14m both the warning and the critical alert are expected. | ||
- eval_time: 14m | ||
alertname: BlockProductionSlow | ||
exp_alerts: | ||
- exp_labels: | ||
severity: warning | ||
pod: darwinia-abcdef01234-abcdef | ||
instance: darwinia-abcdef01234-abcdef | ||
job: darwinia | ||
status: best | ||
exp_annotations: | ||
message: "Best block on instance | ||
darwinia-abcdef01234-abcdef increases by less than 1 per | ||
minute for more than 3 minutes." | ||
- exp_labels: | ||
severity: critical | ||
pod: darwinia-abcdef01234-abcdef | ||
instance: darwinia-abcdef01234-abcdef | ||
job: darwinia | ||
status: best | ||
exp_annotations: | ||
message: "Best block on instance | ||
darwinia-abcdef01234-abcdef increases by less than 1 per | ||
minute for more than 10 minutes." | ||
|
||
###################################################################### | ||
# Block finalization | ||
###################################################################### | ||
|
||
# Same schedule as the block production cases, but for the series with | ||
# status="finalized": nothing at 6m, warning at 7m, warning + critical at 14m. | ||
- eval_time: 6m | ||
alertname: BlockFinalizationSlow | ||
exp_alerts: | ||
# At 7m only the warning-severity alert is expected. | ||
- eval_time: 7m | ||
alertname: BlockFinalizationSlow | ||
exp_alerts: | ||
- exp_labels: | ||
severity: warning | ||
pod: darwinia-abcdef01234-abcdef | ||
instance: darwinia-abcdef01234-abcdef | ||
job: darwinia | ||
status: finalized | ||
exp_annotations: | ||
message: "Finalized block on instance | ||
darwinia-abcdef01234-abcdef increases by less than 1 per | ||
minute for more than 3 minutes." | ||
|
||
# At 14m both the warning and the critical alert are expected. | ||
- eval_time: 14m | ||
alertname: BlockFinalizationSlow | ||
exp_alerts: | ||
- exp_labels: | ||
severity: warning | ||
pod: darwinia-abcdef01234-abcdef | ||
instance: darwinia-abcdef01234-abcdef | ||
job: darwinia | ||
status: finalized | ||
exp_annotations: | ||
message: "Finalized block on instance | ||
darwinia-abcdef01234-abcdef increases by less than 1 per | ||
minute for more than 3 minutes." | ||
- exp_labels: | ||
severity: critical | ||
pod: darwinia-abcdef01234-abcdef | ||
instance: darwinia-abcdef01234-abcdef | ||
job: darwinia | ||
status: finalized | ||
exp_annotations: | ||
message: "Finalized block on instance | ||
darwinia-abcdef01234-abcdef increases by less than 1 per | ||
minute for more than 10 minutes." | ||
|
||
###################################################################### | ||
# Transaction queue | ||
###################################################################### | ||
|
||
# TransactionQueueSizeIncreasing is checked at 11m, 22m and 43m; | ||
# TransactionQueueSizeHigh is checked at 49m. | ||
- eval_time: 11m | ||
alertname: TransactionQueueSizeIncreasing | ||
# Number of validations scheduled and finished both grow at a rate | ||
# of 1 in the first 10 minutes, thereby the queue is not increasing | ||
# in size, thus don't expect an alert. | ||
exp_alerts: | ||
- eval_time: 22m | ||
alertname: TransactionQueueSizeIncreasing | ||
# Number of validations scheduled is growing twice as fast as the | ||
# number of validations finished after minute 10. Thus expect | ||
# warning alert after 20 minutes. | ||
exp_alerts: | ||
- exp_labels: | ||
severity: warning | ||
pod: darwinia-abcdef01234-abcdef | ||
instance: darwinia-abcdef01234-abcdef | ||
job: darwinia | ||
exp_annotations: | ||
message: "The transaction pool size on node | ||
darwinia-abcdef01234-abcdef has been monotonically | ||
increasing for more than 10 minutes." | ||
- eval_time: 43m | ||
alertname: TransactionQueueSizeIncreasing | ||
# Number of validations scheduled is growing twice as fast as the | ||
# number of validations finished after minute 10. Thus expect | ||
# both warning and critical alert after 40 minutes. | ||
exp_alerts: | ||
- exp_labels: | ||
severity: warning | ||
pod: darwinia-abcdef01234-abcdef | ||
instance: darwinia-abcdef01234-abcdef | ||
job: darwinia | ||
exp_annotations: | ||
message: "The transaction pool size on node | ||
darwinia-abcdef01234-abcdef has been monotonically | ||
increasing for more than 10 minutes." | ||
- exp_labels: | ||
severity: critical | ||
pod: darwinia-abcdef01234-abcdef | ||
instance: darwinia-abcdef01234-abcdef | ||
job: darwinia | ||
exp_annotations: | ||
message: "The transaction pool size on node | ||
darwinia-abcdef01234-abcdef has been monotonically | ||
increasing for more than 30 minutes." | ||
- eval_time: 49m | ||
alertname: TransactionQueueSizeHigh | ||
# After minute 43 the number of validations scheduled jumps up | ||
# drastically while the number of validations finished stays the | ||
# same. Thus expect an alert. | ||
exp_alerts: | ||
- exp_labels: | ||
severity: critical | ||
pod: darwinia-abcdef01234-abcdef | ||
instance: darwinia-abcdef01234-abcdef | ||
job: darwinia | ||
exp_annotations: | ||
message: "The transaction pool size on node | ||
darwinia-abcdef01234-abcdef has been above 10_000 for more | ||
than 5 minutes." | ||
|
||
###################################################################### | ||
# Networking | ||
###################################################################### | ||
|
||
# At 3m exp_alerts is left empty, i.e. no NumberOfPeersLow alert is expected. | ||
- eval_time: 3m # Values: 3 2 2 | ||
alertname: NumberOfPeersLow | ||
exp_alerts: | ||
# At 4m only the warning-severity alert is expected. | ||
- eval_time: 4m # Values: 2 2 2 | ||
alertname: NumberOfPeersLow | ||
exp_alerts: | ||
- exp_labels: | ||
severity: warning | ||
pod: darwinia-abcdef01234-abcdef | ||
instance: darwinia-abcdef01234-abcdef | ||
job: darwinia | ||
exp_annotations: | ||
message: "The node darwinia-abcdef01234-abcdef has less | ||
than 3 peers for more than 3 minutes" | ||
|
||
# At 16m both the warning and the critical alert are expected. | ||
- eval_time: 16m # Values: 3 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 | ||
alertname: NumberOfPeersLow | ||
exp_alerts: | ||
- exp_labels: | ||
severity: warning | ||
pod: darwinia-abcdef01234-abcdef | ||
instance: darwinia-abcdef01234-abcdef | ||
job: darwinia | ||
exp_annotations: | ||
message: "The node darwinia-abcdef01234-abcdef has less | ||
than 3 peers for more than 3 minutes" | ||
- exp_labels: | ||
severity: critical | ||
pod: darwinia-abcdef01234-abcdef | ||
instance: darwinia-abcdef01234-abcdef | ||
job: darwinia | ||
exp_annotations: | ||
message: "The node darwinia-abcdef01234-abcdef has less | ||
than 3 peers for more than 15 minutes" |
139 changes: 139 additions & 0 deletions
139
.maintain/monitoring/alerting-rules/alerting-rules.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,139 @@ | ||
# Rule group `darwinia.rules`; the alerting rules follow under `rules`. | ||
groups: | ||
- name: darwinia.rules | ||
rules: | ||
|
||
############################################################################## | ||
# Block production | ||
############################################################################## | ||
|
||
# Warning: the best block height increased by less than 1 over a 1m window, | ||
# and that condition held for 3m. | ||
- alert: BlockProductionSlow | ||
annotations: | ||
message: 'Best block on instance {{ $labels.instance }} increases by | ||
less than 1 per minute for more than 3 minutes.' | ||
expr: increase(darwinia_block_height{status="best"}[1m]) < 1 | ||
for: 3m | ||
labels: | ||
severity: warning | ||
# Critical: same expression as the warning rule, but held for 10m. | ||
- alert: BlockProductionSlow | ||
annotations: | ||
message: 'Best block on instance {{ $labels.instance }} increases by | ||
less than 1 per minute for more than 10 minutes.' | ||
expr: increase(darwinia_block_height{status="best"}[1m]) < 1 | ||
for: 10m | ||
labels: | ||
severity: critical | ||
|
||
############################################################################## | ||
# Block finalization | ||
############################################################################## | ||
|
||
# Warning: the finalized block height increased by less than 1 over a 1m | ||
# window, and that condition held for 3m. | ||
- alert: BlockFinalizationSlow | ||
expr: increase(darwinia_block_height{status="finalized"}[1m]) < 1 | ||
for: 3m | ||
labels: | ||
severity: warning | ||
annotations: | ||
message: 'Finalized block on instance {{ $labels.instance }} increases by | ||
less than 1 per minute for more than 3 minutes.' | ||
# Critical: same expression as the warning rule, but held for 10m. | ||
- alert: BlockFinalizationSlow | ||
expr: increase(darwinia_block_height{status="finalized"}[1m]) < 1 | ||
for: 10m | ||
labels: | ||
severity: critical | ||
annotations: | ||
message: 'Finalized block on instance {{ $labels.instance }} increases by | ||
less than 1 per minute for more than 10 minutes.' | ||
# Critical: the gap between the best and the finalized block height stayed | ||
# above 10 blocks for 8m. | ||
- alert: BlockFinalizationLaggingBehind | ||
# Under the assumption of an average block production of 6 seconds, | ||
# "best" and "finalized" being more than 10 blocks apart would imply | ||
# more than a 1 minute delay between block production and finalization. | ||
# | ||
# Queries darwinia_block_height, the same metric used by the | ||
# BlockProductionSlow and BlockFinalizationSlow rules; a `_number`-suffixed | ||
# series is not referenced anywhere else, so the rule could never match. | ||
# `ignoring(status)` lets the two series match despite differing status labels. | ||
expr: '(darwinia_block_height{status="best"} - ignoring(status) | ||
darwinia_block_height{status="finalized"}) > 10' | ||
for: 8m | ||
labels: | ||
severity: critical | ||
annotations: | ||
message: "Block finalization on instance {{ $labels.instance }} is behind | ||
block production by {{ $value }} for more than 8 minutes." | ||
|
||
############################################################################## | ||
# Transaction queue | ||
############################################################################## | ||
|
||
# Warning: over a 5m window more validations were scheduled than finished, | ||
# and that condition held for 10m. | ||
- alert: TransactionQueueSizeIncreasing | ||
expr: 'increase(darwinia_sub_txpool_validations_scheduled[5m]) - | ||
increase(darwinia_sub_txpool_validations_finished[5m]) > 0' | ||
for: 10m | ||
labels: | ||
severity: warning | ||
annotations: | ||
message: 'The transaction pool size on node {{ $labels.instance }} has | ||
been monotonically increasing for more than 10 minutes.' | ||
# Critical: same expression as the warning rule, but held for 30m. | ||
- alert: TransactionQueueSizeIncreasing | ||
expr: 'increase(darwinia_sub_txpool_validations_scheduled[5m]) - | ||
increase(darwinia_sub_txpool_validations_finished[5m]) > 0' | ||
for: 30m | ||
labels: | ||
severity: critical | ||
annotations: | ||
message: 'The transaction pool size on node {{ $labels.instance }} has | ||
been monotonically increasing for more than 30 minutes.' | ||
# Critical: scheduled minus finished validations exceeded 10000 for 5m. | ||
- alert: TransactionQueueSizeHigh | ||
expr: 'darwinia_sub_txpool_validations_scheduled - | ||
darwinia_sub_txpool_validations_finished > 10000' | ||
for: 5m | ||
labels: | ||
severity: critical | ||
annotations: | ||
message: 'The transaction pool size on node {{ $labels.instance }} has | ||
been above 10_000 for more than 5 minutes.' | ||
|
||
############################################################################## | ||
# Networking | ||
############################################################################## | ||
|
||
# Warning: the peer count stayed below 3 for 3m. | ||
- alert: NumberOfPeersLow | ||
expr: darwinia_sub_libp2p_peers_count < 3 | ||
for: 3m | ||
labels: | ||
severity: warning | ||
annotations: | ||
message: 'The node {{ $labels.instance }} has less than 3 peers for more | ||
than 3 minutes' | ||
# Critical: same expression as the warning rule, but held for 15m. | ||
- alert: NumberOfPeersLow | ||
expr: darwinia_sub_libp2p_peers_count < 3 | ||
for: 15m | ||
labels: | ||
severity: critical | ||
annotations: | ||
message: 'The node {{ $labels.instance }} has less than 3 peers for more | ||
than 15 minutes' | ||
|
||
############################################################################## | ||
# System | ||
############################################################################## | ||
|
||
# Warning: node_filefd_allocated, restricted to series whose `domain` label | ||
# matches crab or darwinia, stayed above 10000 for 3m. | ||
- alert: NumberOfFileDescriptorsHigh | ||
expr: 'node_filefd_allocated{domain=~"crab|darwinia"} > 10000' | ||
for: 3m | ||
labels: | ||
severity: warning | ||
annotations: | ||
message: 'The node {{ $labels.instance }} has more than 10_000 file | ||
descriptors allocated for more than 3 minutes' | ||
|
||
############################################################################## | ||
# Others | ||
############################################################################## | ||
|
||
# Warning: the ratio of value-found handling failures to received | ||
# "value_found" DHT events stayed above 0.5 for 2h. `ignoring(name)` lets the | ||
# two series match even though only the denominator carries a `name` label. | ||
- alert: AuthorityDiscoveryDiscoveryFailureHigh | ||
expr: 'darwinia_authority_discovery_handle_value_found_event_failure / | ||
ignoring(name) | ||
darwinia_authority_discovery_dht_event_received{name="value_found"} > 0.5' | ||
for: 2h | ||
labels: | ||
severity: warning | ||
annotations: | ||
message: 'Authority discovery on node {{ $labels.instance }} fails to | ||
process more than 50 % of the values found on the DHT for more than 2 | ||
hours.' |
Oops, something went wrong.