Skip to content
This repository has been archived by the owner on Mar 13, 2023. It is now read-only.

Commit

Permalink
Substrate v2.0.0 (#323)
Browse files Browse the repository at this point in the history
* bump: dependencies

* companion: for #6789

* companion: for #7038

* companion: for #6961

* companion: for #6984

* companion: for #7007

* companion: for #7103

* companion: for #6825

* companion: for #7128

* companion: for #7131

* companion: for #7138

* bump: dependencies

* companion: for #7148

* companion: for #7180

* companion: for #7232

* companion: for #7237

* fix: compile

* update: format

* companion: for #7058

* companion: for #7136

* companion: for #7155

* update: format

* companion: for #7161

* companion: for #7176

* bump: dependencies

* companion: for #6685

* companion: for #7111

* companion: for #7214

* companion: for #7215

* companion: for #5715

* companion: for #6685 part2

* update: runtime

* update: darwinia-cli

* companion: for #7039

* companion: for #6215

* companion: for #6948

* rename

* todo: new service with #6999

* bump: dependencies
  • Loading branch information
AurevoirXavier authored Oct 14, 2020
1 parent fc3c3f3 commit 4e3ecee
Show file tree
Hide file tree
Showing 82 changed files with 8,388 additions and 3,499 deletions.
239 changes: 239 additions & 0 deletions .maintain/monitoring/alerting-rules/alerting-rule-tests.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,239 @@
# promtool rule unit-test file. No rule file is named by path: rule_files
# points at /dev/stdin, so the rules under test are expected on standard input.
rule_files:
- /dev/stdin

# How often the rules are evaluated while the test runs.
evaluation_interval: 1m

tests:
# Spacing between consecutive samples of every input series below.
- interval: 1m
input_series:
# Peer count: starts at 3, drops to 2, then to 1 (expansion shown inline).
- series: 'darwinia_sub_libp2p_peers_count{
job="darwinia",
pod="darwinia-abcdef01234-abcdef",
instance="darwinia-abcdef01234-abcdef",
}'
values: '3 2+0x4 1+0x9' # 3 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1

# Scheduled validations: +1 per sample, then +2 per sample, then 10043.
# NOTE(review): `10043x5` presumably repeats the value like `10043+0x5`;
# confirm against promtool's expanding notation.
- series: 'darwinia_sub_txpool_validations_scheduled{
job="darwinia",
pod="darwinia-abcdef01234-abcdef",
instance="darwinia-abcdef01234-abcdef",
}'
values: '11+1x10 22+2x30 10043x5'

# Finished validations: +1 per sample up to 42, then 42 for the rest
# (same shorthand assumption for `42x5` as above).
- series: 'darwinia_sub_txpool_validations_finished{
job="darwinia",
pod="darwinia-abcdef01234-abcdef",
instance="darwinia-abcdef01234-abcdef",
}'
values: '0+1x42 42x5'

# Best block height: climbs from 1 to 4, then stays at 4.
- series: 'darwinia_block_height{
status="best", job="darwinia",
pod="darwinia-abcdef01234-abcdef",
instance="darwinia-abcdef01234-abcdef",
}'
values: '1+1x3 4+0x13' # 1 2 3 4 4 4 4 4 4 4 4 4 ...

# Finalized block height: same shape as the best height above.
- series: 'darwinia_block_height{
status="finalized",
job="darwinia",
pod="darwinia-abcdef01234-abcdef",
instance="darwinia-abcdef01234-abcdef",
}'
values: '1+1x3 4+0x13' # 1 2 3 4 4 4 4 4 4 4 4 4 ...

alert_rule_test:

######################################################################
# Block production
######################################################################

# 6m: no BlockProductionSlow alert is expected to be firing yet.
- alertname: BlockProductionSlow
  eval_time: 6m
  exp_alerts: []

# 7m: only the warning tier ("more than 3 minutes") is expected.
- alertname: BlockProductionSlow
  eval_time: 7m
  exp_alerts:
    - exp_labels:
        severity: warning
        status: best
        job: darwinia
        pod: darwinia-abcdef01234-abcdef
        instance: darwinia-abcdef01234-abcdef
      exp_annotations:
        message: >-
          Best block on instance
          darwinia-abcdef01234-abcdef increases by less than 1 per
          minute for more than 3 minutes.

# 14m: the warning tier and the critical tier ("more than 10 minutes") are
# both expected.
- alertname: BlockProductionSlow
  eval_time: 14m
  exp_alerts:
    - exp_labels:
        severity: warning
        status: best
        job: darwinia
        pod: darwinia-abcdef01234-abcdef
        instance: darwinia-abcdef01234-abcdef
      exp_annotations:
        message: >-
          Best block on instance
          darwinia-abcdef01234-abcdef increases by less than 1 per
          minute for more than 3 minutes.
    - exp_labels:
        severity: critical
        status: best
        job: darwinia
        pod: darwinia-abcdef01234-abcdef
        instance: darwinia-abcdef01234-abcdef
      exp_annotations:
        message: >-
          Best block on instance
          darwinia-abcdef01234-abcdef increases by less than 1 per
          minute for more than 10 minutes.

######################################################################
# Block finalization
######################################################################

# 6m: no BlockFinalizationSlow alert is expected to be firing yet.
- alertname: BlockFinalizationSlow
  eval_time: 6m
  exp_alerts: []

# 7m: only the warning tier ("more than 3 minutes") is expected.
- alertname: BlockFinalizationSlow
  eval_time: 7m
  exp_alerts:
    - exp_labels:
        severity: warning
        status: finalized
        job: darwinia
        pod: darwinia-abcdef01234-abcdef
        instance: darwinia-abcdef01234-abcdef
      exp_annotations:
        message: >-
          Finalized block on instance
          darwinia-abcdef01234-abcdef increases by less than 1 per
          minute for more than 3 minutes.

# 14m: the warning tier and the critical tier ("more than 10 minutes") are
# both expected.
- alertname: BlockFinalizationSlow
  eval_time: 14m
  exp_alerts:
    - exp_labels:
        severity: warning
        status: finalized
        job: darwinia
        pod: darwinia-abcdef01234-abcdef
        instance: darwinia-abcdef01234-abcdef
      exp_annotations:
        message: >-
          Finalized block on instance
          darwinia-abcdef01234-abcdef increases by less than 1 per
          minute for more than 3 minutes.
    - exp_labels:
        severity: critical
        status: finalized
        job: darwinia
        pod: darwinia-abcdef01234-abcdef
        instance: darwinia-abcdef01234-abcdef
      exp_annotations:
        message: >-
          Finalized block on instance
          darwinia-abcdef01234-abcdef increases by less than 1 per
          minute for more than 10 minutes.

######################################################################
# Transaction queue
######################################################################

# During the first 10 minutes the scheduled and finished validation counts
# both grow by 1 per minute, so the queue does not grow and no alert is
# expected.
- alertname: TransactionQueueSizeIncreasing
  eval_time: 11m
  exp_alerts: []

# After minute 10 the scheduled count grows twice as fast as the finished
# count, so the warning tier is expected once 20 minutes have passed.
- alertname: TransactionQueueSizeIncreasing
  eval_time: 22m
  exp_alerts:
    - exp_labels:
        severity: warning
        job: darwinia
        pod: darwinia-abcdef01234-abcdef
        instance: darwinia-abcdef01234-abcdef
      exp_annotations:
        message: >-
          The transaction pool size on node
          darwinia-abcdef01234-abcdef has been monotonically
          increasing for more than 10 minutes.

# The same imbalance continues, so after 40 minutes both the warning and
# the critical tier are expected.
- alertname: TransactionQueueSizeIncreasing
  eval_time: 43m
  exp_alerts:
    - exp_labels:
        severity: warning
        job: darwinia
        pod: darwinia-abcdef01234-abcdef
        instance: darwinia-abcdef01234-abcdef
      exp_annotations:
        message: >-
          The transaction pool size on node
          darwinia-abcdef01234-abcdef has been monotonically
          increasing for more than 10 minutes.
    - exp_labels:
        severity: critical
        job: darwinia
        pod: darwinia-abcdef01234-abcdef
        instance: darwinia-abcdef01234-abcdef
      exp_annotations:
        message: >-
          The transaction pool size on node
          darwinia-abcdef01234-abcdef has been monotonically
          increasing for more than 30 minutes.
# Towards the end of the input series the scheduled count jumps to 10043
# while the finished count stays at 42, so the critical alert is expected.
- alertname: TransactionQueueSizeHigh
  eval_time: 49m
  exp_alerts:
    - exp_labels:
        severity: critical
        job: darwinia
        pod: darwinia-abcdef01234-abcdef
        instance: darwinia-abcdef01234-abcdef
      exp_annotations:
        message: >-
          The transaction pool size on node
          darwinia-abcdef01234-abcdef has been above 10_000 for more
          than 5 minutes.

######################################################################
# Networking
######################################################################

# 3m: no NumberOfPeersLow alert is expected to be firing yet.
- alertname: NumberOfPeersLow
  eval_time: 3m  # Values: 3 2 2
  exp_alerts: []

# 4m: only the warning tier ("more than 3 minutes") is expected.
- alertname: NumberOfPeersLow
  eval_time: 4m  # Values: 2 2 2
  exp_alerts:
    - exp_labels:
        severity: warning
        job: darwinia
        pod: darwinia-abcdef01234-abcdef
        instance: darwinia-abcdef01234-abcdef
      exp_annotations:
        message: >-
          The node darwinia-abcdef01234-abcdef has less
          than 3 peers for more than 3 minutes

# 16m: the warning tier and the critical tier ("more than 15 minutes") are
# both expected.
- alertname: NumberOfPeersLow
  eval_time: 16m  # Values: 3 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1
  exp_alerts:
    - exp_labels:
        severity: warning
        job: darwinia
        pod: darwinia-abcdef01234-abcdef
        instance: darwinia-abcdef01234-abcdef
      exp_annotations:
        message: >-
          The node darwinia-abcdef01234-abcdef has less
          than 3 peers for more than 3 minutes
    - exp_labels:
        severity: critical
        job: darwinia
        pod: darwinia-abcdef01234-abcdef
        instance: darwinia-abcdef01234-abcdef
      exp_annotations:
        message: >-
          The node darwinia-abcdef01234-abcdef has less
          than 3 peers for more than 15 minutes
139 changes: 139 additions & 0 deletions .maintain/monitoring/alerting-rules/alerting-rules.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
groups:
- name: darwinia.rules
rules:

##############################################################################
# Block production
##############################################################################

# Warning tier: the "best" block height grew by less than 1 over the
# trailing minute, and that condition held for 3 minutes.
- alert: BlockProductionSlow
  expr: 'increase(darwinia_block_height{status="best"}[1m]) < 1'
  for: 3m
  labels:
    severity: warning
  annotations:
    message: >-
      Best block on instance {{ $labels.instance }} increases by
      less than 1 per minute for more than 3 minutes.

# Critical tier: same expression, held for 10 minutes.
- alert: BlockProductionSlow
  expr: 'increase(darwinia_block_height{status="best"}[1m]) < 1'
  for: 10m
  labels:
    severity: critical
  annotations:
    message: >-
      Best block on instance {{ $labels.instance }} increases by
      less than 1 per minute for more than 10 minutes.

##############################################################################
# Block finalization
##############################################################################

# Warning tier: the "finalized" block height grew by less than 1 over the
# trailing minute, and that condition held for 3 minutes.
- alert: BlockFinalizationSlow
  expr: 'increase(darwinia_block_height{status="finalized"}[1m]) < 1'
  for: 3m
  labels:
    severity: warning
  annotations:
    message: >-
      Finalized block on instance {{ $labels.instance }} increases by
      less than 1 per minute for more than 3 minutes.

# Critical tier: same expression, held for 10 minutes.
- alert: BlockFinalizationSlow
  expr: 'increase(darwinia_block_height{status="finalized"}[1m]) < 1'
  for: 10m
  labels:
    severity: critical
  annotations:
    message: >-
      Finalized block on instance {{ $labels.instance }} increases by
      less than 1 per minute for more than 10 minutes.
# Fires when the "best" head stays more than 10 blocks ahead of the
# "finalized" head for 8 minutes.
#
# Under the assumption of an average block production of 6 seconds,
# "best" and "finalized" being more than 10 blocks apart would imply
# more than a 1 minute delay between block production and finalization.
#
# The metric is darwinia_block_height, the same series that the
# BlockProductionSlow and BlockFinalizationSlow rules in this group read.
# `ignoring(status)` lets the two series pair up even though their `status`
# label values differ.
- alert: BlockFinalizationLaggingBehind
  expr: >-
    (darwinia_block_height{status="best"} - ignoring(status)
    darwinia_block_height{status="finalized"}) > 10
  for: 8m
  labels:
    severity: critical
  annotations:
    message: >-
      Block finalization on instance {{ $labels.instance }} is behind
      block production by {{ $value }} for more than 8 minutes.

##############################################################################
# Transaction queue
##############################################################################

# Warning tier: over a 5 minute window more validations were scheduled than
# finished, and that condition held for 10 minutes.
- alert: TransactionQueueSizeIncreasing
  expr: >-
    increase(darwinia_sub_txpool_validations_scheduled[5m]) -
    increase(darwinia_sub_txpool_validations_finished[5m]) > 0
  for: 10m
  labels:
    severity: warning
  annotations:
    message: >-
      The transaction pool size on node {{ $labels.instance }} has
      been monotonically increasing for more than 10 minutes.

# Critical tier: same expression, held for 30 minutes.
- alert: TransactionQueueSizeIncreasing
  expr: >-
    increase(darwinia_sub_txpool_validations_scheduled[5m]) -
    increase(darwinia_sub_txpool_validations_finished[5m]) > 0
  for: 30m
  labels:
    severity: critical
  annotations:
    message: >-
      The transaction pool size on node {{ $labels.instance }} has
      been monotonically increasing for more than 30 minutes.
# Critical: scheduled minus finished validations stays above 10000 for
# 5 minutes.
- alert: TransactionQueueSizeHigh
  expr: >-
    darwinia_sub_txpool_validations_scheduled -
    darwinia_sub_txpool_validations_finished > 10000
  for: 5m
  labels:
    severity: critical
  annotations:
    message: >-
      The transaction pool size on node {{ $labels.instance }} has
      been above 10_000 for more than 5 minutes.

##############################################################################
# Networking
##############################################################################

# Warning tier: the libp2p peer count stays below 3 for 3 minutes.
- alert: NumberOfPeersLow
  expr: 'darwinia_sub_libp2p_peers_count < 3'
  for: 3m
  labels:
    severity: warning
  annotations:
    message: >-
      The node {{ $labels.instance }} has less than 3 peers for more
      than 3 minutes

# Critical tier: same expression, held for 15 minutes.
- alert: NumberOfPeersLow
  expr: 'darwinia_sub_libp2p_peers_count < 3'
  for: 15m
  labels:
    severity: critical
  annotations:
    message: >-
      The node {{ $labels.instance }} has less than 3 peers for more
      than 15 minutes

##############################################################################
# System
##############################################################################

# Warning: node_filefd_allocated for series whose `domain` label matches
# crab or darwinia stays above 10000 for 3 minutes.
- alert: NumberOfFileDescriptorsHigh
  expr: >-
    node_filefd_allocated{domain=~"crab|darwinia"} > 10000
  for: 3m
  labels:
    severity: warning
  annotations:
    message: >-
      The node {{ $labels.instance }} has more than 10_000 file
      descriptors allocated for more than 3 minutes

##############################################################################
# Others
##############################################################################

# Warning: the ratio of value-found handling failures to received
# "value_found" DHT events stays above 0.5 for 2 hours. `ignoring(name)`
# lets the two series match although only the denominator has a `name` label
# selector.
- alert: AuthorityDiscoveryDiscoveryFailureHigh
  expr: >-
    darwinia_authority_discovery_handle_value_found_event_failure /
    ignoring(name)
    darwinia_authority_discovery_dht_event_received{name="value_found"} > 0.5
  for: 2h
  labels:
    severity: warning
  annotations:
    message: >-
      Authority discovery on node {{ $labels.instance }} fails to
      process more than 50 % of the values found on the DHT for more than 2
      hours.
Loading

0 comments on commit 4e3ecee

Please sign in to comment.