Skip to content

Commit

Permalink
store/tikv: add and fix some metrics for the new retry logic (#25123)
Browse files Browse the repository at this point in the history
  • Loading branch information
sticnarf authored Jun 7, 2021
1 parent 2017d2f commit 840494d
Show file tree
Hide file tree
Showing 11 changed files with 385 additions and 83 deletions.
236 changes: 228 additions & 8 deletions metrics/grafana/tidb.json
Original file line number Diff line number Diff line change
Expand Up @@ -5063,7 +5063,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "${DS_TEST-CLUSTER}",
"description": "This metric shows OPS of async commit transactions.",
"description": "This metric shows the OPS of different types of transactions.",
"editable": true,
"error": false,
"fill": 1,
Expand All @@ -5079,13 +5079,13 @@
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"current": true,
"max": true,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
"values": true
},
"lines": true,
"linewidth": 1,
Expand All @@ -5103,6 +5103,13 @@
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(tidb_tikvclient_commit_txn_counter{tidb_cluster=\"$tidb_cluster\"}[1m])) by (type)",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "2PC-{{type}}",
"refId": "C"
},
{
"expr": "sum(rate(tidb_tikvclient_async_commit_txn_counter{tidb_cluster=\"$tidb_cluster\"}[1m])) by (type)",
"format": "time_series",
Expand All @@ -5111,18 +5118,18 @@
"refId": "A"
},
{
"refId": "B",
"expr": "sum(rate(tidb_tikvclient_one_pc_txn_counter{tidb_cluster=\"$tidb_cluster\"}[1m])) by (type)",
"intervalFactor": 1,
"format": "time_series",
"legendFormat": "1PC-{{type}}"
"intervalFactor": 1,
"legendFormat": "1PC-{{type}}",
"refId": "B"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Async Commit Transaction Counter",
"title": "Transaction Types Per Second",
"tooltip": {
"msResolution": false,
"shared": true,
Expand Down Expand Up @@ -5159,6 +5166,123 @@
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_TEST-CLUSTER}",
"description": "99th percentile of backoff count and duration in a transaction commit",
"editable": true,
"error": false,
"fill": 1,
"fillGradient": 0,
"grid": {},
"gridPos": {
"h": 7,
"w": 8,
"x": 16,
"y": 46
},
"id": 224,
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"sort": "avg",
"sortDesc": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "null as zero",
"options": {
"dataLinks": []
},
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [
{
"alias": "/count.*/",
"yaxis": 1
},
{
"alias": "/duration.*/",
"yaxis": 2
}
],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.99, rate(tidb_tikvclient_txn_commit_backoff_count_bucket{tidb_cluster=\"$tidb_cluster\"}[1m]))",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "count - {{instance}}",
"refId": "A",
"step": 40
},
{
"expr": "histogram_quantile(0.99, rate(tidb_tikvclient_txn_commit_backoff_seconds_bucket{tidb_cluster=\"$tidb_cluster\"}[1m]))",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
"legendFormat": "duration - {{instance}}",
"refId": "B",
"step": 40
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Transaction Commit .99 Backoff",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 2,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "none",
"label": "count",
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "s",
"label": "duration",
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
}
],
"repeat": null,
Expand Down Expand Up @@ -7003,6 +7127,102 @@
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_TEST-CLUSTER}",
"description": "This metric shows the reasons for replica selector failures (each of which needs a backoff).",
"editable": true,
"error": false,
"fill": 1,
"fillGradient": 0,
"grid": {},
"gridPos": {
"h": 7,
"w": 12,
"x": 12,
"y": 21
},
"id": 223,
"legend": {
"alignAsTable": true,
"avg": false,
"current": true,
"max": true,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null as zero",
"options": {
"dataLinks": []
},
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(tidb_tikvclient_replica_selector_failure_counter{tidb_cluster=\"$tidb_cluster\"}[1m])) by (type)",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "{{type}}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Replica Selector Failure Per Second",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
}
],
"repeat": null,
Expand Down
27 changes: 16 additions & 11 deletions store/tikv/2pc.go
Original file line number Diff line number Diff line change
Expand Up @@ -1079,11 +1079,13 @@ func (c *twoPhaseCommitter) execute(ctx context.Context) (err error) {

commitDetail := c.getDetail()
commitDetail.PrewriteTime = time.Since(start)
// TODO(youjiali1995): Record the backoff time of the last finished batch. Aggregating the backoff time of all batches doesn't make sense.
if bo.GetTotalSleep() > 0 {
atomic.AddInt64(&commitDetail.CommitBackoffTime, int64(bo.GetTotalSleep())*int64(time.Millisecond))
boSleep := int64(bo.GetTotalSleep()) * int64(time.Millisecond)
commitDetail.Mu.Lock()
commitDetail.Mu.BackoffTypes = append(commitDetail.Mu.BackoffTypes, bo.GetTypes()...)
if boSleep > commitDetail.Mu.CommitBackoffTime {
commitDetail.Mu.CommitBackoffTime = boSleep
commitDetail.Mu.BackoffTypes = bo.GetTypes()
}
commitDetail.Mu.Unlock()
}

Expand Down Expand Up @@ -1242,8 +1244,8 @@ func (c *twoPhaseCommitter) commitTxn(ctx context.Context, commitDetail *util.Co
err := c.commitMutations(commitBo, c.mutations)
commitDetail.CommitTime = time.Since(start)
if commitBo.GetTotalSleep() > 0 {
atomic.AddInt64(&commitDetail.CommitBackoffTime, int64(commitBo.GetTotalSleep())*int64(time.Millisecond))
commitDetail.Mu.Lock()
commitDetail.Mu.CommitBackoffTime += int64(commitBo.GetTotalSleep()) * int64(time.Millisecond)
commitDetail.Mu.BackoffTypes = append(commitDetail.Mu.BackoffTypes, commitBo.GetTypes()...)
commitDetail.Mu.Unlock()
}
Expand Down Expand Up @@ -1636,17 +1638,20 @@ func (batchExe *batchExecutor) startWorker(exitCh chan struct{}, ch chan error,
singleBatchBackoffer, singleBatchCancel = batchExe.backoffer.Fork()
defer singleBatchCancel()
}
beforeSleep := singleBatchBackoffer.GetTotalSleep()
ch <- batchExe.action.handleSingleBatch(batchExe.committer, singleBatchBackoffer, batch)
commitDetail := batchExe.committer.getDetail()
if commitDetail != nil { // lock operations of pessimistic-txn will let commitDetail be nil
if delta := singleBatchBackoffer.GetTotalSleep() - beforeSleep; delta > 0 {
atomic.AddInt64(&commitDetail.CommitBackoffTime, int64(singleBatchBackoffer.GetTotalSleep()-beforeSleep)*int64(time.Millisecond))
commitDetail.Mu.Lock()
commitDetail.Mu.BackoffTypes = append(commitDetail.Mu.BackoffTypes, singleBatchBackoffer.GetTypes()...)
commitDetail.Mu.Unlock()
// For prewrite, we record the max backoff time
if _, ok := batchExe.action.(actionPrewrite); ok {
commitDetail.Mu.Lock()
boSleep := int64(singleBatchBackoffer.GetTotalSleep()) * int64(time.Millisecond)
if boSleep > commitDetail.Mu.CommitBackoffTime {
commitDetail.Mu.CommitBackoffTime = boSleep
commitDetail.Mu.BackoffTypes = singleBatchBackoffer.GetTypes()
}
commitDetail.Mu.Unlock()
}
// Backoff time in the 2nd phase of a non-async-commit txn is added
// in the commitTxn method, so we don't add it here.
}()
} else {
logutil.Logger(batchExe.backoffer.GetCtx()).Info("break startWorker",
Expand Down
Loading

0 comments on commit 840494d

Please sign in to comment.