Merge #86415
86415: insights: identify statements with high retry counts r=matthewtodd a=matthewtodd

Addresses #85827.

This change undoes the previous "concerns" concept from #85345 and
replaces it with a "problems" field.[^1][^2] We then mark slow statements
retried at least 10 times (the default threshold) as having a
"HighRetryCount" problem.[^3] Slow statements without any specifically
identified problems will have "Unknown" in their list of problems. We will
add support for further problem detection in future commits.

[^1]: Every execution insight we offer happens because a statement was
  slow, so the "SlowExecution" concern was the only one we'd ever have.

[^2]: The protocol buffer changes are safe because we haven't deployed
  this code anywhere yet.

[^3]: This threshold may be configured via the
  `sql.insights.high_retry_count.threshold` cluster setting.
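
Since `problems.go` itself is not among the files rendered below (only its
`BUILD.bazel` registration appears), here is a minimal sketch of the shape of
the check, within the `insights` package. The function name `problemsFor`, the
`Retries` field on `Statement`, and the enum value names (modeled on the
existing `Concern_SlowExecution` naming) are assumptions, not the actual
implementation:

```go
package insights

import "github.com/cockroachdb/cockroach/pkg/settings/cluster"

// problemsFor is a sketch only; the real problems.go may differ.
func problemsFor(stmt *Statement, st *cluster.Settings) []Problem {
	var problems []Problem
	if stmt.Retries >= HighRetryCountThreshold.Get(&st.SV) {
		problems = append(problems, Problem_HighRetryCount)
	}
	if len(problems) == 0 {
		// Every insight exists because the statement was slow, so when no
		// specific problem is identified, we report "Unknown".
		problems = append(problems, Problem_Unknown)
	}
	return problems
}
```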

Release justification: Category 2: Bug fixes and low-risk updates to new
functionality

Release note (ops change): The new
`sql.insights.high_retry_count.threshold` cluster setting may be used to
configure how many times a slow statement (as identified by the
execution insights system) must have been retried to be marked as having
a high retry count.

Co-authored-by: Matthew Todd <todd@cockroachlabs.com>
craig[bot] and matthewtodd committed Aug 19, 2022
2 parents b6f99d5 + 493a43b commit 8eabfb7
Showing 13 changed files with 200 additions and 87 deletions.
1 change: 1 addition & 0 deletions docs/generated/settings/settings-for-tenants.txt
@@ -233,6 +233,7 @@ sql.distsql.temp_storage.workmem byte size 64 MiB maximum amount of memory in by
sql.guardrails.max_row_size_err byte size 512 MiB maximum size of row (or column family if multiple column families are in use) that SQL can write to the database, above which an error is returned; use 0 to disable
sql.guardrails.max_row_size_log byte size 64 MiB maximum size of row (or column family if multiple column families are in use) that SQL can write to the database, above which an event is logged to SQL_PERF (or SQL_INTERNAL_PERF if the mutating statement was internal); use 0 to disable
sql.insights.execution_insights_capacity integer 1000 the size of the per-node store of execution insights
sql.insights.high_retry_count.threshold integer 10 the number of retries a slow statement must have undergone for its high retry count to be highlighted as a potential problem
sql.insights.latency_threshold duration 100ms amount of time after which an executing statement is considered slow. Use 0 to disable.
sql.log.slow_query.experimental_full_table_scans.enabled boolean false when set to true, statements that perform a full table/index scan will be logged to the slow query log even if they do not meet the latency threshold. Must have the slow query log enabled for this setting to have any effect.
sql.log.slow_query.internal_queries.enabled boolean false when set to true, internal queries which exceed the slow query log threshold are logged to a separate log. Must have the slow query log enabled for this setting to have any effect.
1 change: 1 addition & 0 deletions docs/generated/settings/settings.html
@@ -164,6 +164,7 @@
<tr><td><code>sql.guardrails.max_row_size_log</code></td><td>byte size</td><td><code>64 MiB</code></td><td>maximum size of row (or column family if multiple column families are in use) that SQL can write to the database, above which an event is logged to SQL_PERF (or SQL_INTERNAL_PERF if the mutating statement was internal); use 0 to disable</td></tr>
<tr><td><code>sql.hash_sharded_range_pre_split.max</code></td><td>integer</td><td><code>16</code></td><td>max pre-split ranges to have when adding hash sharded index to an existing table</td></tr>
<tr><td><code>sql.insights.execution_insights_capacity</code></td><td>integer</td><td><code>1000</code></td><td>the size of the per-node store of execution insights</td></tr>
<tr><td><code>sql.insights.high_retry_count.threshold</code></td><td>integer</td><td><code>10</code></td><td>the number of retries a slow statement must have undergone for its high retry count to be highlighted as a potential problem</td></tr>
<tr><td><code>sql.insights.latency_threshold</code></td><td>duration</td><td><code>100ms</code></td><td>amount of time after which an executing statement is considered slow. Use 0 to disable.</td></tr>
<tr><td><code>sql.log.slow_query.experimental_full_table_scans.enabled</code></td><td>boolean</td><td><code>false</code></td><td>when set to true, statements that perform a full table/index scan will be logged to the slow query log even if they do not meet the latency threshold. Must have the slow query log enabled for this setting to have any effect.</td></tr>
<tr><td><code>sql.log.slow_query.internal_queries.enabled</code></td><td>boolean</td><td><code>false</code></td><td>when set to true, internal queries which exceed the slow query log threshold are logged to a separate log. Must have the slow query log enabled for this setting to have any effect.</td></tr>
9 changes: 9 additions & 0 deletions pkg/sql/crdb_internal.go
@@ -6332,6 +6332,7 @@ CREATE TABLE crdb_internal.%s (
txn_fingerprint_id BYTES NOT NULL,
stmt_id STRING NOT NULL,
stmt_fingerprint_id BYTES NOT NULL,
problems STRING[] NOT NULL,
query STRING NOT NULL,
status STRING NOT NULL,
start_time TIMESTAMP NOT NULL,
@@ -6390,6 +6391,13 @@ func populateExecutionInsights(
return
}
for _, insight := range response.Insights {
problems := tree.NewDArray(types.String)
for _, problem := range insight.Problems {
if errProblem := problems.Append(tree.NewDString(problem.String())); errProblem != nil {
err = errors.CombineErrors(err, errProblem)
}
}

startTimestamp, errTimestamp := tree.MakeDTimestamp(insight.Statement.StartTime, time.Nanosecond)
if errTimestamp != nil {
err = errors.CombineErrors(err, errTimestamp)
@@ -6436,6 +6444,7 @@ func populateExecutionInsights(
tree.NewDBytes(tree.DBytes(sqlstatsutil.EncodeUint64ToBytes(uint64(insight.Transaction.FingerprintID)))),
tree.NewDString(hex.EncodeToString(insight.Statement.ID.GetBytes())),
tree.NewDBytes(tree.DBytes(sqlstatsutil.EncodeUint64ToBytes(uint64(insight.Statement.FingerprintID)))),
problems,
tree.NewDString(insight.Statement.Query),
tree.NewDString(insight.Statement.Status),
startTimestamp,
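
With the `problems` column now populated above, flagged statements can be read
straight out of the virtual table. A sketch using Go's `database/sql`,
assuming a local cluster, the `lib/pq` driver, and that the `Problem` enum
renders as the string `HighRetryCount`:

```go
package main

import (
	"database/sql"
	"fmt"
	"log"

	_ "github.com/lib/pq" // CockroachDB speaks the PostgreSQL wire protocol.
)

func main() {
	db, err := sql.Open("postgres",
		"postgresql://root@localhost:26257/?sslmode=disable")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	// Find slow statements flagged for high retry counts.
	rows, err := db.Query(`
		SELECT query, problems
		FROM crdb_internal.cluster_execution_insights
		WHERE 'HighRetryCount' = ANY (problems)`)
	if err != nil {
		log.Fatal(err)
	}
	defer rows.Close()

	for rows.Next() {
		// The array scans in its text form, e.g. {HighRetryCount}.
		var query, problems string
		if err := rows.Scan(&query, &problems); err != nil {
			log.Fatal(err)
		}
		fmt.Println(problems, query)
	}
}
```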
4 changes: 4 additions & 0 deletions pkg/sql/logictest/testdata/logic_test/create_statements
@@ -249,6 +249,7 @@ CREATE TABLE crdb_internal.cluster_execution_insights (
txn_fingerprint_id BYTES NOT NULL,
stmt_id STRING NOT NULL,
stmt_fingerprint_id BYTES NOT NULL,
problems STRING[] NOT NULL,
query STRING NOT NULL,
status STRING NOT NULL,
start_time TIMESTAMP NOT NULL,
@@ -272,6 +273,7 @@ CREATE TABLE crdb_internal.cluster_execution_insights (
txn_fingerprint_id BYTES NOT NULL,
stmt_id STRING NOT NULL,
stmt_fingerprint_id BYTES NOT NULL,
problems STRING[] NOT NULL,
query STRING NOT NULL,
status STRING NOT NULL,
start_time TIMESTAMP NOT NULL,
@@ -963,6 +965,7 @@ CREATE TABLE crdb_internal.node_execution_insights (
txn_fingerprint_id BYTES NOT NULL,
stmt_id STRING NOT NULL,
stmt_fingerprint_id BYTES NOT NULL,
problems STRING[] NOT NULL,
query STRING NOT NULL,
status STRING NOT NULL,
start_time TIMESTAMP NOT NULL,
@@ -986,6 +989,7 @@ CREATE TABLE crdb_internal.node_execution_insights (
txn_fingerprint_id BYTES NOT NULL,
stmt_id STRING NOT NULL,
stmt_fingerprint_id BYTES NOT NULL,
problems STRING[] NOT NULL,
query STRING NOT NULL,
status STRING NOT NULL,
start_time TIMESTAMP NOT NULL,
2 changes: 2 additions & 0 deletions pkg/sql/sqlstats/insights/BUILD.bazel
@@ -9,6 +9,7 @@ go_library(
"detector.go",
"ingester.go",
"insights.go",
"problems.go",
"registry.go",
],
embed = [":insights_go_proto"],
@@ -35,6 +36,7 @@ go_test(
"detector_test.go",
"ingester_test.go",
"insights_test.go",
"problems_test.go",
"registry_test.go",
],
embed = [":insights"],
40 changes: 14 additions & 26 deletions pkg/sql/sqlstats/insights/detector.go
@@ -20,7 +20,7 @@ import (

type detector interface {
enabled() bool
examine(*Statement) []Concern
isSlow(*Statement) bool
}

var _ detector = &compositeDetector{}
@@ -40,20 +40,14 @@ func (a compositeDetector) enabled() bool {
return false
}

func (a compositeDetector) examine(statement *Statement) (concerns []Concern) {
set := make(map[Concern]struct{})

func (a compositeDetector) isSlow(statement *Statement) bool {
// Because some detectors may need to observe all statements to build up
// their baseline sense of what "normal" is, we avoid short-circuiting.
result := false
for _, d := range a.detectors {
for _, concern := range d.examine(statement) {
set[concern] = struct{}{}
}
result = d.isSlow(statement) || result
}

for concern := range set {
concerns = append(concerns, concern)
}

return concerns
return result
}

var desiredQuantiles = map[float64]float64{0.5: 0.05, 0.99: 0.001}
@@ -74,23 +68,21 @@ func (d anomalyDetector) enabled() bool {
return AnomalyDetectionEnabled.Get(&d.settings.SV)
}

func (d *anomalyDetector) examine(stmt *Statement) (concerns []Concern) {
func (d *anomalyDetector) isSlow(stmt *Statement) (decision bool) {
if !d.enabled() {
return concerns
return
}

d.withFingerprintLatencySummary(stmt, func(latencySummary *quantile.Stream) {
latencySummary.Insert(stmt.LatencyInSeconds)
p50 := latencySummary.Query(0.5)
p99 := latencySummary.Query(0.99)
if stmt.LatencyInSeconds >= p99 &&
decision = stmt.LatencyInSeconds >= p99 &&
stmt.LatencyInSeconds >= 2*p50 &&
stmt.LatencyInSeconds >= AnomalyDetectionLatencyThreshold.Get(&d.settings.SV).Seconds() {
concerns = append(concerns, Concern_SlowExecution)
}
stmt.LatencyInSeconds >= AnomalyDetectionLatencyThreshold.Get(&d.settings.SV).Seconds()
})

return concerns
return
}

func (d *anomalyDetector) withFingerprintLatencySummary(
@@ -146,10 +138,6 @@ func (l latencyThresholdDetector) enabled() bool {
return LatencyThreshold.Get(&l.st.SV) > 0
}

func (l latencyThresholdDetector) examine(s *Statement) (concerns []Concern) {
if l.enabled() && s.LatencyInSeconds >= LatencyThreshold.Get(&l.st.SV).Seconds() {
concerns = append(concerns, Concern_SlowExecution)
}

return concerns
func (l latencyThresholdDetector) isSlow(s *Statement) bool {
return l.enabled() && s.LatencyInSeconds >= LatencyThreshold.Get(&l.st.SV).Seconds()
}
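
One subtlety in `compositeDetector.isSlow` above: the accumulation is written
`d.isSlow(statement) || result` rather than `result || d.isSlow(statement)`,
because Go's `||` skips its right operand once the left is true. Keeping the
call on the left guarantees every detector observes every statement. A
runnable toy illustrating the difference:

```go
package main

import "fmt"

// noisy stands in for a detector that must see every statement.
func noisy(name string, v bool) bool {
	fmt.Println("consulted:", name)
	return v
}

func main() {
	result := true

	// Short-circuited: once result is true, the right operand never runs.
	result = result || noisy("skipped detector", true) // prints nothing

	// The pattern used in compositeDetector: the call always runs.
	result = noisy("always-consulted detector", true) || result

	fmt.Println(result)
}
```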
80 changes: 39 additions & 41 deletions pkg/sql/sqlstats/insights/detector_test.go
@@ -37,38 +37,32 @@ func TestAnyDetector(t *testing.T) {
require.True(t, detector.enabled())
})

t.Run("examine is nil without any detectors", func(t *testing.T) {
t.Run("isSlow is false without any detectors", func(t *testing.T) {
detector := &compositeDetector{}
require.Empty(t, detector.examine(&Statement{}))
require.False(t, detector.isSlow(&Statement{}))
})

t.Run("examine is nil without any concerned detectors", func(t *testing.T) {
t.Run("isSlow is false without any concerned detectors", func(t *testing.T) {
detector := &compositeDetector{[]detector{&fakeDetector{}, &fakeDetector{}}}
require.Empty(t, detector.examine(&Statement{}))
require.False(t, detector.isSlow(&Statement{}))
})

t.Run("examine combines detector concerns (first)", func(t *testing.T) {
detector := &compositeDetector{[]detector{
&fakeDetector{stubExamine: []Concern{Concern_SlowExecution}},
&fakeDetector{},
}}
require.Equal(t, []Concern{Concern_SlowExecution}, detector.examine(&Statement{}))
t.Run("isSlow is true with at least one concerned detector", func(t *testing.T) {
detector := &compositeDetector{[]detector{&fakeDetector{stubIsSlow: true}, &fakeDetector{}}}
require.True(t, detector.isSlow(&Statement{}))
})

t.Run("examine combines detector concerns (second)", func(t *testing.T) {
detector := &compositeDetector{[]detector{
&fakeDetector{},
&fakeDetector{stubExamine: []Concern{Concern_SlowExecution}},
}}
require.Equal(t, []Concern{Concern_SlowExecution}, detector.examine(&Statement{}))
})
t.Run("isSlow consults all detectors without short-circuiting", func(t *testing.T) {
// Detector implementations may wish to observe all statements, to
// build up their baseline sense of what "usual" is. To short-circuit
// would deny them that chance.
d1 := &fakeDetector{stubIsSlow: true}
d2 := &fakeDetector{stubIsSlow: true}

t.Run("examine uniqs detector concerns", func(t *testing.T) {
detector := &compositeDetector{[]detector{
&fakeDetector{stubExamine: []Concern{Concern_SlowExecution}},
&fakeDetector{stubExamine: []Concern{Concern_SlowExecution}},
}}
require.Equal(t, []Concern{Concern_SlowExecution}, detector.examine(&Statement{}))
detector := &compositeDetector{[]detector{d1, d2}}
detector.isSlow(&Statement{})
require.True(t, d1.isSlowCalled, "the first detector should be consulted")
require.True(t, d2.isSlowCalled, "the second detector should be consulted")
})
}

@@ -85,7 +79,7 @@ func TestLatencyQuantileDetector(t *testing.T) {
require.True(t, d.enabled())
})

t.Run("examine", func(t *testing.T) {
t.Run("isSlow", func(t *testing.T) {
ctx := context.Background()
st := cluster.MakeTestingClusterSettings()
AnomalyDetectionEnabled.Override(ctx, &st.SV, true)
@@ -95,28 +89,30 @@
name string
seedLatency time.Duration
candidateLatency time.Duration
concerns []Concern
isSlow bool
}{{
name: "false with normal latency",
seedLatency: 100 * time.Millisecond,
candidateLatency: 100 * time.Millisecond,
isSlow: false,
}, {
name: "true with higher latency",
seedLatency: 100 * time.Millisecond,
candidateLatency: 200 * time.Millisecond,
concerns: []Concern{Concern_SlowExecution},
isSlow: true,
}, {
name: "false with higher latency under interesting threshold",
seedLatency: 10 * time.Millisecond,
candidateLatency: 20 * time.Millisecond,
isSlow: false,
}}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
d := newAnomalyDetector(st, NewMetrics())
for i := 0; i < 1000; i++ {
d.examine(&Statement{LatencyInSeconds: test.seedLatency.Seconds()})
d.isSlow(&Statement{LatencyInSeconds: test.seedLatency.Seconds()})
}
require.Equal(t, test.concerns, d.examine(&Statement{LatencyInSeconds: test.candidateLatency.Seconds()}))
require.Equal(t, test.isSlow, d.isSlow(&Statement{LatencyInSeconds: test.candidateLatency.Seconds()}))
})
}
})
@@ -173,7 +169,7 @@ func TestLatencyQuantileDetector(t *testing.T) {
d := newAnomalyDetector(st, metrics)
// Show the detector `test.fingerprints` distinct fingerprints.
for i := 0; i < test.fingerprints; i++ {
d.examine(&Statement{
d.isSlow(&Statement{
LatencyInSeconds: AnomalyDetectionLatencyThreshold.Get(&st.SV).Seconds(),
FingerprintID: roachpb.StmtFingerprintID(i),
})
@@ -192,7 +188,7 @@ func BenchmarkLatencyQuantileDetector(b *testing.B) {
AnomalyDetectionEnabled.Override(context.Background(), &settings.SV, true)
d := newAnomalyDetector(settings, NewMetrics())
for i := 0; i < b.N; i++ {
d.examine(&Statement{
d.isSlow(&Statement{
LatencyInSeconds: random.Float64(),
})
}
@@ -213,39 +209,41 @@ func TestLatencyThresholdDetector(t *testing.T) {
require.True(t, detector.enabled())
})

t.Run("examine nil when disabled", func(t *testing.T) {
t.Run("isSlow false when disabled", func(t *testing.T) {
st := cluster.MakeTestingClusterSettings()
LatencyThreshold.Override(context.Background(), &st.SV, 0)
detector := latencyThresholdDetector{st: st}
require.Empty(t, detector.examine(&Statement{LatencyInSeconds: 1}))
require.False(t, detector.isSlow(&Statement{LatencyInSeconds: 1}))
})

t.Run("examine nil when fast enough", func(t *testing.T) {
t.Run("isSlow false when fast enough", func(t *testing.T) {
st := cluster.MakeTestingClusterSettings()
LatencyThreshold.Override(context.Background(), &st.SV, 1*time.Second)
detector := latencyThresholdDetector{st: st}
require.Empty(t, detector.examine(&Statement{LatencyInSeconds: 0.5}))
require.False(t, detector.isSlow(&Statement{LatencyInSeconds: 0.5}))
})

t.Run("examine slow beyond threshold", func(t *testing.T) {
t.Run("isSlow true beyond threshold", func(t *testing.T) {
st := cluster.MakeTestingClusterSettings()
LatencyThreshold.Override(context.Background(), &st.SV, 1*time.Second)
detector := latencyThresholdDetector{st: st}
require.Equal(t, []Concern{Concern_SlowExecution}, detector.examine(&Statement{LatencyInSeconds: 1}))
require.True(t, detector.isSlow(&Statement{LatencyInSeconds: 1}))
})
}

type fakeDetector struct {
stubEnabled bool
stubExamine []Concern
stubEnabled bool
stubIsSlow bool
isSlowCalled bool
}

func (f fakeDetector) enabled() bool {
func (f *fakeDetector) enabled() bool {
return f.stubEnabled
}

func (f fakeDetector) examine(_ *Statement) []Concern {
return f.stubExamine
func (f *fakeDetector) isSlow(*Statement) bool {
f.isSlowCalled = true
return f.stubIsSlow
}

var _ detector = &fakeDetector{}
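
Note the switch from value to pointer receivers on `fakeDetector`: `isSlow`
now records `isSlowCalled`, and with a value receiver that write would land on
a copy and be invisible to the test's later assertions.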
10 changes: 10 additions & 0 deletions pkg/sql/sqlstats/insights/insights.go
@@ -78,6 +78,16 @@ var AnomalyDetectionMemoryLimit = settings.RegisterByteSizeSetting(
1024*1024,
)

// HighRetryCountThreshold sets the number of times a slow statement must have
// been retried to be marked as having a high retry count.
var HighRetryCountThreshold = settings.RegisterIntSetting(
settings.TenantWritable,
"sql.insights.high_retry_count.threshold",
"the number of retries a slow statement must have undergone for its high retry count to be highlighted as a potential problem",
10,
settings.NonNegativeInt,
).WithPublic()

// Metrics holds running measurements of various outliers-related runtime stats.
type Metrics struct {
// Fingerprints measures the number of statement fingerprints being monitored for
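
The new setting follows the same `Override`/`Get` pattern as
`LatencyThreshold` in the detector tests above. A minimal sketch of exercising
it from a test, assuming the exported names registered above (the test name is
illustrative):

```go
package insights_test

import (
	"context"
	"testing"

	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
	"github.com/cockroachdb/cockroach/pkg/sql/sqlstats/insights"
	"github.com/stretchr/testify/require"
)

func TestHighRetryCountThresholdOverride(t *testing.T) {
	ctx := context.Background()
	st := cluster.MakeTestingClusterSettings()

	// The registered default is 10.
	require.Equal(t, int64(10), insights.HighRetryCountThreshold.Get(&st.SV))

	// Operators may raise or lower the threshold at runtime.
	insights.HighRetryCountThreshold.Override(ctx, &st.SV, 20)
	require.Equal(t, int64(20), insights.HighRetryCountThreshold.Get(&st.SV))
}
```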