insights: identify statements with high retry counts #86415

Merged (1 commit, Aug 19, 2022)
1 change: 1 addition & 0 deletions docs/generated/settings/settings-for-tenants.txt
@@ -233,6 +233,7 @@ sql.distsql.temp_storage.workmem byte size 64 MiB maximum amount of memory in by
sql.guardrails.max_row_size_err byte size 512 MiB maximum size of row (or column family if multiple column families are in use) that SQL can write to the database, above which an error is returned; use 0 to disable
sql.guardrails.max_row_size_log byte size 64 MiB maximum size of row (or column family if multiple column families are in use) that SQL can write to the database, above which an event is logged to SQL_PERF (or SQL_INTERNAL_PERF if the mutating statement was internal); use 0 to disable
sql.insights.execution_insights_capacity integer 1000 the size of the per-node store of execution insights
sql.insights.high_retry_count.threshold integer 10 the number of retries a slow statement must have undergone for its high retry count to be highlighted as a potential problem
sql.insights.latency_threshold duration 100ms amount of time after which an executing statement is considered slow. Use 0 to disable.
sql.log.slow_query.experimental_full_table_scans.enabled boolean false when set to true, statements that perform a full table/index scan will be logged to the slow query log even if they do not meet the latency threshold. Must have the slow query log enabled for this setting to have any effect.
sql.log.slow_query.internal_queries.enabled boolean false when set to true, internal queries which exceed the slow query log threshold are logged to a separate log. Must have the slow query log enabled for this setting to have any effect.
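The two insights settings above work together: a statement is first flagged as slow (via sql.insights.latency_threshold or the anomaly detector), and only then is its retry count compared against sql.insights.high_retry_count.threshold. As a rough sketch of how the same knobs might be tuned programmatically in a test, assuming the exported setting variables and package path that appear later in this diff (the test itself is not part of this PR):

// Sketch only: overriding both insights thresholds on a testing Settings
// object, following the Override pattern used in the tests in this PR.
package insights_test

import (
	"context"
	"testing"
	"time"

	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
	"github.com/cockroachdb/cockroach/pkg/sql/sqlstats/insights"
)

func TestThresholdOverrides(t *testing.T) {
	ctx := context.Background()
	st := cluster.MakeTestingClusterSettings()

	// Treat anything slower than 50ms as slow...
	insights.LatencyThreshold.Override(ctx, &st.SV, 50*time.Millisecond)
	// ...and highlight its retry count once it reaches 5 retries.
	insights.HighRetryCountThreshold.Override(ctx, &st.SV, 5)

	if got := insights.HighRetryCountThreshold.Get(&st.SV); got != 5 {
		t.Fatalf("expected threshold override of 5, got %d", got)
	}
}

In production the same settings are changed with SET CLUSTER SETTING, using the names listed above.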
1 change: 1 addition & 0 deletions docs/generated/settings/settings.html
@@ -164,6 +164,7 @@
<tr><td><code>sql.guardrails.max_row_size_log</code></td><td>byte size</td><td><code>64 MiB</code></td><td>maximum size of row (or column family if multiple column families are in use) that SQL can write to the database, above which an event is logged to SQL_PERF (or SQL_INTERNAL_PERF if the mutating statement was internal); use 0 to disable</td></tr>
<tr><td><code>sql.hash_sharded_range_pre_split.max</code></td><td>integer</td><td><code>16</code></td><td>max pre-split ranges to have when adding hash sharded index to an existing table</td></tr>
<tr><td><code>sql.insights.execution_insights_capacity</code></td><td>integer</td><td><code>1000</code></td><td>the size of the per-node store of execution insights</td></tr>
<tr><td><code>sql.insights.high_retry_count.threshold</code></td><td>integer</td><td><code>10</code></td><td>the number of retries a slow statement must have undergone for its high retry count to be highlighted as a potential problem</td></tr>
<tr><td><code>sql.insights.latency_threshold</code></td><td>duration</td><td><code>100ms</code></td><td>amount of time after which an executing statement is considered slow. Use 0 to disable.</td></tr>
<tr><td><code>sql.log.slow_query.experimental_full_table_scans.enabled</code></td><td>boolean</td><td><code>false</code></td><td>when set to true, statements that perform a full table/index scan will be logged to the slow query log even if they do not meet the latency threshold. Must have the slow query log enabled for this setting to have any effect.</td></tr>
<tr><td><code>sql.log.slow_query.internal_queries.enabled</code></td><td>boolean</td><td><code>false</code></td><td>when set to true, internal queries which exceed the slow query log threshold are logged to a separate log. Must have the slow query log enabled for this setting to have any effect.</td></tr>
9 changes: 9 additions & 0 deletions pkg/sql/crdb_internal.go
@@ -6338,6 +6338,7 @@ CREATE TABLE crdb_internal.%s (
txn_fingerprint_id BYTES NOT NULL,
stmt_id STRING NOT NULL,
stmt_fingerprint_id BYTES NOT NULL,
problems STRING[] NOT NULL,
query STRING NOT NULL,
status STRING NOT NULL,
start_time TIMESTAMP NOT NULL,
@@ -6396,6 +6397,13 @@ func populateExecutionInsights(
return
}
for _, insight := range response.Insights {
problems := tree.NewDArray(types.String)
for _, problem := range insight.Problems {
if errProblem := problems.Append(tree.NewDString(problem.String())); errProblem != nil {
err = errors.CombineErrors(err, errProblem)
}
}

startTimestamp, errTimestamp := tree.MakeDTimestamp(insight.Statement.StartTime, time.Nanosecond)
if errTimestamp != nil {
err = errors.CombineErrors(err, errTimestamp)
@@ -6442,6 +6450,7 @@ func populateExecutionInsights(
tree.NewDBytes(tree.DBytes(sqlstatsutil.EncodeUint64ToBytes(uint64(insight.Transaction.FingerprintID)))),
tree.NewDString(hex.EncodeToString(insight.Statement.ID.GetBytes())),
tree.NewDBytes(tree.DBytes(sqlstatsutil.EncodeUint64ToBytes(uint64(insight.Statement.FingerprintID)))),
problems,
tree.NewDString(insight.Statement.Query),
tree.NewDString(insight.Statement.Status),
startTimestamp,
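For reference, the loop above that builds the new problems column follows a common datum-array pattern: append each problem's string form to a tree.DArray of strings and fold any append error into the function's running error (checking the append's own error, errProblem, as corrected above). A minimal, self-contained sketch of that pattern, with an illustrative helper name that is not part of this PR:

// Sketch of the STRING[] datum-building pattern used by
// populateExecutionInsights; buildProblemsArray is illustrative only.
package main

import (
	"fmt"

	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
	"github.com/cockroachdb/cockroach/pkg/sql/types"
	"github.com/cockroachdb/errors"
)

func buildProblemsArray(problemNames []string) (*tree.DArray, error) {
	var err error
	problems := tree.NewDArray(types.String)
	for _, name := range problemNames {
		// Check the append's own error, not the outer accumulator.
		if errProblem := problems.Append(tree.NewDString(name)); errProblem != nil {
			err = errors.CombineErrors(err, errProblem)
		}
	}
	return problems, err
}

func main() {
	arr, err := buildProblemsArray([]string{"SlowExecution"})
	fmt.Println(arr, err)
}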
4 changes: 4 additions & 0 deletions pkg/sql/logictest/testdata/logic_test/create_statements
@@ -249,6 +249,7 @@ CREATE TABLE crdb_internal.cluster_execution_insights (
txn_fingerprint_id BYTES NOT NULL,
stmt_id STRING NOT NULL,
stmt_fingerprint_id BYTES NOT NULL,
problems STRING[] NOT NULL,
query STRING NOT NULL,
status STRING NOT NULL,
start_time TIMESTAMP NOT NULL,
@@ -272,6 +273,7 @@ CREATE TABLE crdb_internal.cluster_execution_insights (
txn_fingerprint_id BYTES NOT NULL,
stmt_id STRING NOT NULL,
stmt_fingerprint_id BYTES NOT NULL,
problems STRING[] NOT NULL,
query STRING NOT NULL,
status STRING NOT NULL,
start_time TIMESTAMP NOT NULL,
@@ -963,6 +965,7 @@ CREATE TABLE crdb_internal.node_execution_insights (
txn_fingerprint_id BYTES NOT NULL,
stmt_id STRING NOT NULL,
stmt_fingerprint_id BYTES NOT NULL,
problems STRING[] NOT NULL,
query STRING NOT NULL,
status STRING NOT NULL,
start_time TIMESTAMP NOT NULL,
@@ -986,6 +989,7 @@ CREATE TABLE crdb_internal.node_execution_insights (
txn_fingerprint_id BYTES NOT NULL,
stmt_id STRING NOT NULL,
stmt_fingerprint_id BYTES NOT NULL,
problems STRING[] NOT NULL,
query STRING NOT NULL,
status STRING NOT NULL,
start_time TIMESTAMP NOT NULL,
2 changes: 2 additions & 0 deletions pkg/sql/sqlstats/insights/BUILD.bazel
@@ -9,6 +9,7 @@ go_library(
"detector.go",
"ingester.go",
"insights.go",
"problems.go",
"registry.go",
],
embed = [":insights_go_proto"],
@@ -35,6 +36,7 @@ go_test(
"detector_test.go",
"ingester_test.go",
"insights_test.go",
"problems_test.go",
"registry_test.go",
],
embed = [":insights"],
40 changes: 14 additions & 26 deletions pkg/sql/sqlstats/insights/detector.go
@@ -20,7 +20,7 @@ import (

type detector interface {
enabled() bool
examine(*Statement) []Concern
isSlow(*Statement) bool
}

var _ detector = &compositeDetector{}
@@ -40,20 +40,14 @@ func (a compositeDetector) enabled() bool {
return false
}

func (a compositeDetector) examine(statement *Statement) (concerns []Concern) {
set := make(map[Concern]struct{})

func (a compositeDetector) isSlow(statement *Statement) bool {
// Because some detectors may need to observe all statements to build up
// their baseline sense of what "normal" is, we avoid short-circuiting.
result := false
for _, d := range a.detectors {
for _, concern := range d.examine(statement) {
set[concern] = struct{}{}
}
result = d.isSlow(statement) || result
}

for concern := range set {
concerns = append(concerns, concern)
}

return concerns
return result
}

var desiredQuantiles = map[float64]float64{0.5: 0.05, 0.99: 0.001}
@@ -74,23 +68,21 @@ func (d anomalyDetector) enabled() bool {
return AnomalyDetectionEnabled.Get(&d.settings.SV)
}

func (d *anomalyDetector) examine(stmt *Statement) (concerns []Concern) {
func (d *anomalyDetector) isSlow(stmt *Statement) (decision bool) {
if !d.enabled() {
return concerns
return
}

d.withFingerprintLatencySummary(stmt, func(latencySummary *quantile.Stream) {
latencySummary.Insert(stmt.LatencyInSeconds)
p50 := latencySummary.Query(0.5)
p99 := latencySummary.Query(0.99)
if stmt.LatencyInSeconds >= p99 &&
decision = stmt.LatencyInSeconds >= p99 &&
stmt.LatencyInSeconds >= 2*p50 &&
stmt.LatencyInSeconds >= AnomalyDetectionLatencyThreshold.Get(&d.settings.SV).Seconds() {
concerns = append(concerns, Concern_SlowExecution)
}
stmt.LatencyInSeconds >= AnomalyDetectionLatencyThreshold.Get(&d.settings.SV).Seconds()
})

return concerns
return
}

func (d *anomalyDetector) withFingerprintLatencySummary(
@@ -146,10 +138,6 @@ func (l latencyThresholdDetector) enabled() bool {
return LatencyThreshold.Get(&l.st.SV) > 0
}

func (l latencyThresholdDetector) examine(s *Statement) (concerns []Concern) {
if l.enabled() && s.LatencyInSeconds >= LatencyThreshold.Get(&l.st.SV).Seconds() {
concerns = append(concerns, Concern_SlowExecution)
}

return concerns
func (l latencyThresholdDetector) isSlow(s *Statement) bool {
return l.enabled() && s.LatencyInSeconds >= LatencyThreshold.Get(&l.st.SV).Seconds()
}
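The interface change above narrows each detector to a single yes/no question: detectors now report only whether a statement is slow, while problem classification (including the new high-retry-count check) moves into the new problems.go, which is not shown in this section. For illustration, a hypothetical additional detector written against the new contract would look like the sketch below if placed inside the insights package; the type and its field are invented for this example:

// Hypothetical detector conforming to the narrowed interface: it only
// reports whether a statement is slow, and never enumerates problems.
type fixedThresholdDetector struct {
	thresholdSeconds float64
}

func (f fixedThresholdDetector) enabled() bool {
	return f.thresholdSeconds > 0
}

func (f fixedThresholdDetector) isSlow(s *Statement) bool {
	return f.enabled() && s.LatencyInSeconds >= f.thresholdSeconds
}

var _ detector = fixedThresholdDetector{}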
80 changes: 39 additions & 41 deletions pkg/sql/sqlstats/insights/detector_test.go
@@ -37,38 +37,32 @@ func TestAnyDetector(t *testing.T) {
require.True(t, detector.enabled())
})

t.Run("examine is nil without any detectors", func(t *testing.T) {
t.Run("isSlow is false without any detectors", func(t *testing.T) {
detector := &compositeDetector{}
require.Empty(t, detector.examine(&Statement{}))
require.False(t, detector.isSlow(&Statement{}))
})

t.Run("examine is nil without any concerned detectors", func(t *testing.T) {
t.Run("isSlow is false without any concerned detectors", func(t *testing.T) {
detector := &compositeDetector{[]detector{&fakeDetector{}, &fakeDetector{}}}
require.Empty(t, detector.examine(&Statement{}))
require.False(t, detector.isSlow(&Statement{}))
})

t.Run("examine combines detector concerns (first)", func(t *testing.T) {
detector := &compositeDetector{[]detector{
&fakeDetector{stubExamine: []Concern{Concern_SlowExecution}},
&fakeDetector{},
}}
require.Equal(t, []Concern{Concern_SlowExecution}, detector.examine(&Statement{}))
t.Run("isSlow is true with at least one concerned detector", func(t *testing.T) {
detector := &compositeDetector{[]detector{&fakeDetector{stubIsSlow: true}, &fakeDetector{}}}
require.True(t, detector.isSlow(&Statement{}))
})

t.Run("examine combines detector concerns (second)", func(t *testing.T) {
detector := &compositeDetector{[]detector{
&fakeDetector{},
&fakeDetector{stubExamine: []Concern{Concern_SlowExecution}},
}}
require.Equal(t, []Concern{Concern_SlowExecution}, detector.examine(&Statement{}))
})
t.Run("isSlow consults all detectors without short-circuiting", func(t *testing.T) {
// Detector implementations may wish to observe all statements, to
// build up their baseline sense of what "usual" is. To short-circuit
// would deny them that chance.
d1 := &fakeDetector{stubIsSlow: true}
d2 := &fakeDetector{stubIsSlow: true}

t.Run("examine uniqs detector concerns", func(t *testing.T) {
detector := &compositeDetector{[]detector{
&fakeDetector{stubExamine: []Concern{Concern_SlowExecution}},
&fakeDetector{stubExamine: []Concern{Concern_SlowExecution}},
}}
require.Equal(t, []Concern{Concern_SlowExecution}, detector.examine(&Statement{}))
detector := &compositeDetector{[]detector{d1, d2}}
detector.isSlow(&Statement{})
require.True(t, d1.isSlowCalled, "the first detector should be consulted")
require.True(t, d2.isSlowCalled, "the second detector should be consulted")
})
}

@@ -85,7 +79,7 @@ func TestLatencyQuantileDetector(t *testing.T) {
require.True(t, d.enabled())
})

t.Run("examine", func(t *testing.T) {
t.Run("isSlow", func(t *testing.T) {
ctx := context.Background()
st := cluster.MakeTestingClusterSettings()
AnomalyDetectionEnabled.Override(ctx, &st.SV, true)
@@ -95,28 +89,30 @@ func TestLatencyQuantileDetector(t *testing.T) {
name string
seedLatency time.Duration
candidateLatency time.Duration
concerns []Concern
isSlow bool
}{{
name: "false with normal latency",
seedLatency: 100 * time.Millisecond,
candidateLatency: 100 * time.Millisecond,
isSlow: false,
}, {
name: "true with higher latency",
seedLatency: 100 * time.Millisecond,
candidateLatency: 200 * time.Millisecond,
concerns: []Concern{Concern_SlowExecution},
isSlow: true,
}, {
name: "false with higher latency under interesting threshold",
seedLatency: 10 * time.Millisecond,
candidateLatency: 20 * time.Millisecond,
isSlow: false,
}}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
d := newAnomalyDetector(st, NewMetrics())
for i := 0; i < 1000; i++ {
d.examine(&Statement{LatencyInSeconds: test.seedLatency.Seconds()})
d.isSlow(&Statement{LatencyInSeconds: test.seedLatency.Seconds()})
}
require.Equal(t, test.concerns, d.examine(&Statement{LatencyInSeconds: test.candidateLatency.Seconds()}))
require.Equal(t, test.isSlow, d.isSlow(&Statement{LatencyInSeconds: test.candidateLatency.Seconds()}))
})
}
})
@@ -173,7 +169,7 @@ func TestLatencyQuantileDetector(t *testing.T) {
d := newAnomalyDetector(st, metrics)
// Show the detector `test.fingerprints` distinct fingerprints.
for i := 0; i < test.fingerprints; i++ {
d.examine(&Statement{
d.isSlow(&Statement{
LatencyInSeconds: AnomalyDetectionLatencyThreshold.Get(&st.SV).Seconds(),
FingerprintID: roachpb.StmtFingerprintID(i),
})
@@ -192,7 +188,7 @@ func BenchmarkLatencyQuantileDetector(b *testing.B) {
AnomalyDetectionEnabled.Override(context.Background(), &settings.SV, true)
d := newAnomalyDetector(settings, NewMetrics())
for i := 0; i < b.N; i++ {
d.examine(&Statement{
d.isSlow(&Statement{
LatencyInSeconds: random.Float64(),
})
}
@@ -213,39 +209,41 @@ func TestLatencyThresholdDetector(t *testing.T) {
require.True(t, detector.enabled())
})

t.Run("examine nil when disabled", func(t *testing.T) {
t.Run("isSlow false when disabled", func(t *testing.T) {
st := cluster.MakeTestingClusterSettings()
LatencyThreshold.Override(context.Background(), &st.SV, 0)
detector := latencyThresholdDetector{st: st}
require.Empty(t, detector.examine(&Statement{LatencyInSeconds: 1}))
require.False(t, detector.isSlow(&Statement{LatencyInSeconds: 1}))
})

t.Run("examine nil when fast enough", func(t *testing.T) {
t.Run("isSlow false when fast enough", func(t *testing.T) {
st := cluster.MakeTestingClusterSettings()
LatencyThreshold.Override(context.Background(), &st.SV, 1*time.Second)
detector := latencyThresholdDetector{st: st}
require.Empty(t, detector.examine(&Statement{LatencyInSeconds: 0.5}))
require.False(t, detector.isSlow(&Statement{LatencyInSeconds: 0.5}))
})

t.Run("examine slow beyond threshold", func(t *testing.T) {
t.Run("isSlow true beyond threshold", func(t *testing.T) {
st := cluster.MakeTestingClusterSettings()
LatencyThreshold.Override(context.Background(), &st.SV, 1*time.Second)
detector := latencyThresholdDetector{st: st}
require.Equal(t, []Concern{Concern_SlowExecution}, detector.examine(&Statement{LatencyInSeconds: 1}))
require.True(t, detector.isSlow(&Statement{LatencyInSeconds: 1}))
})
}

type fakeDetector struct {
stubEnabled bool
stubExamine []Concern
stubEnabled bool
stubIsSlow bool
isSlowCalled bool
}

func (f fakeDetector) enabled() bool {
func (f *fakeDetector) enabled() bool {
return f.stubEnabled
}

func (f fakeDetector) examine(_ *Statement) []Concern {
return f.stubExamine
func (f *fakeDetector) isSlow(*Statement) bool {
f.isSlowCalled = true
return f.stubIsSlow
}

var _ detector = &fakeDetector{}
10 changes: 10 additions & 0 deletions pkg/sql/sqlstats/insights/insights.go
@@ -78,6 +78,16 @@ var AnomalyDetectionMemoryLimit = settings.RegisterByteSizeSetting(
1024*1024,
)

// HighRetryCountThreshold sets the number of times a slow statement must have
// been retried to be marked as having a high retry count.
var HighRetryCountThreshold = settings.RegisterIntSetting(
settings.TenantWritable,
"sql.insights.high_retry_count.threshold",
"the number of retries a slow statement must have undergone for its high retry count to be highlighted as a potential problem",
10,
settings.NonNegativeInt,
).WithPublic()

// Metrics holds running measurements of various outliers-related runtime stats.
type Metrics struct {
// Fingerprints measures the number of statement fingerprints being monitored for
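The registry code that consumes HighRetryCountThreshold is not part of this section, but given the setting's description it presumably reduces to a comparison along these lines. This is a sketch only: the Retries field on Statement is assumed for illustration, and the real classification lives in problems.go and registry.go rather than in a free-standing helper:

// Sketch, inside the insights package: would a slow statement's retry count
// be highlighted as a problem? (Retries is an assumed field name.)
func hasHighRetryCount(stmt *Statement, st *cluster.Settings) bool {
	return int64(stmt.Retries) >= HighRetryCountThreshold.Get(&st.SV)
}

If the implementation follows the setting's wording, this check only runs for statements that a detector has already marked slow.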