From 4a1946f48df846467e35f4538775fa0f9cbec04e Mon Sep 17 00:00:00 2001 From: "Eduardo J. Ortega U" <5791035+ejortegau@users.noreply.github.com> Date: Tue, 25 Jul 2023 12:22:25 +0200 Subject: [PATCH 1/5] Add dry-run/monitoring-only mode for TxThrottler This change adds the ability to run the transaction throttler in dry-run/ monitoring-only mode. This means that it will only emit metrics related to whether throttling would or would not have taken place, but will not actually throttle. This can be useful when deploying, to get a sense of how often throttling can happen without actually having it happen. Signed-off-by: Eduardo J. Ortega U <5791035+ejortegau@users.noreply.github.com> --- go/flags/endtoend/vttablet.txt | 1 + .../vttablet/tabletserver/tabletenv/config.go | 3 ++ .../tabletserver/txthrottler/tx_throttler.go | 33 ++++++++---- .../txthrottler/tx_throttler_test.go | 51 +++++++++++++++++++ 4 files changed, 77 insertions(+), 11 deletions(-) diff --git a/go/flags/endtoend/vttablet.txt b/go/flags/endtoend/vttablet.txt index fc9a28bc541..276c8342bde 100644 --- a/go/flags/endtoend/vttablet.txt +++ b/go/flags/endtoend/vttablet.txt @@ -349,6 +349,7 @@ Usage of vttablet: --tx-throttler-default-priority int Default priority assigned to queries that lack priority information (default 100) --tx-throttler-healthcheck-cells strings Synonym to -tx_throttler_healthcheck_cells --tx-throttler-tablet-types strings A comma-separated list of tablet types. Only tablets of this type are monitored for replication lag by the transaction throttler. Supported types are replica and/or rdonly. (default replica) + --tx-throttler-dry-run If present, the TxThrottler only records metrics about requests received and throttled, but does not actually throttle any requests. --tx_throttler_config string The configuration of the transaction throttler as a text-formatted throttlerdata.Configuration protocol buffer message. 
(default "target_replication_lag_sec:2 max_replication_lag_sec:10 initial_rate:100 max_increase:1 emergency_decrease:0.5 min_duration_between_increases_sec:40 max_duration_between_increases_sec:62 min_duration_between_decreases_sec:20 spread_backlog_across_sec:20 age_bad_rate_after_sec:180 bad_rate_increase:0.1 max_rate_approach_threshold:0.9") --tx_throttler_healthcheck_cells strings A comma-separated list of cells. Only tabletservers running in these cells will be monitored for replication lag by the transaction throttler. --unhealthy_threshold duration replication lag after which a replica is considered unhealthy (default 2h0m0s) diff --git a/go/vt/vttablet/tabletserver/tabletenv/config.go b/go/vt/vttablet/tabletserver/tabletenv/config.go index 1be90478be2..631e880fe24 100644 --- a/go/vt/vttablet/tabletserver/tabletenv/config.go +++ b/go/vt/vttablet/tabletserver/tabletenv/config.go @@ -185,6 +185,7 @@ func registerTabletEnvFlags(fs *pflag.FlagSet) { flagutil.DualFormatStringListVar(fs, &currentConfig.TxThrottlerHealthCheckCells, "tx_throttler_healthcheck_cells", defaultConfig.TxThrottlerHealthCheckCells, "A comma-separated list of cells. Only tabletservers running in these cells will be monitored for replication lag by the transaction throttler.") fs.IntVar(&currentConfig.TxThrottlerDefaultPriority, "tx-throttler-default-priority", defaultConfig.TxThrottlerDefaultPriority, "Default priority assigned to queries that lack priority information") fs.Var(currentConfig.TxThrottlerTabletTypes, "tx-throttler-tablet-types", "A comma-separated list of tablet types. Only tablets of this type are monitored for replication lag by the transaction throttler. 
Supported types are replica and/or rdonly.") + fs.BoolVar(&currentConfig.TxThrottlerDryRun, "tx-throttler-dry-run", defaultConfig.TxThrottlerDryRun, "If present, the TxThrottler only records metrics about requests received and throttled, but does not actually throttle any requests.") fs.BoolVar(&enableHotRowProtection, "enable_hot_row_protection", false, "If true, incoming transactions for the same row (range) will be queued and cannot consume all txpool slots.") fs.BoolVar(&enableHotRowProtectionDryRun, "enable_hot_row_protection_dry_run", false, "If true, hot row protection is not enforced but logs if transactions would have been queued.") @@ -364,6 +365,7 @@ type TabletConfig struct { TxThrottlerHealthCheckCells []string `json:"-"` TxThrottlerDefaultPriority int `json:"-"` TxThrottlerTabletTypes *topoproto.TabletTypeListFlag `json:"-"` + TxThrottlerDryRun bool `json:"-"` EnableTableGC bool `json:"-"` // can be turned off programmatically by tests @@ -832,6 +834,7 @@ var defaultConfig = TabletConfig{ TxThrottlerHealthCheckCells: []string{}, TxThrottlerDefaultPriority: sqlparser.MaxPriorityValue, // This leads to all queries being candidates to throttle TxThrottlerTabletTypes: &topoproto.TabletTypeListFlag{topodatapb.TabletType_REPLICA}, + TxThrottlerDryRun: false, TransactionLimitConfig: defaultTransactionLimitConfig(), diff --git a/go/vt/vttablet/tabletserver/txthrottler/tx_throttler.go b/go/vt/vttablet/tabletserver/txthrottler/tx_throttler.go index 30e2ec19c56..ee24bb1361f 100644 --- a/go/vt/vttablet/tabletserver/txthrottler/tx_throttler.go +++ b/go/vt/vttablet/tabletserver/txthrottler/tx_throttler.go @@ -136,7 +136,7 @@ type txThrottler struct { // state holds an open transaction throttler state. It is nil // if the TransactionThrottler is closed. - state *txThrottlerState + state txThrottlerState target *querypb.Target topoServer *topo.Server @@ -155,6 +155,10 @@ type txThrottlerConfig struct { // returns false. 
enabled bool + // if dryRun is true, the txThrottler will run only on monitoring mode, meaning that it will increase counters for + // total and actually throttled requests, but it will not actually return that a transaction should be throttled. + dryRun bool + throttlerConfig *throttlerdatapb.Configuration // healthCheckCells stores the cell names in which running vttablets will be monitored for // replication lag. @@ -164,8 +168,14 @@ type txThrottlerConfig struct { tabletTypes *topoproto.TabletTypeListFlag } -// txThrottlerState holds the state of an open TxThrottler object. -type txThrottlerState struct { +type txThrottlerState interface { + deallocateResources() + StatsUpdate(tabletStats *discovery.TabletHealth) + throttle() bool +} + +// txThrottlerStateImpl holds the state of an open TxThrottler object. +type txThrottlerStateImpl struct { config *txThrottlerConfig // throttleMu serializes calls to throttler.Throttler.Throttle(threadId). @@ -193,6 +203,7 @@ func NewTxThrottler(env tabletenv.Env, topoServer *topo.Server) TxThrottler { throttlerConfig = &txThrottlerConfig{ enabled: true, + dryRun: env.Config().TxThrottlerDryRun, tabletTypes: env.Config().TxThrottlerTabletTypes, throttlerConfig: env.Config().TxThrottlerConfig.Get(), healthCheckCells: healthCheckCells, @@ -266,10 +277,10 @@ func (t *txThrottler) Throttle(priority int) (result bool) { t.requestsThrottled.Add(1) } - return result + return result && !t.config.dryRun } -func newTxThrottlerState(topoServer *topo.Server, config *txThrottlerConfig, target *querypb.Target) (*txThrottlerState, error) { +func newTxThrottlerState(topoServer *topo.Server, config *txThrottlerConfig, target *querypb.Target) (txThrottlerState, error) { maxReplicationLagModuleConfig := throttler.MaxReplicationLagModuleConfig{Configuration: config.throttlerConfig} t, err := throttlerFactory( @@ -286,7 +297,7 @@ func newTxThrottlerState(topoServer *topo.Server, config *txThrottlerConfig, tar t.Close() return nil, err } - result := 
&txThrottlerState{ + result := &txThrottlerStateImpl{ config: config, throttler: t, } @@ -309,7 +320,7 @@ func newTxThrottlerState(topoServer *topo.Server, config *txThrottlerConfig, tar return result, nil } -func createTxThrottlerHealthCheck(topoServer *topo.Server, config *txThrottlerConfig, result *txThrottlerState, cell string) { +func createTxThrottlerHealthCheck(topoServer *topo.Server, config *txThrottlerConfig, result *txThrottlerStateImpl, cell string) { ctx, cancel := context.WithCancel(context.Background()) result.stopHealthCheck = cancel result.healthCheck = healthCheckFactory(topoServer, cell, config.healthCheckCells) @@ -326,7 +337,7 @@ func createTxThrottlerHealthCheck(topoServer *topo.Server, config *txThrottlerCo }(ctx) } -func (ts *txThrottlerState) throttle() bool { +func (ts *txThrottlerStateImpl) throttle() bool { if ts.throttler == nil { log.Error("throttle called after deallocateResources was called") return false @@ -337,7 +348,7 @@ func (ts *txThrottlerState) throttle() bool { return ts.throttler.Throttle(0 /* threadId */) > 0 } -func (ts *txThrottlerState) deallocateResources() { +func (ts *txThrottlerStateImpl) deallocateResources() { // We don't really need to nil out the fields here // as deallocateResources is not expected to be called // more than once, but it doesn't hurt to do so. @@ -349,14 +360,14 @@ func (ts *txThrottlerState) deallocateResources() { ts.healthCheck.Close() ts.healthCheck = nil - // After ts.healthCheck is closed txThrottlerState.StatsUpdate() is guaranteed not + // After ts.healthCheck is closed txThrottlerStateImpl.StatsUpdate() is guaranteed not // to be executing, so we can safely close the throttler. ts.throttler.Close() ts.throttler = nil } // StatsUpdate updates the health of a tablet with the given healthcheck. 
-func (ts *txThrottlerState) StatsUpdate(tabletStats *discovery.TabletHealth) { +func (ts *txThrottlerStateImpl) StatsUpdate(tabletStats *discovery.TabletHealth) { if ts.config.tabletTypes == nil { return } diff --git a/go/vt/vttablet/tabletserver/txthrottler/tx_throttler_test.go b/go/vt/vttablet/tabletserver/txthrottler/tx_throttler_test.go index 52cf3d9396e..1eaf667156d 100644 --- a/go/vt/vttablet/tabletserver/txthrottler/tx_throttler_test.go +++ b/go/vt/vttablet/tabletserver/txthrottler/tx_throttler_test.go @@ -176,3 +176,54 @@ func TestNewTxThrottler(t *testing.T) { assert.Equal(t, []string{"cell1", "cell2"}, throttlerImpl.config.healthCheckCells) } } + +func TestDryRunThrottler(t *testing.T) { + config := tabletenv.NewDefaultConfig() + env := tabletenv.NewEnv(config, t.Name()) + + testCases := []struct { + Name string + txThrottlerStateShouldThrottle bool + throttlerDryRun bool + expectedResult bool + }{ + {Name: "Real run throttles when txThrottlerStateImpl says it should", txThrottlerStateShouldThrottle: true, throttlerDryRun: false, expectedResult: true}, + {Name: "Real run does not throttle when txThrottlerStateImpl says it should not", txThrottlerStateShouldThrottle: false, throttlerDryRun: false, expectedResult: false}, + {Name: "Dry run does not throttle when txThrottlerStateImpl says it should", txThrottlerStateShouldThrottle: true, throttlerDryRun: true, expectedResult: false}, + {Name: "Dry run does not throttle when txThrottlerStateImpl says it should not", txThrottlerStateShouldThrottle: false, throttlerDryRun: true, expectedResult: false}, + } + + for _, aTestCase := range testCases { + theTestCase := aTestCase + + t.Run(theTestCase.Name, func(t *testing.T) { + aTxThrottler := &txThrottler{ + config: &txThrottlerConfig{ + enabled: true, + dryRun: theTestCase.throttlerDryRun, + }, + state: &mockTxThrottlerState{shouldThrottle: theTestCase.txThrottlerStateShouldThrottle}, + throttlerRunning: env.Exporter().NewGauge("TransactionThrottlerRunning", 
"transaction throttler running state"), + requestsTotal: env.Exporter().NewCounter("TransactionThrottlerRequests", "transaction throttler requests"), + requestsThrottled: env.Exporter().NewCounter("TransactionThrottlerThrottled", "transaction throttler requests throttled"), + } + + assert.Equal(t, theTestCase.expectedResult, aTxThrottler.Throttle(100)) + }) + } +} + +type mockTxThrottlerState struct { + shouldThrottle bool +} + +func (t *mockTxThrottlerState) deallocateResources() { + +} +func (t *mockTxThrottlerState) StatsUpdate(tabletStats *discovery.TabletHealth) { + +} + +func (t *mockTxThrottlerState) throttle() bool { + return t.shouldThrottle +} From 50c0db045991fff19f66e96db4bad183c69967b1 Mon Sep 17 00:00:00 2001 From: "Eduardo J. Ortega U" <5791035+ejortegau@users.noreply.github.com> Date: Tue, 25 Jul 2023 12:47:53 +0200 Subject: [PATCH 2/5] Fix unit test Signed-off-by: Eduardo J. Ortega U <5791035+ejortegau@users.noreply.github.com> --- go/flags/endtoend/vttablet.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go/flags/endtoend/vttablet.txt b/go/flags/endtoend/vttablet.txt index 276c8342bde..d95dd2bfb57 100644 --- a/go/flags/endtoend/vttablet.txt +++ b/go/flags/endtoend/vttablet.txt @@ -347,9 +347,9 @@ Usage of vttablet: --twopc_enable if the flag is on, 2pc is enabled. Other 2pc flags must be supplied. 
--tx-throttler-config string Synonym to -tx_throttler_config (default "target_replication_lag_sec:2 max_replication_lag_sec:10 initial_rate:100 max_increase:1 emergency_decrease:0.5 min_duration_between_increases_sec:40 max_duration_between_increases_sec:62 min_duration_between_decreases_sec:20 spread_backlog_across_sec:20 age_bad_rate_after_sec:180 bad_rate_increase:0.1 max_rate_approach_threshold:0.9") --tx-throttler-default-priority int Default priority assigned to queries that lack priority information (default 100) + --tx-throttler-dry-run If present, the TxThrottler only records metrics about requests received and throttled, but does not actually throttle any requests. --tx-throttler-healthcheck-cells strings Synonym to -tx_throttler_healthcheck_cells --tx-throttler-tablet-types strings A comma-separated list of tablet types. Only tablets of this type are monitored for replication lag by the transaction throttler. Supported types are replica and/or rdonly. (default replica) - --tx-throttler-dry-run If present, the TxThrottler only records metrics about requests received and throttled, but does not actually throttle any requests. --tx_throttler_config string The configuration of the transaction throttler as a text-formatted throttlerdata.Configuration protocol buffer message. (default "target_replication_lag_sec:2 max_replication_lag_sec:10 initial_rate:100 max_increase:1 emergency_decrease:0.5 min_duration_between_increases_sec:40 max_duration_between_increases_sec:62 min_duration_between_decreases_sec:20 spread_backlog_across_sec:20 age_bad_rate_after_sec:180 bad_rate_increase:0.1 max_rate_approach_threshold:0.9") --tx_throttler_healthcheck_cells strings A comma-separated list of cells. Only tabletservers running in these cells will be monitored for replication lag by the transaction throttler. 
--unhealthy_threshold duration replication lag after which a replica is considered unhealthy (default 2h0m0s) From d2dc737e69b6e8d6e8f52204e7349fa14996b744 Mon Sep 17 00:00:00 2001 From: "Eduardo J. Ortega U" <5791035+ejortegau@users.noreply.github.com> Date: Wed, 26 Jul 2023 09:51:40 +0200 Subject: [PATCH 3/5] Include TxThrottler dry run mode in release notes. Signed-off-by: Eduardo J. Ortega U <5791035+ejortegau@users.noreply.github.com> --- changelog/18.0/18.0.0/summary.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/changelog/18.0/18.0.0/summary.md b/changelog/18.0/18.0.0/summary.md index 2bf76482316..ef305211f8c 100644 --- a/changelog/18.0/18.0.0/summary.md +++ b/changelog/18.0/18.0.0/summary.md @@ -89,6 +89,14 @@ Vitess upgrade process from an earlier version if you need to use such a workflo Any MoveTables or Migrate workflow that moves a sequence table should only be run after all vitess components have been upgraded, and no upgrade should be done while such a workflow is in progress. +#### New Dry-run/monitoring-only mode for the transaction throttler + +A new CLI flag `--tx-throttler-dry-run` to set the Transaction Throttler to monitoring-only/dry-run mode has been added. +If the transaction throttler is enabled with `--enable-tx-throttler` and the new dry-run flag is also specified, the +tablet will not actually throttle any transactions; however, it will increase the counters for transactions throttled +(`vttablet_transaction_throttler_throttled`). This allows to deploy the transaction throttler in production and gain +observability on how much throttling would take place, without actually throttling any requests. + ### Docker #### Bookworm added and made default From ae33e6779f8db9bc920c5ffe7befb212a495260f Mon Sep 17 00:00:00 2001 From: "Eduardo J. Ortega U" <5791035+ejortegau@users.noreply.github.com> Date: Thu, 27 Jul 2023 12:54:21 +0200 Subject: [PATCH 4/5] Address PR comments. Signed-off-by: Eduardo J. 
Ortega U <5791035+ejortegau@users.noreply.github.com> --- go/flags/endtoend/vttablet.txt | 2 +- go/vt/vttablet/tabletserver/tabletenv/config.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/go/flags/endtoend/vttablet.txt b/go/flags/endtoend/vttablet.txt index d95dd2bfb57..61c0ea41d00 100644 --- a/go/flags/endtoend/vttablet.txt +++ b/go/flags/endtoend/vttablet.txt @@ -347,7 +347,7 @@ Usage of vttablet: --twopc_enable if the flag is on, 2pc is enabled. Other 2pc flags must be supplied. --tx-throttler-config string Synonym to -tx_throttler_config (default "target_replication_lag_sec:2 max_replication_lag_sec:10 initial_rate:100 max_increase:1 emergency_decrease:0.5 min_duration_between_increases_sec:40 max_duration_between_increases_sec:62 min_duration_between_decreases_sec:20 spread_backlog_across_sec:20 age_bad_rate_after_sec:180 bad_rate_increase:0.1 max_rate_approach_threshold:0.9") --tx-throttler-default-priority int Default priority assigned to queries that lack priority information (default 100) - --tx-throttler-dry-run If present, the TxThrottler only records metrics about requests received and throttled, but does not actually throttle any requests. + --tx-throttler-dry-run If present, the transaction throttler only records metrics about requests received and throttled, but does not actually throttle any requests. --tx-throttler-healthcheck-cells strings Synonym to -tx_throttler_healthcheck_cells --tx-throttler-tablet-types strings A comma-separated list of tablet types. Only tablets of this type are monitored for replication lag by the transaction throttler. Supported types are replica and/or rdonly. (default replica) --tx_throttler_config string The configuration of the transaction throttler as a text-formatted throttlerdata.Configuration protocol buffer message. 
(default "target_replication_lag_sec:2 max_replication_lag_sec:10 initial_rate:100 max_increase:1 emergency_decrease:0.5 min_duration_between_increases_sec:40 max_duration_between_increases_sec:62 min_duration_between_decreases_sec:20 spread_backlog_across_sec:20 age_bad_rate_after_sec:180 bad_rate_increase:0.1 max_rate_approach_threshold:0.9") diff --git a/go/vt/vttablet/tabletserver/tabletenv/config.go b/go/vt/vttablet/tabletserver/tabletenv/config.go index 631e880fe24..aaaae26c23d 100644 --- a/go/vt/vttablet/tabletserver/tabletenv/config.go +++ b/go/vt/vttablet/tabletserver/tabletenv/config.go @@ -185,7 +185,7 @@ func registerTabletEnvFlags(fs *pflag.FlagSet) { flagutil.DualFormatStringListVar(fs, &currentConfig.TxThrottlerHealthCheckCells, "tx_throttler_healthcheck_cells", defaultConfig.TxThrottlerHealthCheckCells, "A comma-separated list of cells. Only tabletservers running in these cells will be monitored for replication lag by the transaction throttler.") fs.IntVar(&currentConfig.TxThrottlerDefaultPriority, "tx-throttler-default-priority", defaultConfig.TxThrottlerDefaultPriority, "Default priority assigned to queries that lack priority information") fs.Var(currentConfig.TxThrottlerTabletTypes, "tx-throttler-tablet-types", "A comma-separated list of tablet types. Only tablets of this type are monitored for replication lag by the transaction throttler. 
Supported types are replica and/or rdonly.") - fs.BoolVar(&currentConfig.TxThrottlerDryRun, "tx-throttler-dry-run", defaultConfig.TxThrottlerDryRun, "If present, the TxThrottler only records metrics about requests received and throttled, but does not actually throttle any requests.") + fs.BoolVar(&currentConfig.TxThrottlerDryRun, "tx-throttler-dry-run", defaultConfig.TxThrottlerDryRun, "If present, the transaction throttler only records metrics about requests received and throttled, but does not actually throttle any requests.") fs.BoolVar(&enableHotRowProtection, "enable_hot_row_protection", false, "If true, incoming transactions for the same row (range) will be queued and cannot consume all txpool slots.") fs.BoolVar(&enableHotRowProtectionDryRun, "enable_hot_row_protection_dry_run", false, "If true, hot row protection is not enforced but logs if transactions would have been queued.") From 3fd31f7e0ab45a9e680edf0f37c5e92d592651ec Mon Sep 17 00:00:00 2001 From: "Eduardo J. Ortega U" <5791035+ejortegau@users.noreply.github.com> Date: Wed, 2 Aug 2023 10:07:50 +0200 Subject: [PATCH 5/5] Address PR comment. Signed-off-by: Eduardo J. Ortega U <5791035+ejortegau@users.noreply.github.com> --- changelog/18.0/18.0.0/summary.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/changelog/18.0/18.0.0/summary.md b/changelog/18.0/18.0.0/summary.md index 10aa505b0a0..903e6a95895 100644 --- a/changelog/18.0/18.0.0/summary.md +++ b/changelog/18.0/18.0.0/summary.md @@ -106,8 +106,8 @@ upgraded, and no upgrade should be done while such a workflow is in progress. A new CLI flag `--tx-throttler-dry-run` to set the Transaction Throttler to monitoring-only/dry-run mode has been added. If the transaction throttler is enabled with `--enable-tx-throttler` and the new dry-run flag is also specified, the tablet will not actually throttle any transactions; however, it will increase the counters for transactions throttled -(`vttablet_transaction_throttler_throttled`). 
This allows to deploy the transaction throttler in production and gain -observability on how much throttling would take place, without actually throttling any requests. +(`vttablet_transaction_throttler_throttled`). This allows users to deploy the transaction throttler in production and +gain observability on how much throttling would take place, without actually throttling any requests. ### Docker