From aede9e6826484b6f1938fc69f123da42ef482559 Mon Sep 17 00:00:00 2001 From: Michael Berlin Date: Thu, 20 Aug 2015 23:48:26 -0700 Subject: [PATCH] vtctl: WaitForFilteredReplication runs explicit health check before checking health records. Removed previous heuristic to ignore health error messages because it was aimed at the same symptom: Healthcheck didn't caught up and we needed to wait for it. --- go/vt/vtctl/vtctl.go | 26 ++++++------------- .../wait_for_filtered_replication_test.go | 11 +++++++- 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/go/vt/vtctl/vtctl.go b/go/vt/vtctl/vtctl.go index 273fcfebe94..dce59a78c3c 100644 --- a/go/vt/vtctl/vtctl.go +++ b/go/vt/vtctl/vtctl.go @@ -1340,11 +1340,6 @@ func commandWaitForFilteredReplication(ctx context.Context, wr *wrangler.Wrangle "Specifies the maximum delay, in seconds, the filtered replication of the"+ " given destination shard should lag behind the source shard. When"+ " higher, the command will block and wait for the delay to decrease.") - // In case of automated reshardings, a tablet may still report itself as - // unhealthy e.g. because CopySchemaShard wasn't run yet and the db doesn't - // exist yet. - allowedHealthErrorsInARow := subFlags.Int("allowed_health_errors_in_a_row", 3, - "Limit of observed health errors in a row after which the command will fail and no longer wait for the tablet to become healthy.") if err := subFlags.Parse(args); err != nil { return err @@ -1376,6 +1371,13 @@ func commandWaitForFilteredReplication(ctx context.Context, wr *wrangler.Wrangle return fmt.Errorf("cannot get EndPoint for master tablet record: %v record: %v", err, tabletInfo) } + // Always run an explicit healthcheck first to make sure we don't see any outdated values. + // This is especially true for tests and automation where there is no pause of multiple seconds + // between commands and the periodic healthcheck did not run again yet. + if err := wr.TabletManagerClient().RunHealthCheck(ctx, tabletInfo, pb.TabletType_REPLICA); err != nil { + return fmt.Errorf("failed to run explicit healthcheck on tablet: %v err: %v", tabletInfo, err) + } + // pass in a non-UNKNOWN tablet type to not use sessionId conn, err := tabletconn.GetDialer()(ctx, ep, "", "", pb.TabletType_MASTER, 30*time.Second) if err != nil { @@ -1387,7 +1389,6 @@ func commandWaitForFilteredReplication(ctx context.Context, wr *wrangler.Wrangle return fmt.Errorf("could not stream health records from tablet: %v err: %v", alias, err) } var lastSeenDelay int - healthErrorsInARow := 0 for { select { case <-ctx.Done(): @@ -1401,19 +1402,8 @@ func commandWaitForFilteredReplication(ctx context.Context, wr *wrangler.Wrangle return fmt.Errorf("health record does not include RealtimeStats message. tablet: %v health record: %v", alias, shr) } if stats.HealthError != "" { - healthErrorsInARow++ - if healthErrorsInARow >= *allowedHealthErrorsInARow { - return fmt.Errorf("tablet is not healthy. tablet: %v health record: %v", alias, shr) - } - wr.Logger().Printf("Tablet is not healthy. Waiting for %v more health"+ - " record(s) before the command will fail."+ - " tablet: %v health record: %v\n", - (*allowedHealthErrorsInARow - healthErrorsInARow), alias, shr) - continue - } else { - healthErrorsInARow = 0 + return fmt.Errorf("tablet is not healthy. tablet: %v health record: %v", alias, shr) } - if stats.BinlogPlayersCount == 0 { return fmt.Errorf("no filtered replication running on tablet: %v health record: %v", alias, shr) } diff --git a/go/vt/wrangler/testlib/wait_for_filtered_replication_test.go b/go/vt/wrangler/testlib/wait_for_filtered_replication_test.go index 33e73509542..a94aa008ec2 100644 --- a/go/vt/wrangler/testlib/wait_for_filtered_replication_test.go +++ b/go/vt/wrangler/testlib/wait_for_filtered_replication_test.go @@ -14,6 +14,7 @@ import ( "golang.org/x/net/context" "github.com/youtube/vitess/go/vt/logutil" + "github.com/youtube/vitess/go/vt/tabletmanager" "github.com/youtube/vitess/go/vt/tabletmanager/tmclient" "github.com/youtube/vitess/go/vt/tabletserver" "github.com/youtube/vitess/go/vt/tabletserver/grpcqueryservice" @@ -63,7 +64,7 @@ func TestWaitForFilteredReplication_noFilteredReplication(t *testing.T) { } // TestWaitForFilteredReplication_unhealthy checks that -// vtctl WaitForFilteredReplication fails eventually when a tablet is not healthy. +// vtctl WaitForFilteredReplication fails when a tablet is not healthy. func TestWaitForFilteredReplication_unhealthy(t *testing.T) { unhealthy := &pbq.RealtimeStats{ HealthError: "WaitForFilteredReplication: unhealthy test", @@ -94,6 +95,14 @@ func waitForFilteredReplication(t *testing.T, expectedErr string, initialStats * ctx := context.Background() wr.SetSourceShards(ctx, keyspace, destShard, []*pbt.TabletAlias{source.Tablet.GetAlias()}, nil) + // Set a BinlogPlayerMap to avoid a nil panic when the explicit RunHealthCheck + // is called by WaitForFilteredReplication. + // Note that for this test we don't mock the BinlogPlayerMap i.e. although + // its state says no filtered replication is running, the code under test will + // observe otherwise because we call SqlQuery.BroadcastHealth() directly and + // skip going through the tabletmanager's agent. + dest.Agent.BinlogPlayerMap = tabletmanager.NewBinlogPlayerMap(ts, nil, nil) + // Use real, but trimmed down QueryService. testConfig := tabletserver.DefaultQsConfig testConfig.EnablePublishStats = false