Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

vtctl: WaitForFilteredReplication runs explicit health check before c… #1016

Merged
merged 1 commit into from
Aug 21, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 8 additions & 18 deletions go/vt/vtctl/vtctl.go
Original file line number Diff line number Diff line change
Expand Up @@ -1340,11 +1340,6 @@ func commandWaitForFilteredReplication(ctx context.Context, wr *wrangler.Wrangle
"Specifies the maximum delay, in seconds, the filtered replication of the"+
" given destination shard should lag behind the source shard. When"+
" higher, the command will block and wait for the delay to decrease.")
// In case of automated reshardings, a tablet may still report itself as
// unhealthy e.g. because CopySchemaShard wasn't run yet and the db doesn't
// exist yet.
allowedHealthErrorsInARow := subFlags.Int("allowed_health_errors_in_a_row", 3,
"Limit of observed health errors in a row after which the command will fail and no longer wait for the tablet to become healthy.")

if err := subFlags.Parse(args); err != nil {
return err
Expand Down Expand Up @@ -1376,6 +1371,13 @@ func commandWaitForFilteredReplication(ctx context.Context, wr *wrangler.Wrangle
return fmt.Errorf("cannot get EndPoint for master tablet record: %v record: %v", err, tabletInfo)
}

// Always run an explicit healthcheck first to make sure we don't see any outdated values.
// This is especially true for tests and automation where there is no pause of multiple seconds
// between commands and the periodic healthcheck did not run again yet.
if err := wr.TabletManagerClient().RunHealthCheck(ctx, tabletInfo, pb.TabletType_REPLICA); err != nil {
return fmt.Errorf("failed to run explicit healthcheck on tablet: %v err: %v", tabletInfo, err)
}

// pass in a non-UNKNOWN tablet type to not use sessionId
conn, err := tabletconn.GetDialer()(ctx, ep, "", "", pb.TabletType_MASTER, 30*time.Second)
if err != nil {
Expand All @@ -1387,7 +1389,6 @@ func commandWaitForFilteredReplication(ctx context.Context, wr *wrangler.Wrangle
return fmt.Errorf("could not stream health records from tablet: %v err: %v", alias, err)
}
var lastSeenDelay int
healthErrorsInARow := 0
for {
select {
case <-ctx.Done():
Expand All @@ -1401,19 +1402,8 @@ func commandWaitForFilteredReplication(ctx context.Context, wr *wrangler.Wrangle
return fmt.Errorf("health record does not include RealtimeStats message. tablet: %v health record: %v", alias, shr)
}
if stats.HealthError != "" {
healthErrorsInARow++
if healthErrorsInARow >= *allowedHealthErrorsInARow {
return fmt.Errorf("tablet is not healthy. tablet: %v health record: %v", alias, shr)
}
wr.Logger().Printf("Tablet is not healthy. Waiting for %v more health"+
" record(s) before the command will fail."+
" tablet: %v health record: %v\n",
(*allowedHealthErrorsInARow - healthErrorsInARow), alias, shr)
continue
} else {
healthErrorsInARow = 0
return fmt.Errorf("tablet is not healthy. tablet: %v health record: %v", alias, shr)
}

if stats.BinlogPlayersCount == 0 {
return fmt.Errorf("no filtered replication running on tablet: %v health record: %v", alias, shr)
}
Expand Down
11 changes: 10 additions & 1 deletion go/vt/wrangler/testlib/wait_for_filtered_replication_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"golang.org/x/net/context"

"github.com/youtube/vitess/go/vt/logutil"
"github.com/youtube/vitess/go/vt/tabletmanager"
"github.com/youtube/vitess/go/vt/tabletmanager/tmclient"
"github.com/youtube/vitess/go/vt/tabletserver"
"github.com/youtube/vitess/go/vt/tabletserver/grpcqueryservice"
Expand Down Expand Up @@ -63,7 +64,7 @@ func TestWaitForFilteredReplication_noFilteredReplication(t *testing.T) {
}

// TestWaitForFilteredReplication_unhealthy checks that
// vtctl WaitForFilteredReplication fails eventually when a tablet is not healthy.
// vtctl WaitForFilteredReplication fails when a tablet is not healthy.
func TestWaitForFilteredReplication_unhealthy(t *testing.T) {
unhealthy := &pbq.RealtimeStats{
HealthError: "WaitForFilteredReplication: unhealthy test",
Expand Down Expand Up @@ -94,6 +95,14 @@ func waitForFilteredReplication(t *testing.T, expectedErr string, initialStats *
ctx := context.Background()
wr.SetSourceShards(ctx, keyspace, destShard, []*pbt.TabletAlias{source.Tablet.GetAlias()}, nil)

// Set a BinlogPlayerMap to avoid a nil panic when the explicit RunHealthCheck
// is called by WaitForFilteredReplication.
// Note that for this test we don't mock the BinlogPlayerMap i.e. although
// its state says no filtered replication is running, the code under test will
// observe otherwise because we call SqlQuery.BroadcastHealth() directly and
// skip going through the tabletmanager's agent.
dest.Agent.BinlogPlayerMap = tabletmanager.NewBinlogPlayerMap(ts, nil, nil)

// Use real, but trimmed down QueryService.
testConfig := tabletserver.DefaultQsConfig
testConfig.EnablePublishStats = false
Expand Down