Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

check keyspace snapshot time if none specified for backup restores #13557

Merged
merged 7 commits into from
Jul 27, 2023
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions go/test/endtoend/recovery/recovery_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,17 +51,20 @@ func VerifyQueriesUsingVtgate(t *testing.T, session *vtgateconn.VTGateSession, q
}

// RestoreTablet performs a PITR restore.
func RestoreTablet(t *testing.T, localCluster *cluster.LocalProcessCluster, tablet *cluster.Vttablet, restoreKSName string, shardName string, keyspaceName string, commonTabletArg []string) {
func RestoreTablet(t *testing.T, localCluster *cluster.LocalProcessCluster, tablet *cluster.Vttablet, restoreKSName string, shardName string, keyspaceName string, commonTabletArg []string, restoreTime time.Time) {
tablet.ValidateTabletRestart(t)
replicaTabletArgs := commonTabletArg

_, err := localCluster.VtctlProcess.ExecuteCommandWithOutput("GetKeyspace", restoreKSName)

if restoreTime.IsZero() {
restoreTime = time.Now().UTC()
}

if err != nil {
tm := time.Now().UTC()
_, err := localCluster.VtctlProcess.ExecuteCommandWithOutput("CreateKeyspace", "--",
"--keyspace_type=SNAPSHOT", "--base_keyspace="+keyspaceName,
"--snapshot_time", tm.Format(time.RFC3339), restoreKSName)
"--snapshot_time", restoreTime.Format(time.RFC3339), restoreKSName)
require.Nil(t, err)
}

Expand Down
100 changes: 50 additions & 50 deletions go/test/endtoend/recovery/unshardedrecovery/recovery.go
Original file line number Diff line number Diff line change
Expand Up @@ -185,100 +185,100 @@ func TestMainImpl(m *testing.M) {
}

// TestRecoveryImpl does the following:
// - create a shard with primary and replica1 only
// - run InitShardPrimary
// - insert some data
// - take a backup
// - insert more data on the primary
// - take another backup
// - create a recovery keyspace after first backup
// - bring up tablet_replica2 in the new keyspace
// - check that new tablet does not have data created after backup1
// - create second recovery keyspace after second backup
// - bring up tablet_replica3 in second keyspace
// - check that new tablet has data created after backup1 but not data created after backup2
// - check that vtgate queries work correctly
// 1. create a shard with primary and replica1 only
// - run InitShardPrimary
// - insert some data
//
// 2. take a backup
// 3. create a recovery keyspace after first backup
// - bring up tablet_replica2 in the new keyspace
// - check that new tablet has data from backup1
//
// 4. insert more data on the primary
// 5. take another backup
// 6. create a recovery keyspace after second backup
// - bring up tablet_replica3 in the new keyspace
// - check that new tablet has data from backup2
pbibra marked this conversation as resolved.
Show resolved Hide resolved
//
// 7. insert more data on the primary
// 8. take another backup
// 9. create a recovery keyspace after second backup again
// - bring up tablet_replica4 in the new keyspace
// - check that new tablet has data from backup2 but not backup3
pbibra marked this conversation as resolved.
Show resolved Hide resolved
//
// 10. check that vtgate queries work correctly
func TestRecoveryImpl(t *testing.T) {
defer cluster.PanicHandler(t)
defer tabletsTeardown()
verifyInitialReplication(t)

// take first backup of value = test1
err := localCluster.VtctlclientProcess.ExecuteCommand("Backup", replica1.Alias)
assert.NoError(t, err)

backups := listBackups(t)
require.Equal(t, len(backups), 1)
assert.Contains(t, backups[0], replica1.Alias)

_, err = primary.VttabletProcess.QueryTablet("insert into vt_insert_test (msg) values ('test2')", keyspaceName, true)
assert.NoError(t, err)
cluster.VerifyRowsInTablet(t, replica1, keyspaceName, 2)

err = localCluster.VtctlclientProcess.ApplyVSchema(keyspaceName, vSchema)
assert.NoError(t, err)

output, err := localCluster.VtctlclientProcess.ExecuteCommandWithOutput("GetVSchema", keyspaceName)
assert.NoError(t, err)
assert.Contains(t, output, "vt_insert_test")

recovery.RestoreTablet(t, localCluster, replica2, recoveryKS1, "0", keyspaceName, commonTabletArg)
// restore with latest backup
restoreTime := time.Now().UTC()
recovery.RestoreTablet(t, localCluster, replica2, recoveryKS1, "0", keyspaceName, commonTabletArg, restoreTime)

output, err = localCluster.VtctlclientProcess.ExecuteCommandWithOutput("GetSrvVSchema", cell)
assert.NoError(t, err)
assert.Contains(t, output, keyspaceName)
assert.Contains(t, output, recoveryKS1)

err = localCluster.VtctlclientProcess.ExecuteCommand("GetSrvKeyspace", cell, keyspaceName)
assert.NoError(t, err)

output, err = localCluster.VtctlclientProcess.ExecuteCommandWithOutput("GetVSchema", recoveryKS1)
assert.NoError(t, err)
assert.Contains(t, output, "vt_insert_test")

cluster.VerifyRowsInTablet(t, replica2, keyspaceName, 1)

// verify that restored replica has value = test1
qr, err := replica2.VttabletProcess.QueryTablet("select msg from vt_insert_test where id = 1", keyspaceName, true)
assert.NoError(t, err)
assert.Equal(t, "test1", qr.Rows[0][0].ToString())

// insert new row on primary
_, err = primary.VttabletProcess.QueryTablet("insert into vt_insert_test (msg) values ('test2')", keyspaceName, true)
assert.NoError(t, err)
cluster.VerifyRowsInTablet(t, replica1, keyspaceName, 2)

// update the original row in primary
_, err = primary.VttabletProcess.QueryTablet("update vt_insert_test set msg = 'msgx1' where id = 1", keyspaceName, true)
assert.NoError(t, err)

// verify that primary has new value
qr, err := primary.VttabletProcess.QueryTablet("select msg from vt_insert_test where id = 1", keyspaceName, true)
qr, err = primary.VttabletProcess.QueryTablet("select msg from vt_insert_test where id = 1", keyspaceName, true)
assert.NoError(t, err)
assert.Equal(t, "msgx1", qr.Rows[0][0].ToString())

// verify that restored replica has old value
qr, err = replica2.VttabletProcess.QueryTablet("select msg from vt_insert_test where id = 1", keyspaceName, true)
assert.NoError(t, err)
assert.Equal(t, "test1", qr.Rows[0][0].ToString())

// take second backup of value = msgx1
err = localCluster.VtctlclientProcess.ExecuteCommand("Backup", replica1.Alias)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We're missing validation that replica1 has the value "msgx1". The backup of replica1 might only contain "test1", which means the fact that we restore and get "test1" does not indicate which backup we've restored.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I should add that the way to resolve this is to run a context.WithTimeout() + time.Ticker select loop waiting for the data to appear in replica1. Our experience shows that in GitHub CI things can be unbelievably slow. I'd say give this a timeout of 10s.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

updated!

assert.NoError(t, err)

_, err = primary.VttabletProcess.QueryTablet("insert into vt_insert_test (msg) values ('test3')", keyspaceName, true)
assert.NoError(t, err)
cluster.VerifyRowsInTablet(t, replica1, keyspaceName, 3)

recovery.RestoreTablet(t, localCluster, replica3, recoveryKS2, "0", keyspaceName, commonTabletArg)
// restore to first backup
recovery.RestoreTablet(t, localCluster, replica3, recoveryKS2, "0", keyspaceName, commonTabletArg, restoreTime)

output, err = localCluster.VtctlclientProcess.ExecuteCommandWithOutput("GetVSchema", recoveryKS2)
assert.NoError(t, err)
assert.Contains(t, output, "vt_insert_test")

cluster.VerifyRowsInTablet(t, replica3, keyspaceName, 2)

// update the original row in primary
_, err = primary.VttabletProcess.QueryTablet("update vt_insert_test set msg = 'msgx2' where id = 1", keyspaceName, true)
assert.NoError(t, err)

// verify that primary has new value
qr, err = primary.VttabletProcess.QueryTablet("select msg from vt_insert_test where id = 1", keyspaceName, true)
assert.NoError(t, err)
assert.Equal(t, "msgx2", qr.Rows[0][0].ToString())
// only one row from first backup
cluster.VerifyRowsInTablet(t, replica3, keyspaceName, 1)

// verify that restored replica has old value
// verify that restored replica has value = test1
qr, err = replica3.VttabletProcess.QueryTablet("select msg from vt_insert_test where id = 1", keyspaceName, true)
assert.NoError(t, err)
assert.Equal(t, "msgx1", qr.Rows[0][0].ToString())
assert.Equal(t, "test1", qr.Rows[0][0].ToString())

vtgateInstance := localCluster.NewVtgateInstance()
vtgateInstance.TabletTypesToWait = "REPLICA"
Expand All @@ -299,26 +299,26 @@ func TestRecoveryImpl(t *testing.T) {
session := vtgateConn.Session("@replica", nil)

// check that vtgate doesn't route queries to new tablet
recovery.VerifyQueriesUsingVtgate(t, session, "select count(*) from vt_insert_test", "INT64(3)")
recovery.VerifyQueriesUsingVtgate(t, session, "select msg from vt_insert_test where id = 1", `VARCHAR("msgx2")`)
recovery.VerifyQueriesUsingVtgate(t, session, "select count(*) from vt_insert_test", "INT64(2)")
recovery.VerifyQueriesUsingVtgate(t, session, "select msg from vt_insert_test where id = 1", `VARCHAR("msgx1")`)
recovery.VerifyQueriesUsingVtgate(t, session, fmt.Sprintf("select count(*) from %s.vt_insert_test", recoveryKS1), "INT64(1)")
recovery.VerifyQueriesUsingVtgate(t, session, fmt.Sprintf("select msg from %s.vt_insert_test where id = 1", recoveryKS1), `VARCHAR("test1")`)
recovery.VerifyQueriesUsingVtgate(t, session, fmt.Sprintf("select count(*) from %s.vt_insert_test", recoveryKS2), "INT64(2)")
recovery.VerifyQueriesUsingVtgate(t, session, fmt.Sprintf("select msg from %s.vt_insert_test where id = 1", recoveryKS2), `VARCHAR("msgx1")`)
recovery.VerifyQueriesUsingVtgate(t, session, fmt.Sprintf("select count(*) from %s.vt_insert_test", recoveryKS2), "INT64(1)")
recovery.VerifyQueriesUsingVtgate(t, session, fmt.Sprintf("select msg from %s.vt_insert_test where id = 1", recoveryKS2), `VARCHAR("test1")`)

// check that new keyspace is accessible with 'use ks'
cluster.ExecuteQueriesUsingVtgate(t, session, "use "+recoveryKS1+"@replica")
recovery.VerifyQueriesUsingVtgate(t, session, "select count(*) from vt_insert_test", "INT64(1)")

cluster.ExecuteQueriesUsingVtgate(t, session, "use "+recoveryKS2+"@replica")
recovery.VerifyQueriesUsingVtgate(t, session, "select count(*) from vt_insert_test", "INT64(2)")
recovery.VerifyQueriesUsingVtgate(t, session, "select count(*) from vt_insert_test", "INT64(1)")

// check that new tablet is accessible with use `ks:shard`
cluster.ExecuteQueriesUsingVtgate(t, session, "use `"+recoveryKS1+":0@replica`")
recovery.VerifyQueriesUsingVtgate(t, session, "select count(*) from vt_insert_test", "INT64(1)")

cluster.ExecuteQueriesUsingVtgate(t, session, "use `"+recoveryKS2+":0@replica`")
recovery.VerifyQueriesUsingVtgate(t, session, "select count(*) from vt_insert_test", "INT64(2)")
recovery.VerifyQueriesUsingVtgate(t, session, "select count(*) from vt_insert_test", "INT64(1)")
}

// verifyInitialReplication will create schema in primary, insert some data to primary and verify the same data in replica.
Expand Down
7 changes: 6 additions & 1 deletion go/vt/vttablet/tabletmanager/restore.go
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,11 @@ func (tm *TabletManager) restoreDataLocked(ctx context.Context, logger logutil.L
log.Infof("Using base_keyspace %v to restore keyspace %v using a backup time of %v", keyspace, tablet.Keyspace, logutil.ProtoToTime(request.BackupTime))
}

startTime := logutil.ProtoToTime(request.BackupTime)
if startTime.IsZero() {
startTime = logutil.ProtoToTime(keyspaceInfo.SnapshotTime)
}

params := mysqlctl.RestoreParams{
Cnf: tm.Cnf,
Mysqld: tm.MysqlDaemon,
Expand All @@ -195,7 +200,7 @@ func (tm *TabletManager) restoreDataLocked(ctx context.Context, logger logutil.L
DbName: topoproto.TabletDbName(tablet),
Keyspace: keyspace,
Shard: tablet.Shard,
StartTime: logutil.ProtoToTime(request.BackupTime),
StartTime: startTime,
DryRun: request.DryRun,
Stats: backupstats.RestoreStats(),
}
Expand Down