From 44fd2e4354fccda0e90afae67733bfbae384d3b4 Mon Sep 17 00:00:00 2001 From: weekface Date: Wed, 3 Apr 2019 13:52:45 +0800 Subject: [PATCH 1/4] refactor stability main function --- .gitignore | 1 + tests/actions.go | 402 ++++-------- tests/backup/backupcase.go | 23 +- tests/cluster_info.go | 18 +- tests/cmd/e2e/main.go | 28 +- tests/cmd/stability/main.go | 587 +++++------------- tests/config.go | 38 +- tests/failover.go | 269 ++++++++ tests/fault.go | 135 +++- tests/log_dump.go | 2 +- .../stability/stability-configmap.yaml | 32 +- tests/manifests/stability/stability.yaml | 9 +- tests/pkg/blockwriter/blockwriter.go | 8 +- tests/pkg/client/client.go | 29 + tests/pkg/fault-trigger/manager/types.go | 11 +- tests/pkg/fault-trigger/manager/vm.go | 3 +- tests/pkg/fault-trigger/manager/vm_test.go | 11 +- tests/pkg/workload/ddl/internal/ddl.go | 20 +- tests/pkg/workload/ddl/internal/ddl_ops.go | 8 +- tests/pkg/workload/ddl/internal/dml_ops.go | 6 +- tests/pkg/workload/ddl/internal/run.go | 4 +- tests/util.go | 41 -- 22 files changed, 834 insertions(+), 851 deletions(-) create mode 100644 tests/failover.go create mode 100644 tests/pkg/client/client.go delete mode 100644 tests/util.go diff --git a/.gitignore b/.gitignore index 31fb962bf4..e8c41829a9 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ tests/images/stability-test/bin/ tests/images/e2e/bin/ tests/images/fault-trigger/bin/ tests/images/e2e/tidb-cluster/ +tests/images/e2e/tidb-backup/ tests/images/e2e/tidb-operator/ *.tar tmp/ diff --git a/tests/actions.go b/tests/actions.go index e108b52afe..11bd9ac478 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -21,7 +21,6 @@ import ( "net/http" "net/url" "os/exec" - "sort" "strconv" "strings" "time" @@ -67,42 +66,51 @@ func NewOperatorActions(cli versioned.Interface, kubeCli kubernetes.Interface, c const ( DefaultPollTimeout time.Duration = 10 * time.Minute - DefaultPollInterval time.Duration = 10 * time.Second + DefaultPollInterval time.Duration = 1 * time.Minute getBackupDirPodName = "get-backup-dir" grafanaUsername = "admin" grafanaPassword = "admin" ) type OperatorActions interface { - DeployOperator(info *OperatorInfo) error - CleanOperator(info *OperatorInfo) error - UpgradeOperator(info *OperatorInfo) error - DumpAllLogs(info *OperatorInfo, clusterInfos []*TidbClusterInfo) error - DeployTidbCluster(info *TidbClusterInfo) error - CleanTidbCluster(info *TidbClusterInfo) error - CheckTidbClusterStatus(info *TidbClusterInfo) error - BeginInsertDataTo(info *TidbClusterInfo) error - StopInsertDataTo(info *TidbClusterInfo) error - ScaleTidbCluster(info *TidbClusterInfo) error - CheckScaleInSafely(info *TidbClusterInfo) error - CheckScaledCorrectly(info *TidbClusterInfo, podUIDsBeforeScale map[string]types.UID) error - UpgradeTidbCluster(info *TidbClusterInfo) error - CheckUpgradeProgress(info *TidbClusterInfo) error - DeployAdHocBackup(info *TidbClusterInfo) error - CheckAdHocBackup(info *TidbClusterInfo) error - DeployScheduledBackup(info *TidbClusterInfo) error - CheckScheduledBackup(info *TidbClusterInfo) error - DeployIncrementalBackup(from *TidbClusterInfo, to *TidbClusterInfo) error - CheckIncrementalBackup(info *TidbClusterInfo) error - Restore(from *TidbClusterInfo, to *TidbClusterInfo) error - CheckRestore(from *TidbClusterInfo, to *TidbClusterInfo) error - ForceDeploy(info *TidbClusterInfo) error - CreateSecret(info *TidbClusterInfo) error - GetPodUIDMap(info *TidbClusterInfo) (map[string]types.UID, error) - GetNodeMap(info *TidbClusterInfo, component string) (map[string][]string, error) - getBackupDir(info *TidbClusterInfo) ([]string, error) - PendingFailover(info *TidbClusterInfo, faultPoint *time.Time) (bool, error) - CheckFailover(info *TidbClusterInfo, faultNode string) (bool, error) + DeployOperator(info *OperatorConfig) error + DeployOperatorOrDie(info *OperatorConfig) + CleanOperator(info *OperatorConfig) error + CleanOperatorOrDie(info *OperatorConfig) + UpgradeOperator(info *OperatorConfig) error + DumpAllLogs(info *OperatorConfig, clusterInfos []*TidbClusterConfig) error + DeployTidbCluster(info *TidbClusterConfig) error + DeployTidbClusterOrDie(info *TidbClusterConfig) + CleanTidbCluster(info *TidbClusterConfig) error + CleanTidbClusterOrDie(info *TidbClusterConfig) + CheckTidbClusterStatus(info *TidbClusterConfig) error + CheckTidbClusterStatusOrDie(info *TidbClusterConfig) + BeginInsertDataTo(info *TidbClusterConfig) error + StopInsertDataTo(info *TidbClusterConfig) error + ScaleTidbCluster(info *TidbClusterConfig) error + ScaleTidbClusterOrDie(info *TidbClusterConfig) + CheckScaleInSafely(info *TidbClusterConfig) error + CheckScaledCorrectly(info *TidbClusterConfig, podUIDsBeforeScale map[string]types.UID) error + UpgradeTidbCluster(info *TidbClusterConfig) error + UpgradeTidbClusterOrDie(info *TidbClusterConfig) + CheckUpgradeProgress(info *TidbClusterConfig) error + DeployAdHocBackup(info *TidbClusterConfig) error + CheckAdHocBackup(info *TidbClusterConfig) error + DeployScheduledBackup(info *TidbClusterConfig) error + CheckScheduledBackup(info *TidbClusterConfig) error + DeployIncrementalBackup(from *TidbClusterConfig, to *TidbClusterConfig) error + CheckIncrementalBackup(info *TidbClusterConfig) error + Restore(from *TidbClusterConfig, to *TidbClusterConfig) error + CheckRestore(from *TidbClusterConfig, to *TidbClusterConfig) error + ForceDeploy(info *TidbClusterConfig) error + CreateSecret(info *TidbClusterConfig) error + GetPodUIDMap(info *TidbClusterConfig) (map[string]types.UID, error) + GetNodeMap(info *TidbClusterConfig, component string) (map[string][]string, error) + getBackupDir(info *TidbClusterConfig) ([]string, error) + CheckFailoverPending(info *TidbClusterConfig, faultPoint *time.Time) (bool, error) + CheckFailoverPendingOrDie(clusters []*TidbClusterConfig, faultPoint *time.Time) + CheckFailover(info *TidbClusterConfig, faultNode string) (bool, error) + CheckFailoverOrDie(clusters []*TidbClusterConfig, faultNode string) } type operatorActions struct { @@ -114,7 +122,7 @@ type operatorActions struct { var _ = OperatorActions(&operatorActions{}) -type OperatorInfo struct { +type OperatorConfig struct { Namespace string ReleaseName string Image string @@ -124,7 +132,7 @@ type OperatorInfo struct { LogLevel string } -type TidbClusterInfo struct { +type TidbClusterConfig struct { BackupPVC string Namespace string ClusterName string @@ -146,7 +154,7 @@ type TidbClusterInfo struct { BackupSecretName string } -func (tc *TidbClusterInfo) BackupHelmSetString(m map[string]string) string { +func (tc *TidbClusterConfig) BackupHelmSetString(m map[string]string) string { set := map[string]string{ "clusterName": tc.ClusterName, @@ -167,7 +175,7 @@ func (tc *TidbClusterInfo) BackupHelmSetString(m map[string]string) string { return strings.Join(arr, ",") } -func (tc *TidbClusterInfo) TidbClusterHelmSetString(m map[string]string) string { +func (tc *TidbClusterConfig) TidbClusterHelmSetString(m map[string]string) string { set := map[string]string{ "clusterName": tc.ClusterName, @@ -201,7 +209,7 @@ func (tc *TidbClusterInfo) TidbClusterHelmSetString(m map[string]string) string return strings.Join(arr, ",") } -func (oi *OperatorInfo) OperatorHelmSetString(m map[string]string) string { +func (oi *OperatorConfig) OperatorHelmSetString(m map[string]string) string { set := map[string]string{ "operatorImage": oi.Image, "controllerManager.autoFailover": "true", @@ -220,7 +228,7 @@ func (oi *OperatorInfo) OperatorHelmSetString(m map[string]string) string { return strings.Join(arr, ",") } -func (oa *operatorActions) DeployOperator(info *OperatorInfo) error { +func (oa *operatorActions) DeployOperator(info *OperatorConfig) error { if info.Tag != "e2e" { if err := cloneOperatorRepo(); err != nil { return err @@ -247,7 +255,13 @@ func (oa *operatorActions) DeployOperator(info *OperatorInfo) error { return nil } -func (oa *operatorActions) CleanOperator(info *OperatorInfo) error { +func (oa *operatorActions) DeployOperatorOrDie(info *OperatorConfig) { + if err := oa.DeployOperator(info); err != nil { + panic(err) + } +} + +func (oa *operatorActions) CleanOperator(info *OperatorConfig) error { res, err := exec.Command("helm", "del", "--purge", info.ReleaseName).CombinedOutput() if err == nil || !releaseIsNotFound(err) { return nil @@ -255,7 +269,13 @@ func (oa *operatorActions) CleanOperator(info *OperatorInfo) error { return fmt.Errorf("failed to clear operator: %v, %s", err, string(res)) } -func (oa *operatorActions) UpgradeOperator(info *OperatorInfo) error { +func (oa *operatorActions) CleanOperatorOrDie(info *OperatorConfig) { + if err := oa.CleanOperator(info); err != nil { + panic(err) + } +} + +func (oa *operatorActions) UpgradeOperator(info *OperatorConfig) error { if err := checkoutTag(info.Tag); err != nil { return err } @@ -271,7 +291,7 @@ func (oa *operatorActions) UpgradeOperator(info *OperatorInfo) error { return nil } -func (oa *operatorActions) DeployTidbCluster(info *TidbClusterInfo) error { +func (oa *operatorActions) DeployTidbCluster(info *TidbClusterConfig) error { glog.Infof("begin to deploy tidb cluster cluster[%s] namespace[%s]", info.ClusterName, info.Namespace) defer func() { glog.Infof("deploy tidb cluster end cluster[%s] namespace[%s]", info.ClusterName, info.Namespace) @@ -310,7 +330,13 @@ func (oa *operatorActions) DeployTidbCluster(info *TidbClusterInfo) error { return nil } -func (oa *operatorActions) CleanTidbCluster(info *TidbClusterInfo) error { +func (oa *operatorActions) DeployTidbClusterOrDie(info *TidbClusterConfig) { + if err := oa.DeployTidbCluster(info); err != nil { + panic(err) + } +} + +func (oa *operatorActions) CleanTidbCluster(info *TidbClusterConfig) error { glog.Infof("begin to clean tidb cluster[%s] namespace[%s]", info.ClusterName, info.Namespace) defer func() { glog.Infof("clean tidb cluster end cluster[%s] namespace[%s]", info.ClusterName, info.Namespace) @@ -382,7 +408,13 @@ func (oa *operatorActions) CleanTidbCluster(info *TidbClusterInfo) error { return wait.PollImmediate(DefaultPollInterval, DefaultPollTimeout, pollFn) } -func (oa *operatorActions) CheckTidbClusterStatus(info *TidbClusterInfo) error { +func (oa *operatorActions) CleanTidbClusterOrDie(info *TidbClusterConfig) { + if err := oa.CleanTidbCluster(info); err != nil { + panic(err) + } +} + +func (oa *operatorActions) CheckTidbClusterStatus(info *TidbClusterConfig) error { glog.Infof("begin to check tidb cluster cluster[%s] namespace[%s]", info.ClusterName, info.Namespace) defer func() { glog.Infof("check tidb cluster end cluster[%s] namespace[%s]", info.ClusterName, info.Namespace) @@ -446,7 +478,13 @@ func (oa *operatorActions) CheckTidbClusterStatus(info *TidbClusterInfo) error { return nil } -func (oa *operatorActions) BeginInsertDataTo(info *TidbClusterInfo) error { +func (oa *operatorActions) CheckTidbClusterStatusOrDie(info *TidbClusterConfig) { + if err := oa.CheckTidbClusterStatus(info); err != nil { + panic(err) + } +} + +func (oa *operatorActions) BeginInsertDataTo(info *TidbClusterConfig) error { dsn := getDSN(info.Namespace, info.ClusterName, "test", info.Password) db, err := util.OpenDB(dsn, defaultConcurrency) if err != nil { @@ -456,7 +494,7 @@ func (oa *operatorActions) BeginInsertDataTo(info *TidbClusterInfo) error { return info.blockWriter.Start(db) } -func (oa *operatorActions) StopInsertDataTo(info *TidbClusterInfo) error { +func (oa *operatorActions) StopInsertDataTo(info *TidbClusterConfig) error { info.blockWriter.Stop() return nil } @@ -465,7 +503,7 @@ func chartPath(name string, tag string) string { return "/charts/" + tag + "/" + name } -func (oa *operatorActions) ScaleTidbCluster(info *TidbClusterInfo) error { +func (oa *operatorActions) ScaleTidbCluster(info *TidbClusterConfig) error { cmd := fmt.Sprintf("helm upgrade %s %s --set-string %s", info.ClusterName, chartPath("tidb-cluster", info.OperatorTag), info.TidbClusterHelmSetString(nil)) glog.Info("[SCALE] " + cmd) @@ -476,7 +514,13 @@ func (oa *operatorActions) ScaleTidbCluster(info *TidbClusterInfo) error { return nil } -func (oa *operatorActions) CheckScaleInSafely(info *TidbClusterInfo) error { +func (oa *operatorActions) ScaleTidbClusterOrDie(info *TidbClusterConfig) { + if err := oa.ScaleTidbCluster(info); err != nil { + panic(err) + } +} + +func (oa *operatorActions) CheckScaleInSafely(info *TidbClusterConfig) error { return wait.Poll(DefaultPollInterval, DefaultPollTimeout, func() (done bool, err error) { tc, err := oa.cli.PingcapV1alpha1().TidbClusters(info.Namespace).Get(info.ClusterName, metav1.GetOptions{}) if err != nil { @@ -511,7 +555,7 @@ func (oa *operatorActions) CheckScaleInSafely(info *TidbClusterInfo) error { }) } -func (oa *operatorActions) CheckScaledCorrectly(info *TidbClusterInfo, podUIDsBeforeScale map[string]types.UID) error { +func (oa *operatorActions) CheckScaledCorrectly(info *TidbClusterConfig, podUIDsBeforeScale map[string]types.UID) error { return wait.Poll(DefaultPollInterval, DefaultPollTimeout, func() (done bool, err error) { podUIDs, err := oa.GetPodUIDMap(info) if err != nil { @@ -533,7 +577,7 @@ func (oa *operatorActions) CheckScaledCorrectly(info *TidbClusterInfo, podUIDsBe }) } -func (oa *operatorActions) UpgradeTidbCluster(info *TidbClusterInfo) error { +func (oa *operatorActions) UpgradeTidbCluster(info *TidbClusterConfig) error { cmd := fmt.Sprintf("helm upgrade %s %s --set-string %s", info.ClusterName, chartPath("tidb-cluster", info.OperatorTag), info.TidbClusterHelmSetString(nil)) glog.Info("[UPGRADE] " + cmd) @@ -544,7 +588,13 @@ func (oa *operatorActions) UpgradeTidbCluster(info *TidbClusterInfo) error { return nil } -func (oa *operatorActions) CheckUpgradeProgress(info *TidbClusterInfo) error { +func (oa *operatorActions) UpgradeTidbClusterOrDie(info *TidbClusterConfig) { + if err := oa.UpgradeTidbCluster(info); err != nil { + panic(err) + } +} + +func (oa *operatorActions) CheckUpgradeProgress(info *TidbClusterConfig) error { return wait.Poll(DefaultPollInterval, DefaultPollTimeout, func() (done bool, err error) { tc, err := oa.cli.PingcapV1alpha1().TidbClusters(info.Namespace).Get(info.ClusterName, metav1.GetOptions{}) if err != nil { @@ -684,8 +734,8 @@ func (oa *operatorActions) CheckUpgradeProgress(info *TidbClusterInfo) error { }) } -func (oa *operatorActions) DeployMonitor(info *TidbClusterInfo) error { return nil } -func (oa *operatorActions) CleanMonitor(info *TidbClusterInfo) error { return nil } +func (oa *operatorActions) DeployMonitor(info *TidbClusterConfig) error { return nil } +func (oa *operatorActions) CleanMonitor(info *TidbClusterConfig) error { return nil } func getComponentContainer(set *v1beta1.StatefulSet) (corev1.Container, bool) { name := set.Labels[label.ComponentLabelKey] @@ -1134,7 +1184,7 @@ func (oa *operatorActions) schedulerHAFn(tc *v1alpha1.TidbCluster) (bool, error) return true, nil } -func (oa *operatorActions) passwordIsSet(clusterInfo *TidbClusterInfo) (bool, error) { +func (oa *operatorActions) passwordIsSet(clusterInfo *TidbClusterConfig) (bool, error) { ns := clusterInfo.Namespace tcName := clusterInfo.ClusterName jobName := tcName + "-tidb-initializer" @@ -1165,7 +1215,7 @@ func (oa *operatorActions) passwordIsSet(clusterInfo *TidbClusterInfo) (bool, er return true, nil } -func (oa *operatorActions) monitorNormal(clusterInfo *TidbClusterInfo) (bool, error) { +func (oa *operatorActions) monitorNormal(clusterInfo *TidbClusterConfig) (bool, error) { ns := clusterInfo.Namespace tcName := clusterInfo.ClusterName monitorDeploymentName := fmt.Sprintf("%s-monitor", tcName) @@ -1201,7 +1251,7 @@ func (oa *operatorActions) monitorNormal(clusterInfo *TidbClusterInfo) (bool, er return true, nil } -func (oa *operatorActions) checkPrometheus(clusterInfo *TidbClusterInfo) error { +func (oa *operatorActions) checkPrometheus(clusterInfo *TidbClusterConfig) error { ns := clusterInfo.Namespace tcName := clusterInfo.ClusterName prometheusSvc := fmt.Sprintf("http://%s-prometheus.%s:9090/api/v1/query?query=up", tcName, ns) @@ -1227,7 +1277,7 @@ func (oa *operatorActions) checkPrometheus(clusterInfo *TidbClusterInfo) error { return nil } -func (oa *operatorActions) checkGrafanaData(clusterInfo *TidbClusterInfo) error { +func (oa *operatorActions) checkGrafanaData(clusterInfo *TidbClusterConfig) error { ns := clusterInfo.Namespace tcName := clusterInfo.ClusterName svcName := fmt.Sprintf("%s-grafana", tcName) @@ -1312,7 +1362,7 @@ func checkoutTag(tagName string) error { return nil } -func (oa *operatorActions) DeployAdHocBackup(info *TidbClusterInfo) error { +func (oa *operatorActions) DeployAdHocBackup(info *TidbClusterConfig) error { glog.Infof("begin to deploy adhoc backup cluster[%s] namespace[%s]", info.ClusterName, info.Namespace) defer func() { glog.Infof("deploy adhoc backup end cluster[%s] namespace[%s]", info.ClusterName, info.Namespace) @@ -1339,7 +1389,7 @@ func (oa *operatorActions) DeployAdHocBackup(info *TidbClusterInfo) error { return nil } -func (oa *operatorActions) CheckAdHocBackup(info *TidbClusterInfo) error { +func (oa *operatorActions) CheckAdHocBackup(info *TidbClusterConfig) error { glog.Infof("begin to clean adhoc backup cluster[%s] namespace[%s]", info.ClusterName, info.Namespace) defer func() { glog.Infof("deploy clean backup end cluster[%s] namespace[%s]", info.ClusterName, info.Namespace) @@ -1368,7 +1418,7 @@ func (oa *operatorActions) CheckAdHocBackup(info *TidbClusterInfo) error { return nil } -func (oa *operatorActions) Restore(from *TidbClusterInfo, to *TidbClusterInfo) error { +func (oa *operatorActions) Restore(from *TidbClusterConfig, to *TidbClusterConfig) error { glog.Infof("begin to deploy restore cluster[%s] namespace[%s]", from.ClusterName, from.Namespace) defer func() { glog.Infof("deploy restore end cluster[%s] namespace[%s]", to.ClusterName, to.Namespace) @@ -1395,7 +1445,7 @@ func (oa *operatorActions) Restore(from *TidbClusterInfo, to *TidbClusterInfo) e return nil } -func (oa *operatorActions) CheckRestore(from *TidbClusterInfo, to *TidbClusterInfo) error { +func (oa *operatorActions) CheckRestore(from *TidbClusterConfig, to *TidbClusterConfig) error { glog.Infof("begin to check restore backup cluster[%s] namespace[%s]", from.ClusterName, from.Namespace) defer func() { glog.Infof("check restore end cluster[%s] namespace[%s]", to.ClusterName, to.Namespace) @@ -1440,7 +1490,7 @@ func (oa *operatorActions) CheckRestore(from *TidbClusterInfo, to *TidbClusterIn return nil } -func (oa *operatorActions) ForceDeploy(info *TidbClusterInfo) error { +func (oa *operatorActions) ForceDeploy(info *TidbClusterConfig) error { if err := oa.CleanTidbCluster(info); err != nil { return err } @@ -1452,7 +1502,7 @@ func (oa *operatorActions) ForceDeploy(info *TidbClusterInfo) error { return nil } -func (info *TidbClusterInfo) QueryCount() (int, error) { +func (info *TidbClusterConfig) QueryCount() (int, error) { tableName := "test" db, err := sql.Open("mysql", getDSN(info.Namespace, info.ClusterName, "record", info.Password)) if err != nil { @@ -1477,7 +1527,7 @@ func (info *TidbClusterInfo) QueryCount() (int, error) { return 0, fmt.Errorf("can not find count of ") } -func (oa *operatorActions) CreateSecret(info *TidbClusterInfo) error { +func (oa *operatorActions) CreateSecret(info *TidbClusterConfig) error { initSecret := corev1.Secret{ ObjectMeta: metav1.ObjectMeta{ Name: info.InitSecretName, @@ -1518,7 +1568,7 @@ func releaseIsExist(err error) bool { return strings.Contains(err.Error(), "already exists") } -func (oa *operatorActions) DeployScheduledBackup(info *TidbClusterInfo) error { +func (oa *operatorActions) DeployScheduledBackup(info *TidbClusterConfig) error { glog.Infof("begin to deploy scheduled backup") defer func() { glog.Infof("deploy shceduled backup end") @@ -1548,7 +1598,7 @@ func (oa *operatorActions) DeployScheduledBackup(info *TidbClusterInfo) error { return nil } -func (oa *operatorActions) CheckScheduledBackup(info *TidbClusterInfo) error { +func (oa *operatorActions) CheckScheduledBackup(info *TidbClusterConfig) error { glog.Infof("begin to check scheduler backup cluster[%s] namespace[%s]", info.ClusterName, info.Namespace) defer func() { glog.Infof("deploy check scheduler end cluster[%s] namespace[%s]", info.ClusterName, info.Namespace) @@ -1625,7 +1675,7 @@ func getParentUIDFromJob(j batchv1.Job) (types.UID, bool) { return controllerRef.UID, true } -func (oa *operatorActions) getBackupDir(info *TidbClusterInfo) ([]string, error) { +func (oa *operatorActions) getBackupDir(info *TidbClusterConfig) ([]string, error) { scheduledPvcName := fmt.Sprintf("%s-scheduled-backup", info.ClusterName) pod := &corev1.Pod{ ObjectMeta: metav1.ObjectMeta{ @@ -1708,11 +1758,11 @@ func (oa *operatorActions) getBackupDir(info *TidbClusterInfo) ([]string, error) return dirs, nil } -func (info *TidbClusterInfo) FullName() string { +func (info *TidbClusterConfig) FullName() string { return fmt.Sprintf("%s/%s", info.Namespace, info.ClusterName) } -func (oa *operatorActions) DeployIncrementalBackup(from *TidbClusterInfo, to *TidbClusterInfo) error { +func (oa *operatorActions) DeployIncrementalBackup(from *TidbClusterConfig, to *TidbClusterConfig) error { glog.Infof("begin to deploy incremental backup cluster[%s] namespace[%s]", from.ClusterName, from.Namespace) defer func() { glog.Infof("deploy incremental backup end cluster[%s] namespace[%s]", to.ClusterName, to.Namespace) @@ -1739,7 +1789,7 @@ func (oa *operatorActions) DeployIncrementalBackup(from *TidbClusterInfo, to *Ti return nil } -func (oa *operatorActions) CheckIncrementalBackup(info *TidbClusterInfo) error { +func (oa *operatorActions) CheckIncrementalBackup(info *TidbClusterConfig) error { glog.Infof("begin to check incremental backup cluster[%s] namespace[%s]", info.ClusterName, info.Namespace) defer func() { glog.Infof("check incremental backup end cluster[%s] namespace[%s]", info.ClusterName, info.Namespace) @@ -1822,7 +1872,7 @@ type nodeStatus struct { State string `json:"state"` } -func (oa *operatorActions) pumpHealth(info *TidbClusterInfo, hostName string) bool { +func (oa *operatorActions) pumpHealth(info *TidbClusterConfig, hostName string) bool { pumpHealthUrl := fmt.Sprintf("%s.%s-pump.%s:8250/status", hostName, info.ClusterName, info.Namespace) res, err := http.Get(pumpHealthUrl) if err != nil { @@ -1860,7 +1910,7 @@ type drainerStatus struct { TsMap string `json:"TsMap"` } -func (oa *operatorActions) drainerHealth(info *TidbClusterInfo, hostName string) bool { +func (oa *operatorActions) drainerHealth(info *TidbClusterConfig, hostName string) bool { drainerHealthUrl := fmt.Sprintf("%s.%s-drainer.%s:8249/status", hostName, info.ClusterName, info.Namespace) res, err := http.Get(drainerHealthUrl) if err != nil { @@ -1884,215 +1934,3 @@ func (oa *operatorActions) drainerHealth(info *TidbClusterInfo, hostName string) } return len(healths.PumpPos) > 0 && healths.Synced } - -func (oa *operatorActions) PendingFailover(info *TidbClusterInfo, faultPoint *time.Time) (bool, error) { - tc, err := oa.cli.PingcapV1alpha1().TidbClusters(info.Namespace).Get(info.ClusterName, metav1.GetOptions{}) - if err != nil { - glog.Infof("pending failover,failed to get tidbcluster:[%s], error: %v", info.FullName(), err) - if strings.Contains(err.Error(), "Client.Timeout exceeded while awaiting headers") { - glog.Info("create new client") - newCli, _, err := CreateKubeClient() - if err != nil { - glog.Errorf("create new client failed, error:%v", err) - return false, nil - } - oa.cli = newCli - } - return false, nil - } - deadline := faultPoint.Add(period) - if time.Now().Before(deadline) { - if tc.Status.PD.FailureMembers != nil && len(tc.Status.PD.FailureMembers) > 0 { - err := fmt.Errorf("cluster: [%s] the pd member should be mark failure after %s", info.FullName(), deadline.Format(time.RFC3339)) - glog.Errorf(err.Error()) - return false, err - } - if tc.Status.TiKV.FailureStores != nil && len(tc.Status.TiKV.FailureStores) > 0 { - err := fmt.Errorf("cluster: [%s] the tikv store should be mark failure after %s", info.FullName(), deadline.Format(time.RFC3339)) - glog.Errorf(err.Error()) - return false, err - } - if tc.Status.TiDB.FailureMembers != nil && len(tc.Status.TiDB.FailureMembers) > 0 { - err := fmt.Errorf("cluster: [%s] the tidb member should be mark failure after %s", info.FullName(), deadline.Format(time.RFC3339)) - glog.Errorf(err.Error()) - return false, err - } - - glog.Infof("cluster: [%s] operator's failover feature is pending", info.FullName()) - return false, nil - } - return true, nil -} - -func (oa *operatorActions) CheckFailover(info *TidbClusterInfo, node string) (bool, error) { - selector, err := label.New().Instance(info.ClusterName).Selector() - if err != nil { - glog.Errorf("cluster:[%s] create selector failed, error:%v", info.FullName(), err) - return false, nil - } - pods, err := oa.kubeCli.CoreV1().Pods(info.Namespace).List(metav1.ListOptions{LabelSelector: selector.String()}) - if err != nil { - glog.Errorf("cluster:[%s] query pods failed, error:%v", info.FullName(), err) - return false, nil - } - - affectedPods := map[string]*corev1.Pod{} - for i, pod := range pods.Items { - if pod.Spec.NodeName == node { - affectedPods[pod.Name] = &pods.Items[i] - } - } - if len(affectedPods) == 0 { - glog.Infof("the cluster:[%s] can not be affected by node:[%s]", info.FullName(), node) - return true, nil - } - - tc, err := oa.cli.PingcapV1alpha1().TidbClusters(info.Namespace).Get(info.ClusterName, metav1.GetOptions{}) - if err != nil { - glog.Errorf("query tidbcluster: [%s] failed, error: %v", info.FullName(), err) - return false, nil - } - - for _, affectedPod := range affectedPods { - switch affectedPod.Labels[label.ComponentLabelKey] { - case label.PDLabelVal: - if !oa.pdFailover(affectedPod, tc) { - return false, nil - } - case label.TiKVLabelVal: - if !oa.tikvFailover(affectedPod, tc) { - return false, nil - } - case label.TiDBLabelVal: - if !oa.tidbFailover(affectedPod, tc) { - return false, nil - } - } - } - - glog.Infof("cluster: [%s]'s failover feature has complete", info.FullName()) - return true, nil -} - -func (oa *operatorActions) pdFailover(pod *corev1.Pod, tc *v1alpha1.TidbCluster) bool { - failure := false - for _, failureMember := range tc.Status.PD.FailureMembers { - if failureMember.PodName == pod.GetName() { - failure = true - break - } - } - if !failure { - glog.Infof("tidbCluster:[%s/%s]'s member:[%s] have not become failuremember", tc.Namespace, tc.Name, pod.Name) - return false - } - - for _, member := range tc.Status.PD.Members { - if member.Name == pod.GetName() { - glog.Infof("tidbCluster:[%s/%s]'s status.members still have pd member:[%s]", tc.Namespace, tc.Name, pod.Name) - return false - } - } - - if tc.Status.PD.Synced && len(tc.Status.PD.Members) == int(tc.Spec.PD.Replicas) { - return true - } - - glog.Infof("cluster: [%s/%s] pd:[%s] failover still not complete", tc.Namespace, tc.Name, pod.GetName()) - - return false -} - -func (oa *operatorActions) tikvFailover(pod *corev1.Pod, tc *v1alpha1.TidbCluster) bool { - failure := false - for _, failureStore := range tc.Status.TiKV.FailureStores { - if failureStore.PodName == pod.GetName() { - failure = true - break - } - } - if !failure { - glog.Infof("tidbCluster:[%s/%s]'s store pod:[%s] have not become failuremember", tc.Namespace, tc.Name, pod.Name) - return false - } - - healthCount := 0 - for _, store := range tc.Status.TiKV.Stores { - if store.State == v1alpha1.TiKVStateUp { - healthCount++ - } - } - if tc.Status.TiKV.Synced && healthCount == int(tc.Spec.TiKV.Replicas) { - return true - } - - glog.Infof("cluster: [%s/%s] tikv:[%s] failover still not complete", tc.Namespace, tc.Name, pod.GetName()) - return false -} - -func (oa *operatorActions) tidbFailover(pod *corev1.Pod, tc *v1alpha1.TidbCluster) bool { - failure := false - for _, failureMember := range tc.Status.TiDB.FailureMembers { - if failureMember.PodName == pod.GetName() { - glog.Infof("tidbCluster:[%s/%s]'s store pod:[%s] have not become failuremember", tc.Namespace, tc.Name, pod.Name) - failure = true - break - } - } - if !failure { - return false - } - - healthCount := 0 - for _, member := range tc.Status.TiDB.Members { - if member.Health { - healthCount++ - } - } - - if healthCount == int(tc.Spec.TiDB.Replicas) { - return true - } - glog.Infof("cluster: [%s/%s] tidb:[%s] failover still not complete", tc.Namespace, tc.Name, pod.GetName()) - return false -} - -func (oa *operatorActions) GetPodUIDMap(info *TidbClusterInfo) (map[string]types.UID, error) { - result := map[string]types.UID{} - - selector, err := label.New().Instance(info.ClusterName).Selector() - if err != nil { - return nil, err - } - pods, err := oa.kubeCli.CoreV1().Pods(info.Namespace).List(metav1.ListOptions{LabelSelector: selector.String()}) - if err != nil { - return nil, err - } - for _, pod := range pods.Items { - result[pod.GetName()] = pod.GetUID() - } - - return result, nil -} - -func (oa *operatorActions) GetNodeMap(info *TidbClusterInfo, component string) (map[string][]string, error) { - nodeMap := make(map[string][]string) - selector := label.New().Instance(info.ClusterName).Component(component).Labels() - podList, err := oa.kubeCli.CoreV1().Pods(info.Namespace).List(metav1.ListOptions{ - LabelSelector: labels.SelectorFromSet(selector).String(), - }) - if err != nil { - return nil, err - } - - for _, pod := range podList.Items { - nodeName := pod.Spec.NodeName - if len(nodeMap[nodeName]) == 0 { - nodeMap[nodeName] = make([]string, 0) - } - nodeMap[nodeName] = append(nodeMap[nodeName], pod.GetName()) - sort.Strings(nodeMap[nodeName]) - } - - return nodeMap, nil -} diff --git a/tests/backup/backupcase.go b/tests/backup/backupcase.go index 7581c00ed6..71b13c6b9c 100644 --- a/tests/backup/backupcase.go +++ b/tests/backup/backupcase.go @@ -23,11 +23,11 @@ import ( type BackupCase struct { operator tests.OperatorActions - srcCluster *tests.TidbClusterInfo - desCluster *tests.TidbClusterInfo + srcCluster *tests.TidbClusterConfig + desCluster *tests.TidbClusterConfig } -func NewBackupCase(operator tests.OperatorActions, srcCluster *tests.TidbClusterInfo, desCluster *tests.TidbClusterInfo) *BackupCase { +func NewBackupCase(operator tests.OperatorActions, srcCluster *tests.TidbClusterConfig, desCluster *tests.TidbClusterConfig) *BackupCase { return &BackupCase{ operator: operator, srcCluster: srcCluster, @@ -36,6 +36,11 @@ func NewBackupCase(operator tests.OperatorActions, srcCluster *tests.TidbCluster } func (bc *BackupCase) Run() error { + //err := bc.operator.StopInsertDataTo(bc.srcCluster) + //if err != nil { + // glog.Errorf("cluster:[%s] stop insert data failed,error: %v", bc.srcCluster.ClusterName, err) + // return err + //} err := bc.operator.DeployAdHocBackup(bc.srcCluster) if err != nil { @@ -114,5 +119,17 @@ func (bc *BackupCase) Run() error { return fmt.Errorf("cluster:[%s] the src cluster data[%d] is not equals des cluster data[%d]", bc.srcCluster.FullName(), srcCount, desCount) } + //err = bc.operator.BeginInsertDataTo(bc.srcCluster) + //if err != nil { + // glog.Errorf("cluster:[%s] begin insert data failed,error: %v", bc.srcCluster.ClusterName, err) + // return err + //} + return nil } + +func (bc *BackupCase) RunOrDie() { + if err := bc.Run(); err != nil { + panic(err) + } +} diff --git a/tests/cluster_info.go b/tests/cluster_info.go index 5b5137c2dd..7470b1cc77 100644 --- a/tests/cluster_info.go +++ b/tests/cluster_info.go @@ -5,7 +5,7 @@ import ( "strconv" ) -func (tc *TidbClusterInfo) set(name string, value string) (string, bool) { +func (tc *TidbClusterConfig) set(name string, value string) (string, bool) { // NOTE: not thread-safe, maybe make info struct immutable if tc.Args == nil { tc.Args = make(map[string]string) @@ -15,43 +15,43 @@ func (tc *TidbClusterInfo) set(name string, value string) (string, bool) { return origVal, ok } -func (tc *TidbClusterInfo) ScalePD(replicas uint) *TidbClusterInfo { +func (tc *TidbClusterConfig) ScalePD(replicas uint) *TidbClusterConfig { tc.set("pd.replicas", strconv.Itoa(int(replicas))) return tc } -func (tc *TidbClusterInfo) ScaleTiKV(replicas uint) *TidbClusterInfo { +func (tc *TidbClusterConfig) ScaleTiKV(replicas uint) *TidbClusterConfig { tc.set("tikv.replicas", strconv.Itoa(int(replicas))) return tc } -func (tc *TidbClusterInfo) ScaleTiDB(replicas uint) *TidbClusterInfo { +func (tc *TidbClusterConfig) ScaleTiDB(replicas uint) *TidbClusterConfig { tc.set("tidb.replicas", strconv.Itoa(int(replicas))) return tc } -func (tc *TidbClusterInfo) UpgradePD(image string) *TidbClusterInfo { +func (tc *TidbClusterConfig) UpgradePD(image string) *TidbClusterConfig { tc.PDImage = image return tc } -func (tc *TidbClusterInfo) UpgradeTiKV(image string) *TidbClusterInfo { +func (tc *TidbClusterConfig) UpgradeTiKV(image string) *TidbClusterConfig { tc.TiKVImage = image return tc } -func (tc *TidbClusterInfo) UpgradeTiDB(image string) *TidbClusterInfo { +func (tc *TidbClusterConfig) UpgradeTiDB(image string) *TidbClusterConfig { tc.TiDBImage = image return tc } -func (tc *TidbClusterInfo) UpgradeAll(tag string) *TidbClusterInfo { +func (tc *TidbClusterConfig) UpgradeAll(tag string) *TidbClusterConfig { return tc. UpgradePD("pingcap/pd:" + tag). UpgradeTiKV("pingcap/tikv:" + tag). UpgradeTiDB("pingcap/tidb:" + tag) } -func (tc *TidbClusterInfo) DSN(dbName string) string { +func (tc *TidbClusterConfig) DSN(dbName string) string { return fmt.Sprintf("root:%s@tcp(%s-tidb.%s:4000)/%s", tc.Password, tc.ClusterName, tc.Namespace, dbName) } diff --git a/tests/cmd/e2e/main.go b/tests/cmd/e2e/main.go index 41c8a34238..adc56f23a1 100644 --- a/tests/cmd/e2e/main.go +++ b/tests/cmd/e2e/main.go @@ -19,14 +19,13 @@ import ( "github.com/golang/glog" "github.com/jinzhu/copier" - "github.com/pingcap/tidb-operator/pkg/client/clientset/versioned" + "k8s.io/apiserver/pkg/util/logs" + "github.com/pingcap/tidb-operator/tests" "github.com/pingcap/tidb-operator/tests/backup" + "github.com/pingcap/tidb-operator/tests/pkg/client" "github.com/pingcap/tidb-operator/tests/pkg/workload" "github.com/pingcap/tidb-operator/tests/pkg/workload/ddl" - "k8s.io/apiserver/pkg/util/logs" - "k8s.io/client-go/kubernetes" - "k8s.io/client-go/rest" ) func main() { @@ -39,22 +38,11 @@ func main() { glog.Fatalf("failed to parse config: %v", err) } - cfg, err := rest.InClusterConfig() - if err != nil { - glog.Fatalf("failed to get config: %v", err) - } - cli, err := versioned.NewForConfig(cfg) - if err != nil { - glog.Fatalf("failed to create Clientset: %v", err) - } - kubeCli, err := kubernetes.NewForConfig(cfg) - if err != nil { - glog.Fatalf("failed to get kubernetes Clientset: %v", err) - } + cli, kubeCli := client.NewCliOrDie() oa := tests.NewOperatorActions(cli, kubeCli, conf) - operatorInfo := &tests.OperatorInfo{ + operatorInfo := &tests.OperatorConfig{ Namespace: "pingcap", ReleaseName: "operator", Image: conf.OperatorImage, @@ -64,7 +52,7 @@ func main() { LogLevel: "2", } - initTidbVersion, err := conf.GetInitTidbVersion() + initTidbVersion, err := conf.GetTiDBVersion() if err != nil { glog.Fatal(err) } @@ -73,7 +61,7 @@ func main() { name1 := "e2e-cluster1" name2 := "e2e-cluster2" - clusterInfos := []*tests.TidbClusterInfo{ + clusterInfos := []*tests.TidbClusterConfig{ { Namespace: name1, ClusterName: name1, @@ -249,7 +237,7 @@ func main() { // backup and restore backupClusterInfo := clusterInfos[0] - restoreClusterInfo := &tests.TidbClusterInfo{} + restoreClusterInfo := &tests.TidbClusterConfig{} copier.Copy(restoreClusterInfo, backupClusterInfo) restoreClusterInfo.ClusterName = restoreClusterInfo.ClusterName + "-other" restoreClusterInfo.InitSecretName = fmt.Sprintf("%s-set-secret", restoreClusterInfo.ClusterName) diff --git a/tests/cmd/stability/main.go b/tests/cmd/stability/main.go index 44d530ed3f..1f1fad29b9 100644 --- a/tests/cmd/stability/main.go +++ b/tests/cmd/stability/main.go @@ -14,26 +14,18 @@ package main import ( - "errors" "fmt" - "math/rand" "net/http" _ "net/http/pprof" - "os" "time" + "github.com/golang/glog" "github.com/jinzhu/copier" - "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/util/wait" "k8s.io/apiserver/pkg/util/logs" - "k8s.io/client-go/kubernetes" - "github.com/golang/glog" "github.com/pingcap/tidb-operator/tests" "github.com/pingcap/tidb-operator/tests/backup" - "github.com/pingcap/tidb-operator/tests/pkg/workload" - "github.com/pingcap/tidb-operator/tests/pkg/workload/ddl" + "github.com/pingcap/tidb-operator/tests/pkg/client" ) func main() { @@ -44,441 +36,186 @@ func main() { glog.Info(http.ListenAndServe("localhost:6060", nil)) }() - conf := tests.NewConfig() - conf.ParseOrDie() - - // TODO read these args from config - beginTidbVersion := "v2.1.0" - toTidbVersion := "v2.1.4" - operatorTag := "master" - operatorImage := "pingcap/tidb-operator:latest" - - cli, kubeCli, err := tests.CreateKubeClient() - if err != nil { - glog.Fatalf("failed to create kubernetes clientset: %v", err) - } - + conf := tests.ParseConfigOrDie() + cli, kubeCli := client.NewCliOrDie() oa := tests.NewOperatorActions(cli, kubeCli, conf) - operatorInfo := &tests.OperatorInfo{ + tidbVersion := conf.GetTiDBVersionOrDie() + upgardeTiDBVersions := conf.GetUpgradeTidbVersionsOrDie() + + // operator config + operatorCfg := &tests.OperatorConfig{ Namespace: "pingcap", ReleaseName: "operator", - Image: operatorImage, - Tag: operatorTag, - SchedulerImage: "gcr.io/google-containers/hyperkube:v1.12.1", + Image: conf.OperatorImage, + Tag: conf.OperatorTag, + SchedulerImage: "gcr.io/google-containers/hyperkube", LogLevel: "2", } + // TODO remove this // create database and table and insert a column for test backup and restore initSql := `"create database record;use record;create table test(t char(32))"` - clusterInfos := []*tests.TidbClusterInfo{ - { - Namespace: "e2e-cluster1", - ClusterName: "e2e-cluster1", - OperatorTag: operatorTag, - PDImage: fmt.Sprintf("pingcap/pd:%s", beginTidbVersion), - TiKVImage: fmt.Sprintf("pingcap/tikv:%s", beginTidbVersion), - TiDBImage: fmt.Sprintf("pingcap/tidb:%s", beginTidbVersion), - StorageClassName: "local-storage", - Password: "admin", - InitSql: initSql, - UserName: "root", - InitSecretName: "demo-set-secret", - BackupSecretName: "demo-backup-secret", - BackupPVC: "test-backup", - Resources: map[string]string{ - "pd.resources.limits.cpu": "1000m", - "pd.resources.limits.memory": "2Gi", - "pd.resources.requests.cpu": "200m", - "pd.resources.requests.memory": "1Gi", - "tikv.resources.limits.cpu": "2000m", - "tikv.resources.limits.memory": "4Gi", - "tikv.resources.requests.cpu": "1000m", - "tikv.resources.requests.memory": "2Gi", - "tidb.resources.limits.cpu": "2000m", - "tidb.resources.limits.memory": "4Gi", - "tidb.resources.requests.cpu": "500m", - "tidb.resources.requests.memory": "1Gi", - }, - Args: map[string]string{}, - Monitor: true, + // two clusters in diffrent namespaces + clusterName1 := "stability-cluster1" + clusterName2 := "stability-cluster2" + cluster1 := &tests.TidbClusterConfig{ + Namespace: clusterName1, + ClusterName: clusterName1, + OperatorTag: conf.OperatorTag, + PDImage: fmt.Sprintf("pingcap/pd:%s", tidbVersion), + TiKVImage: fmt.Sprintf("pingcap/tikv:%s", tidbVersion), + TiDBImage: fmt.Sprintf("pingcap/tidb:%s", tidbVersion), + StorageClassName: "local-storage", + Password: "admin", + InitSql: initSql, + UserName: "root", + InitSecretName: fmt.Sprintf("%s-set-secret", clusterName1), + BackupSecretName: fmt.Sprintf("%s-backup-secret", clusterName1), + BackupPVC: "backup-pvc", + Resources: map[string]string{ + "pd.resources.limits.cpu": "1000m", + "pd.resources.limits.memory": "2Gi", + "pd.resources.requests.cpu": "200m", + "pd.resources.requests.memory": "1Gi", + "tikv.resources.limits.cpu": "2000m", + "tikv.resources.limits.memory": "4Gi", + "tikv.resources.requests.cpu": "1000m", + "tikv.resources.requests.memory": "2Gi", + "tidb.resources.limits.cpu": "2000m", + "tidb.resources.limits.memory": "4Gi", + "tidb.resources.requests.cpu": "500m", + "tidb.resources.requests.memory": "1Gi", + "monitor.persistent": "true", }, - { - Namespace: "e2e-cluster2", - ClusterName: "e2e-cluster2", - OperatorTag: "master", - PDImage: fmt.Sprintf("pingcap/pd:%s", beginTidbVersion), - TiKVImage: fmt.Sprintf("pingcap/tikv:%s", beginTidbVersion), - TiDBImage: fmt.Sprintf("pingcap/tidb:%s", beginTidbVersion), - StorageClassName: "local-storage", - Password: "admin", - InitSql: initSql, - UserName: "root", - InitSecretName: "demo-set-secret", - BackupSecretName: "demo-backup-secret", - BackupPVC: "test-backup", - Resources: map[string]string{ - "pd.resources.limits.cpu": "1000m", - "pd.resources.limits.memory": "2Gi", - "pd.resources.requests.cpu": "200m", - "pd.resources.requests.memory": "1Gi", - "tikv.resources.limits.cpu": "2000m", - "tikv.resources.limits.memory": "4Gi", - "tikv.resources.requests.cpu": "1000m", - "tikv.resources.requests.memory": "2Gi", - "tidb.resources.limits.cpu": "2000m", - "tidb.resources.limits.memory": "4Gi", - "tidb.resources.requests.cpu": "500m", - "tidb.resources.requests.memory": "1Gi", - }, - Args: map[string]string{}, - Monitor: true, + Args: map[string]string{}, + Monitor: true, + } + cluster2 := &tests.TidbClusterConfig{ + Namespace: clusterName2, + ClusterName: clusterName2, + OperatorTag: conf.OperatorTag, + PDImage: fmt.Sprintf("pingcap/pd:%s", tidbVersion), + TiKVImage: fmt.Sprintf("pingcap/tikv:%s", tidbVersion), + TiDBImage: fmt.Sprintf("pingcap/tidb:%s", tidbVersion), + StorageClassName: "local-storage", + Password: "admin", + InitSql: initSql, + UserName: "root", + InitSecretName: fmt.Sprintf("%s-set-secret", clusterName2), + BackupSecretName: fmt.Sprintf("%s-backup-secret", clusterName2), + BackupPVC: "backup-pvc", + Resources: map[string]string{ + "pd.resources.limits.cpu": "1000m", + "pd.resources.limits.memory": "2Gi", + "pd.resources.requests.cpu": "200m", + "pd.resources.requests.memory": "1Gi", + "tikv.resources.limits.cpu": "2000m", + "tikv.resources.limits.memory": "4Gi", + "tikv.resources.requests.cpu": "1000m", + "tikv.resources.requests.memory": "2Gi", + "tidb.resources.limits.cpu": "2000m", + "tidb.resources.limits.memory": "4Gi", + "tidb.resources.requests.cpu": "500m", + "tidb.resources.requests.memory": "1Gi", + // TODO assert the the monitor's pvc exist and clean it when bootstrapping + "monitor.persistent": "true", }, + Args: map[string]string{}, + Monitor: true, } - defer func() { - oa.DumpAllLogs(operatorInfo, clusterInfos) - }() - - // deploy operator - if err := oa.CleanOperator(operatorInfo); err != nil { - oa.DumpAllLogs(operatorInfo, nil) - glog.Fatal(err) - } - if err = oa.DeployOperator(operatorInfo); err != nil { - oa.DumpAllLogs(operatorInfo, nil) - glog.Fatal(err) - } - - // deploy tidbclusters - for _, clusterInfo := range clusterInfos { - if err = oa.CleanTidbCluster(clusterInfo); err != nil { - glog.Fatal(err) - } - if err = oa.DeployTidbCluster(clusterInfo); err != nil { - glog.Fatal(err) - } - } - - for _, clusterInfo := range clusterInfos { - if err = oa.CheckTidbClusterStatus(clusterInfo); err != nil { - glog.Fatal(err) - } - } - - var workloads []workload.Workload - for _, clusterInfo := range clusterInfos { - workload := ddl.New(clusterInfo.DSN("test"), 1, 1) - workloads = append(workloads, workload) - } - - err = workload.Run(func() error { - - for _, clusterInfo := range clusterInfos { - clusterInfo = clusterInfo.ScaleTiDB(3).ScaleTiKV(5).ScalePD(5) - if err := oa.ScaleTidbCluster(clusterInfo); err != nil { - return err - } - } - for _, clusterInfo := range clusterInfos { - if err := oa.CheckTidbClusterStatus(clusterInfo); err != nil { - return err - } - } - - for _, clusterInfo := range clusterInfos { - clusterInfo = clusterInfo.ScalePD(3) - if err := oa.ScaleTidbCluster(clusterInfo); err != nil { - return err - } - } - for _, clusterInfo := range clusterInfos { - if err := oa.CheckTidbClusterStatus(clusterInfo); err != nil { - return err - } - } - - for _, clusterInfo := range clusterInfos { - clusterInfo = clusterInfo.ScaleTiKV(3) - if err := oa.ScaleTidbCluster(clusterInfo); err != nil { - return err - } - } - for _, clusterInfo := range clusterInfos { - if err := oa.CheckTidbClusterStatus(clusterInfo); err != nil { - return err - } - } + // cluster backup and restore + clusterBackupFrom := cluster1 + clusterRestoreTo := &tests.TidbClusterConfig{} + copier.Copy(clusterRestoreTo, clusterBackupFrom) + clusterRestoreTo.ClusterName = "cluster-restore" - for _, clusterInfo := range clusterInfos { - clusterInfo = clusterInfo.ScaleTiDB(1) - if err := oa.ScaleTidbCluster(clusterInfo); err != nil { - return err - } - } - for _, clusterInfo := range clusterInfos { - if err := oa.CheckTidbClusterStatus(clusterInfo); err != nil { - return err - } - } + allClusters := []*tests.TidbClusterConfig{cluster1, cluster2, clusterRestoreTo} - return nil - }, workloads...) - - if err != nil { - glog.Fatal(err) - } - - for _, clusterInfo := range clusterInfos { - if err = oa.CheckTidbClusterStatus(clusterInfo); err != nil { - glog.Fatal(err) - } - } - - for _, clusterInfo := range clusterInfos { - clusterInfo = clusterInfo.UpgradeAll(toTidbVersion) - if err = oa.UpgradeTidbCluster(clusterInfo); err != nil { - glog.Fatal(err) - } - } - - for _, clusterInfo := range clusterInfos { - if err = oa.CheckTidbClusterStatus(clusterInfo); err != nil { - glog.Fatal(err) - } - } - - // backup and restore - backupClusterInfo := clusterInfos[0] - restoreClusterInfo := &tests.TidbClusterInfo{} - copier.Copy(restoreClusterInfo, backupClusterInfo) - restoreClusterInfo.ClusterName = restoreClusterInfo.ClusterName + "-restore" - - if err = oa.CleanTidbCluster(restoreClusterInfo); err != nil { - glog.Fatal(err) - } - if err = oa.DeployTidbCluster(restoreClusterInfo); err != nil { - glog.Fatal(err) - } - if err = oa.CheckTidbClusterStatus(restoreClusterInfo); err != nil { - glog.Fatal(err) - } - - backupCase := backup.NewBackupCase(oa, backupClusterInfo, restoreClusterInfo) - - if err := backupCase.Run(); err != nil { - glog.Fatal(err) - } - - fa := tests.NewFaultTriggerAction(cli, kubeCli, conf) - if err := testFailover(kubeCli, oa, fa, conf, clusterInfos); err != nil { - glog.Fatal(err) - } -} - -func testFailover( - kubeCli kubernetes.Interface, - oa tests.OperatorActions, - fa tests.FaultTriggerActions, - cfg *tests.Config, - clusters []*tests.TidbClusterInfo, -) error { - var faultPoint time.Time - faultNode, err := getFaultNode(kubeCli) - if err != nil { - return err - } - - physicalNode := getPhysicalNode(faultNode, cfg) - - if physicalNode == "" { - err = errors.New("physical node is empty") - glog.Error(err.Error()) - return err - } - - glog.Infof("try to stop node [%s:%s]", physicalNode, faultNode) - err = wait.Poll(2*time.Second, 10*time.Second, func() (bool, error) { - err = fa.StopNode(physicalNode, faultNode) - if err != nil { - glog.Errorf("failed stop node [%s:%s]: %v", physicalNode, faultNode, err) - return false, nil - } - faultPoint = time.Now() - return true, nil - }) - - if err != nil { - glog.Errorf("test failed when trigger a node [%s:%s] stop,error: %v", physicalNode, faultNode, err) - return err - } - - // defer start node defer func() { - glog.Infof("defer: start node [%s:%s]", physicalNode, faultNode) - if err = fa.StartNode(physicalNode, faultNode); err != nil { - glog.Errorf("failed start node [%s:%s]: %v", physicalNode, faultNode, err) - } + oa.DumpAllLogs(operatorCfg, allClusters) }() - glog.Info("the operator's failover feature should pending some time") - if err = checkPendingFailover(oa, clusters, &faultPoint); err != nil { - glog.Errorf("pending failover failed: %v", err) - return err - } - - glog.Info("the operator's failover feature should start.") - if err = checkFailover(oa, clusters, faultNode); err != nil { - glog.Errorf("check failover failed: %v", err) - return err - } - - glog.Info("sleep 3 minutes ......") + // clean all clusters + for _, cluster := range allClusters { + oa.CleanTidbClusterOrDie(cluster) + } + + // clean and deploy operator + oa.CleanOperatorOrDie(operatorCfg) + oa.DeployOperatorOrDie(operatorCfg) + + // deploy and check cluster1, cluster2 + oa.DeployTidbClusterOrDie(cluster1) + oa.DeployTidbClusterOrDie(cluster2) + oa.CheckTidbClusterStatusOrDie(cluster1) + oa.CheckTidbClusterStatusOrDie(cluster2) + + // TODO insert data + //for _, clusterInfo := range clusterInfos { + // go func() { + // if err = oa.BeginInsertDataTo(clusterInfo); err != nil { + // glog.Fatal(err) + // } + // }() + //} + + // TODO add DDL + //var workloads []workload.Workload + //for _, clusterInfo := range clusterInfos { + // workload := ddl.New(clusterInfo.DSN("test"), 1, 1) + // workloads = append(workloads, workload) + //} + //err = workload.Run(func() error { + //}, workloads...) + + // scale out cluster1 and cluster2 + cluster1.ScaleTiDB(3).ScaleTiKV(5).ScalePD(5) + oa.ScaleTidbClusterOrDie(cluster1) + cluster2.ScaleTiDB(3).ScaleTiKV(5).ScalePD(5) + oa.ScaleTidbClusterOrDie(cluster2) + time.Sleep(30 * time.Second) + oa.CheckTidbClusterStatusOrDie(cluster1) + oa.CheckTidbClusterStatusOrDie(cluster2) + + // scale in cluster1 and cluster2 + cluster1.ScaleTiDB(2).ScaleTiKV(3).ScalePD(3) + oa.ScaleTidbClusterOrDie(cluster1) + cluster2.ScaleTiDB(2).ScaleTiKV(3).ScalePD(3) + oa.ScaleTidbClusterOrDie(cluster2) + time.Sleep(30 * time.Second) + oa.CheckTidbClusterStatusOrDie(cluster1) + oa.CheckTidbClusterStatusOrDie(cluster2) + + // upgrade cluster1 and cluster2 + firstUpgradeVersion := upgardeTiDBVersions[0] + cluster1.UpgradeAll(firstUpgradeVersion) + cluster2.UpgradeAll(firstUpgradeVersion) + oa.UpgradeTidbClusterOrDie(cluster1) + oa.UpgradeTidbClusterOrDie(cluster2) + time.Sleep(30 * time.Second) + oa.CheckTidbClusterStatusOrDie(cluster1) + oa.CheckTidbClusterStatusOrDie(cluster2) + + // deploy and check cluster restore + oa.DeployTidbClusterOrDie(clusterRestoreTo) + oa.CheckTidbClusterStatusOrDie(clusterRestoreTo) + + // restore + backup.NewBackupCase(oa, clusterBackupFrom, clusterRestoreTo).RunOrDie() + + // stop a node and failover automatically + fta := tests.NewFaultTriggerAction(cli, kubeCli, conf) + physicalNode, node, faultTime := fta.StopNodeOrDie() + oa.CheckFailoverPendingOrDie(allClusters, &faultTime) + oa.CheckFailoverOrDie(allClusters, node) time.Sleep(3 * time.Minute) - - glog.Infof("begin to start node %s %s", physicalNode, faultNode) - err = wait.Poll(2*time.Second, 10*time.Second, func() (bool, error) { - err = fa.StartNode(physicalNode, faultNode) - if err != nil { - glog.Errorf("failed start node [%s:%s]: %v", physicalNode, faultNode, err) - return false, nil - } - return true, nil - }) - - glog.Infof("begin to check recover after node [%s:%s] start", physicalNode, faultNode) - if err = checkRecover(oa, clusters); err != nil { - glog.Infof("check recover failed: %v", err) - return err - } - - return nil -} - -func checkRecover(oa tests.OperatorActions, clusters []*tests.TidbClusterInfo) error { - return wait.Poll(5*time.Second, 10*time.Minute, func() (bool, error) { - var passes []bool - for i := range clusters { - err := oa.CheckTidbClusterStatus(clusters[i]) - if err != nil { - return false, err - } - passes = append(passes, true) - } - for _, pass := range passes { - if !pass { - return false, nil - } - } - return true, nil - }) -} - -func checkPendingFailover(oa tests.OperatorActions, clusters []*tests.TidbClusterInfo, faultPoint *time.Time) error { - return wait.Poll(5*time.Second, 10*time.Minute, func() (bool, error) { - var passes []bool - for i := range clusters { - pass, err := oa.PendingFailover(clusters[i], faultPoint) - if err != nil { - return pass, err - } - passes = append(passes, pass) - } - for _, pass := range passes { - if !pass { - return false, nil - } - } - return true, nil - }) -} - -func checkFailover(oa tests.OperatorActions, clusters []*tests.TidbClusterInfo, faultNode string) error { - return wait.Poll(5*time.Second, 25*time.Minute, func() (bool, error) { - var passes []bool - for i := range clusters { - pass, err := oa.CheckFailover(clusters[i], faultNode) - if err != nil { - return pass, err - } - passes = append(passes, pass) - } - for _, pass := range passes { - if !pass { - return false, nil - } - } - return true, nil - }) - -} - -func getMyNodeName() string { - return os.Getenv("MY_NODE_NAME") -} - -func getFaultNode(kubeCli kubernetes.Interface) (string, error) { - var err error - var nodes *v1.NodeList - err = wait.Poll(2*time.Second, 10*time.Second, func() (bool, error) { - nodes, err = kubeCli.CoreV1().Nodes().List(metav1.ListOptions{}) - if err != nil { - glog.Errorf("trigger node stop failed when get all nodes, error: %v", err) - return false, nil - } - - return true, nil - }) - - if err != nil { - glog.Errorf("failed to list nodes: %v", err) - return "", err - } - - if len(nodes.Items) <= 1 { - err := errors.New("the number of nodes cannot be less than 1") - glog.Error(err.Error()) - return "", err - } - - myNode := getMyNodeName() - if myNode == "" { - err := errors.New("get own node name is empty") - glog.Error(err.Error()) - return "", err - } - - index := rand.Intn(len(nodes.Items)) - faultNode := nodes.Items[index].Name - if faultNode != myNode { - return faultNode, nil - } - - if index == 0 { - faultNode = nodes.Items[index+1].Name - } else { - faultNode = nodes.Items[index-1].Name - } - - if faultNode == myNode { - err := fmt.Errorf("there are at least two nodes with the name %s", myNode) - glog.Error(err.Error()) - return "", err - } - - return faultNode, nil -} - -func getPhysicalNode(faultNode string, cfg *tests.Config) string { - var physicalNode string - for _, nodes := range cfg.Nodes { - for _, node := range nodes.Nodes { - if node == faultNode { - physicalNode = nodes.PhysicalNode - } - } + fta.StartNodeOrDie(physicalNode, node) + for _, cluster := range allClusters { + oa.CheckTidbClusterStatusOrDie(cluster) } - return physicalNode + glog.Infof("\nFinished.") } diff --git a/tests/config.go b/tests/config.go index 30c49b69d2..ff182258c7 100644 --- a/tests/config.go +++ b/tests/config.go @@ -6,6 +6,7 @@ import ( "io/ioutil" "strings" + "github.com/golang/glog" yaml "gopkg.in/yaml.v2" ) @@ -42,6 +43,16 @@ func NewConfig() *Config { return cfg } +func ParseConfigOrDie() *Config { + cfg := NewConfig() + if err := cfg.Parse(); err != nil { + panic(err) + } + + glog.Infof("using config: %+v", cfg) + return cfg +} + // Parse parses flag definitions from the argument list. func (c *Config) Parse() error { // Parse first to get config file @@ -59,13 +70,6 @@ func (c *Config) Parse() error { return nil } -func (c *Config) ParseOrDie() { - err := c.Parse() - if err != nil { - panic(err) - } -} - func (c *Config) configFromFile(path string) error { data, err := ioutil.ReadFile(path) if err != nil { @@ -79,7 +83,7 @@ func (c *Config) configFromFile(path string) error { return nil } -func (c *Config) GetInitTidbVersion() (string, error) { +func (c *Config) GetTiDBVersion() (string, error) { tidbVersions := strings.Split(c.TidbVersions, ",") if len(tidbVersions) == 0 { return "", fmt.Errorf("init tidb versions can not be nil") @@ -88,8 +92,26 @@ func (c *Config) GetInitTidbVersion() (string, error) { return tidbVersions[0], nil } +func (c *Config) GetTiDBVersionOrDie() string { + v, err := c.GetTiDBVersion() + if err != nil { + panic(err) + } + + return v +} + func (c *Config) GetUpgradeTidbVersions() []string { tidbVersions := strings.Split(c.TidbVersions, ",") return tidbVersions[1:] } + +func (c *Config) GetUpgradeTidbVersionsOrDie() []string { + versions := c.GetUpgradeTidbVersions() + if len(versions) < 1 { + panic("upgrade tidb verions is empty") + } + + return versions +} diff --git a/tests/failover.go b/tests/failover.go new file mode 100644 index 0000000000..1c8740255c --- /dev/null +++ b/tests/failover.go @@ -0,0 +1,269 @@ +package tests + +import ( + "fmt" + "sort" + "strings" + "time" + + _ "github.com/go-sql-driver/mysql" + "github.com/golang/glog" + "github.com/pingcap/tidb-operator/pkg/apis/pingcap.com/v1alpha1" + "github.com/pingcap/tidb-operator/pkg/label" + "github.com/pingcap/tidb-operator/tests/pkg/client" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/wait" +) + +func (oa *operatorActions) CheckFailoverPending(info *TidbClusterConfig, faultPoint *time.Time) (bool, error) { + tc, err := oa.cli.PingcapV1alpha1().TidbClusters(info.Namespace).Get(info.ClusterName, metav1.GetOptions{}) + if err != nil { + glog.Infof("pending failover,failed to get tidbcluster:[%s], error: %v", info.FullName(), err) + if strings.Contains(err.Error(), "Client.Timeout exceeded while awaiting headers") { + glog.Info("create new client") + newCli, _ := client.NewCliOrDie() + oa.cli = newCli + } + return false, nil + } + deadline := faultPoint.Add(period) + if time.Now().Before(deadline) { + if tc.Status.PD.FailureMembers != nil && len(tc.Status.PD.FailureMembers) > 0 { + err := fmt.Errorf("cluster: [%s] the pd member should be mark failure after %s", info.FullName(), deadline.Format(time.RFC3339)) + glog.Errorf(err.Error()) + return false, err + } + if tc.Status.TiKV.FailureStores != nil && len(tc.Status.TiKV.FailureStores) > 0 { + err := fmt.Errorf("cluster: [%s] the tikv store should be mark failure after %s", info.FullName(), deadline.Format(time.RFC3339)) + glog.Errorf(err.Error()) + return false, err + } + if tc.Status.TiDB.FailureMembers != nil && len(tc.Status.TiDB.FailureMembers) > 0 { + err := fmt.Errorf("cluster: [%s] the tidb member should be mark failure after %s", info.FullName(), deadline.Format(time.RFC3339)) + glog.Errorf(err.Error()) + return false, err + } + + glog.Infof("cluster: [%s] operator's failover feature is pending", info.FullName()) + return false, nil + } + return true, nil +} + +func (oa *operatorActions) CheckFailoverPendingOrDie(clusters []*TidbClusterConfig, faultPoint *time.Time) { + if err := wait.Poll(1*time.Minute, 30*time.Minute, func() (bool, error) { + var passes []bool + for i := range clusters { + pass, err := oa.CheckFailoverPending(clusters[i], faultPoint) + if err != nil { + return pass, err + } + passes = append(passes, pass) + } + for _, pass := range passes { + if !pass { + return false, nil + } + } + return true, nil + }); err != nil { + panic("failed to check failover pending") + } +} + +func (oa *operatorActions) CheckFailover(info *TidbClusterConfig, node string) (bool, error) { + selector, err := label.New().Instance(info.ClusterName).Selector() + if err != nil { + glog.Errorf("cluster:[%s] create selector failed, error:%v", info.FullName(), err) + return false, nil + } + pods, err := oa.kubeCli.CoreV1().Pods(info.Namespace).List(metav1.ListOptions{LabelSelector: selector.String()}) + if err != nil { + glog.Errorf("cluster:[%s] query pods failed, error:%v", info.FullName(), err) + return false, nil + } + + affectedPods := map[string]*corev1.Pod{} + for i, pod := range pods.Items { + if pod.Spec.NodeName == node { + affectedPods[pod.Name] = &pods.Items[i] + } + } + if len(affectedPods) == 0 { + glog.Infof("the cluster:[%s] can not be affected by node:[%s]", info.FullName(), node) + return true, nil + } + + tc, err := oa.cli.PingcapV1alpha1().TidbClusters(info.Namespace).Get(info.ClusterName, metav1.GetOptions{}) + if err != nil { + glog.Errorf("query tidbcluster: [%s] failed, error: %v", info.FullName(), err) + return false, nil + } + + for _, affectedPod := range affectedPods { + switch affectedPod.Labels[label.ComponentLabelKey] { + case label.PDLabelVal: + if !oa.pdFailover(affectedPod, tc) { + return false, nil + } + case label.TiKVLabelVal: + if !oa.tikvFailover(affectedPod, tc) { + return false, nil + } + case label.TiDBLabelVal: + if !oa.tidbFailover(affectedPod, tc) { + return false, nil + } + } + } + + glog.Infof("cluster: [%s]'s failover feature has complete", info.FullName()) + return true, nil +} + +func (oa *operatorActions) CheckFailoverOrDie(clusters []*TidbClusterConfig, faultNode string) { + if err := wait.Poll(1*time.Minute, 30*time.Minute, func() (bool, error) { + var passes []bool + for i := range clusters { + pass, err := oa.CheckFailover(clusters[i], faultNode) + if err != nil { + return pass, err + } + passes = append(passes, pass) + } + for _, pass := range passes { + if !pass { + return false, nil + } + } + return true, nil + }); err != nil { + panic("failed to check failover") + } +} + +func (oa *operatorActions) pdFailover(pod *corev1.Pod, tc *v1alpha1.TidbCluster) bool { + failure := false + for _, failureMember := range tc.Status.PD.FailureMembers { + if failureMember.PodName == pod.GetName() { + failure = true + break + } + } + if !failure { + glog.Infof("tidbCluster:[%s/%s]'s member:[%s] have not become failuremember", tc.Namespace, tc.Name, pod.Name) + return false + } + + for _, member := range tc.Status.PD.Members { + if member.Name == pod.GetName() { + glog.Infof("tidbCluster:[%s/%s]'s status.members still have pd member:[%s]", tc.Namespace, tc.Name, pod.Name) + return false + } + } + + if tc.Status.PD.Synced && len(tc.Status.PD.Members) == int(tc.Spec.PD.Replicas) { + return true + } + + glog.Infof("cluster: [%s/%s] pd:[%s] failover still not complete", tc.Namespace, tc.Name, pod.GetName()) + + return false +} + +func (oa *operatorActions) tikvFailover(pod *corev1.Pod, tc *v1alpha1.TidbCluster) bool { + failure := false + for _, failureStore := range tc.Status.TiKV.FailureStores { + if failureStore.PodName == pod.GetName() { + failure = true + break + } + } + if !failure { + glog.Infof("tidbCluster:[%s/%s]'s store pod:[%s] have not become failuremember", tc.Namespace, tc.Name, pod.Name) + return false + } + + healthCount := 0 + for _, store := range tc.Status.TiKV.Stores { + if store.State == v1alpha1.TiKVStateUp { + healthCount++ + } + } + if tc.Status.TiKV.Synced && healthCount == int(tc.Spec.TiKV.Replicas) { + return true + } + + glog.Infof("cluster: [%s/%s] tikv:[%s] failover still not complete", tc.Namespace, tc.Name, pod.GetName()) + return false +} + +func (oa *operatorActions) tidbFailover(pod *corev1.Pod, tc *v1alpha1.TidbCluster) bool { + failure := false + for _, failureMember := range tc.Status.TiDB.FailureMembers { + if failureMember.PodName == pod.GetName() { + glog.Infof("tidbCluster:[%s/%s]'s store pod:[%s] have not become failuremember", tc.Namespace, tc.Name, pod.Name) + failure = true + break + } + } + if !failure { + return false + } + + healthCount := 0 + for _, member := range tc.Status.TiDB.Members { + if member.Health { + healthCount++ + } + } + + if healthCount == int(tc.Spec.TiDB.Replicas) { + return true + } + glog.Infof("cluster: [%s/%s] tidb:[%s] failover still not complete", tc.Namespace, tc.Name, pod.GetName()) + return false +} + +func (oa *operatorActions) GetPodUIDMap(info *TidbClusterConfig) (map[string]types.UID, error) { + result := map[string]types.UID{} + + selector, err := label.New().Instance(info.ClusterName).Selector() + if err != nil { + return nil, err + } + pods, err := oa.kubeCli.CoreV1().Pods(info.Namespace).List(metav1.ListOptions{LabelSelector: selector.String()}) + if err != nil { + return nil, err + } + for _, pod := range pods.Items { + result[pod.GetName()] = pod.GetUID() + } + + return result, nil +} + +func (oa *operatorActions) GetNodeMap(info *TidbClusterConfig, component string) (map[string][]string, error) { + nodeMap := make(map[string][]string) + selector := label.New().Instance(info.ClusterName).Component(component).Labels() + podList, err := oa.kubeCli.CoreV1().Pods(info.Namespace).List(metav1.ListOptions{ + LabelSelector: labels.SelectorFromSet(selector).String(), + }) + if err != nil { + return nil, err + } + + for _, pod := range podList.Items { + nodeName := pod.Spec.NodeName + if len(nodeMap[nodeName]) == 0 { + nodeMap[nodeName] = make([]string, 0) + } + nodeMap[nodeName] = append(nodeMap[nodeName], pod.GetName()) + sort.Strings(nodeMap[nodeName]) + } + + return nodeMap, nil +} diff --git a/tests/fault.go b/tests/fault.go index dcffea21cf..4f8e75bea9 100644 --- a/tests/fault.go +++ b/tests/fault.go @@ -2,12 +2,18 @@ package tests import ( "fmt" + "math/rand" + "os" + "time" "github.com/golang/glog" "github.com/pingcap/tidb-operator/pkg/client/clientset/versioned" "github.com/pingcap/tidb-operator/pkg/controller" "github.com/pingcap/tidb-operator/tests/pkg/fault-trigger/client" "github.com/pingcap/tidb-operator/tests/pkg/fault-trigger/manager" + "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/kubernetes" ) @@ -17,8 +23,10 @@ const ( ) type FaultTriggerActions interface { - StopNode(physicalNode string, node string) error + StopNode() (string, string, time.Time, error) + StopNodeOrDie() (string, string, time.Time) StartNode(physicalNode string, node string) error + StartNodeOrDie(physicalNode string, node string) StopETCD(nodes ...string) error StartETCD(nodes ...string) error StopKubelet(node string) error @@ -54,23 +62,44 @@ type faultTriggerActions struct { cfg *Config } -func (fa *faultTriggerActions) StopNode(physicalNode string, node string) error { +func (fa *faultTriggerActions) StopNode() (string, string, time.Time, error) { + now := time.Now() + node, err := getFaultNode(fa.kubeCli) + if err != nil { + return "", "", now, err + } + glog.Infof("selecting %s as the node to failover", node) + + physicalNode := getPhysicalNode(node, fa.cfg) + + if physicalNode == "" { + return "", "", now, fmt.Errorf("physical node is empty") + } + faultCli := client.NewClient(client.Config{ Addr: fa.genFaultTriggerAddr(physicalNode), }) - err := faultCli.StopVM(&manager.VM{ + if err := faultCli.StopVM(&manager.VM{ IP: node, - }) - - if err != nil { + }); err != nil { glog.Errorf("failed to stop node %s on physical node: %s: %v", node, physicalNode, err) - return err + return "", "", now, err } glog.Infof("node %s on physical node %s is stopped", node, physicalNode) + return physicalNode, node, now, nil +} - return nil +func (fa *faultTriggerActions) StopNodeOrDie() (string, string, time.Time) { + var pn string + var n string + var err error + var now time.Time + if pn, n, now, err = fa.StopNode(); err != nil { + panic(err) + } + return pn, n, now } func (fa *faultTriggerActions) StartNode(physicalNode string, node string) error { @@ -78,11 +107,22 @@ func (fa *faultTriggerActions) StartNode(physicalNode string, node string) error Addr: fa.genFaultTriggerAddr(physicalNode), }) - err := faultCli.StartVM(&manager.VM{ - IP: node, - }) - + vms, err := faultCli.ListVMs() if err != nil { + return err + } + + glog.Infof("%+v", vms) + + for _, vm := range vms { + if vm.IP == node && vm.Status == "running" { + return nil + } + } + + if err := faultCli.StartVM(&manager.VM{ + IP: node, + }); err != nil { glog.Errorf("failed to start node %s on physical node %s: %v", node, physicalNode, err) return err } @@ -92,6 +132,12 @@ func (fa *faultTriggerActions) StartNode(physicalNode string, node string) error return nil } +func (fa *faultTriggerActions) StartNodeOrDie(physicalNode string, node string) { + if err := fa.StartNode(physicalNode, node); err != nil { + panic(err) + } +} + // StopETCD stops the etcd service. // If the `nodes` is empty, StopEtcd will stop all etcd service. func (fa *faultTriggerActions) StopETCD(nodes ...string) error { @@ -234,3 +280,68 @@ func (fa *faultTriggerActions) serviceAction(node string, serverName string, act func (fa *faultTriggerActions) genFaultTriggerAddr(node string) string { return fmt.Sprintf("%s:%d", node, fa.cfg.FaultTriggerPort) } + +func getMyNodeName() string { + return os.Getenv("MY_NODE_NAME") +} + +func getFaultNode(kubeCli kubernetes.Interface) (string, error) { + var err error + var nodes *v1.NodeList + err = wait.Poll(2*time.Second, 10*time.Second, func() (bool, error) { + nodes, err = kubeCli.CoreV1().Nodes().List(metav1.ListOptions{}) + if err != nil { + glog.Errorf("trigger node stop failed when get all nodes, error: %v", err) + return false, nil + } + + return true, nil + }) + + if err != nil { + glog.Errorf("failed to list nodes: %v", err) + return "", err + } + + if len(nodes.Items) <= 1 { + return "", fmt.Errorf("the number of nodes cannot be less than 1") + } + + myNode := getMyNodeName() + if myNode == "" { + return "", fmt.Errorf("get own node name is empty") + } + + index := rand.Intn(len(nodes.Items)) + faultNode := nodes.Items[index].Name + if faultNode != myNode { + return faultNode, nil + } + + if index == 0 { + faultNode = nodes.Items[index+1].Name + } else { + faultNode = nodes.Items[index-1].Name + } + + if faultNode == myNode { + err := fmt.Errorf("there are at least two nodes with the name %s", myNode) + glog.Error(err.Error()) + return "", err + } + + return faultNode, nil +} + +func getPhysicalNode(faultNode string, cfg *Config) string { + var physicalNode string + for _, nodes := range cfg.Nodes { + for _, node := range nodes.Nodes { + if node == faultNode { + physicalNode = nodes.PhysicalNode + } + } + } + + return physicalNode +} diff --git a/tests/log_dump.go b/tests/log_dump.go index f5dc11af02..02c5474539 100644 --- a/tests/log_dump.go +++ b/tests/log_dump.go @@ -11,7 +11,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) -func (oa *operatorActions) DumpAllLogs(operatorInfo *OperatorInfo, testClusters []*TidbClusterInfo) error { +func (oa *operatorActions) DumpAllLogs(operatorInfo *OperatorConfig, testClusters []*TidbClusterConfig) error { logPath := fmt.Sprintf("/%s/%s", oa.cfg.LogDir, "operator-stability") if _, err := os.Stat(logPath); os.IsNotExist(err) { err = os.MkdirAll(logPath, os.ModePerm) diff --git a/tests/manifests/stability/stability-configmap.yaml b/tests/manifests/stability/stability-configmap.yaml index 9b7a201e3d..e84efd3d7e 100644 --- a/tests/manifests/stability/stability-configmap.yaml +++ b/tests/manifests/stability/stability-configmap.yaml @@ -6,25 +6,25 @@ metadata: data: config: |- nodes: - - physical_node: 172.16.4.39 + - physical_node: 172.16.4.38 nodes: - - 172.16.4.171 - - 172.16.4.172 - - 172.16.4.173 - - physical_node: 172.16.4.40 + - 172.16.4.177 + - 172.16.4.178 + - 172.16.4.179 + - physical_node: 172.16.4.37 nodes: - - 172.16.4.174 - - 172.16.4.175 - - 172.16.4.176 + - 172.16.4.180 + - 172.16.4.181 + - 172.16.4.182 etcds: - - physical_node: 172.16.4.39 + - physical_node: 172.16.4.37 nodes: - - 172.16.4.171 - - 172.16.4.172 - - 172.16.4.173 + - 172.16.4.180 + - 172.16.4.181 + - 172.16.4.182 apiservers: - - physical_node: 172.16.4.39 + - physical_node: 172.16.4.37 nodes: - - 172.16.4.171 - - 172.16.4.172 - - 172.16.4.173 + - 172.16.4.180 + - 172.16.4.181 + - 172.16.4.182 diff --git a/tests/manifests/stability/stability.yaml b/tests/manifests/stability/stability.yaml index 86e3664820..6e9cf6248f 100644 --- a/tests/manifests/stability/stability.yaml +++ b/tests/manifests/stability/stability.yaml @@ -29,8 +29,8 @@ spec: - name: tidb-operator-stability image: "" imagePullPolicy: Always - command: ["sh", "-c", "/usr/local/bin/stability-test"] - args: + command: + - /usr/local/bin/stability-test - --config=/etc/tidb-operator-stability/config.yaml volumeMounts: - mountPath: /logDir @@ -38,6 +38,11 @@ spec: - name: config readOnly: true mountPath: /etc/tidb-operator-stability + env: + - name: MY_NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName volumes: - name: logdir hostPath: diff --git a/tests/pkg/blockwriter/blockwriter.go b/tests/pkg/blockwriter/blockwriter.go index 8434f151b0..f7604dc9b4 100644 --- a/tests/pkg/blockwriter/blockwriter.go +++ b/tests/pkg/blockwriter/blockwriter.go @@ -131,7 +131,7 @@ func (c *BlockWriterCase) generateQuery(ctx context.Context, queryChan chan []st func (bw *blockWriter) batchExecute(db *sql.DB, query string) error { _, err := db.Exec(query) if err != nil { - glog.Errorf("[block_writer] exec sql [%s] failed, err: %v", query, err) + glog.V(4).Infof("[block_writer] exec sql [%s] failed, err: %v", query, err) return err } @@ -158,7 +158,9 @@ func (bw *blockWriter) run(ctx context.Context, db *sql.DB, queryChan chan []str return default: if err := bw.batchExecute(db, query); err != nil { - glog.Fatal(err) + glog.Error(err) + time.Sleep(5 * time.Second) + continue } } } @@ -210,7 +212,7 @@ func (c *BlockWriterCase) Start(db *sql.DB) error { if !atomic.CompareAndSwapUint32(&c.isRunning, 0, 1) { err := fmt.Errorf("[%s] is running, you can't start it again", c) glog.Error(err) - return err + return nil } defer func() { diff --git a/tests/pkg/client/client.go b/tests/pkg/client/client.go new file mode 100644 index 0000000000..ec2caa63c7 --- /dev/null +++ b/tests/pkg/client/client.go @@ -0,0 +1,29 @@ +package client + +import ( + "time" + + "github.com/pingcap/tidb-operator/pkg/client/clientset/versioned" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" +) + +func NewCliOrDie() (versioned.Interface, kubernetes.Interface) { + cfg, err := rest.InClusterConfig() + if err != nil { + panic(err) + } + + cfg.Timeout = 30 * time.Second + cli, err := versioned.NewForConfig(cfg) + if err != nil { + panic(err) + } + + kubeCli, err := kubernetes.NewForConfig(cfg) + if err != nil { + panic(err) + } + + return cli, kubeCli +} diff --git a/tests/pkg/fault-trigger/manager/types.go b/tests/pkg/fault-trigger/manager/types.go index b2b7e7f1a0..5963c28df2 100644 --- a/tests/pkg/fault-trigger/manager/types.go +++ b/tests/pkg/fault-trigger/manager/types.go @@ -17,11 +17,12 @@ import "errors" // VM defines the descriptive information of a virtual machine type VM struct { - Host string `json:"host"` - Port int64 `json:"port"` - Name string `json:"name"` - IP string `json:"ip"` - Role []string `json:"role"` + Host string `json:"host"` + Port int64 `json:"port"` + Name string `json:"name"` + IP string `json:"ip"` + Role []string `json:"role"` + Status string `json:"status"` } func (v *VM) Verify() error { diff --git a/tests/pkg/fault-trigger/manager/vm.go b/tests/pkg/fault-trigger/manager/vm.go index 14d32d1344..68cd9ca943 100644 --- a/tests/pkg/fault-trigger/manager/vm.go +++ b/tests/pkg/fault-trigger/manager/vm.go @@ -201,7 +201,8 @@ func parserVMs(data string) []*VM { continue } vm := &VM{ - Name: fields[1], + Name: fields[1], + Status: fields[2], } vms = append(vms, vm) } diff --git a/tests/pkg/fault-trigger/manager/vm_test.go b/tests/pkg/fault-trigger/manager/vm_test.go index 66d8f772aa..0dd64b337c 100644 --- a/tests/pkg/fault-trigger/manager/vm_test.go +++ b/tests/pkg/fault-trigger/manager/vm_test.go @@ -50,7 +50,7 @@ func TestParseVMs(t *testing.T) { data := ` Id Name State ---------------------------------------------------- - 6 vm2 running + 6 vm2 shut off 11 vm3 running 12 vm1 running - vm-template shut off @@ -59,13 +59,16 @@ func TestParseVMs(t *testing.T) { var expectedVMs []*VM expectedVMs = append(expectedVMs, &VM{ - Name: "vm2", + Name: "vm2", + Status: "running", }) expectedVMs = append(expectedVMs, &VM{ - Name: "vm3", + Name: "vm3", + Status: "running", }) expectedVMs = append(expectedVMs, &VM{ - Name: "vm1", + Name: "vm1", + Status: "shut off", }) g.Expect(vms).To(Equal(expectedVMs)) } diff --git a/tests/pkg/workload/ddl/internal/ddl.go b/tests/pkg/workload/ddl/internal/ddl.go index cdcb3ee38d..607bfc72c2 100644 --- a/tests/pkg/workload/ddl/internal/ddl.go +++ b/tests/pkg/workload/ddl/internal/ddl.go @@ -82,9 +82,9 @@ func (c *DDLCase) Execute(ctx context.Context, dbss [][]*sql.DB, exeDDLFunc Exec } } - glog.Infof("[%s] start to test...", c) + glog.V(4).Infof("[%s] start to test...", c) defer func() { - glog.Infof("[%s] test end...", c) + glog.V(4).Infof("[%s] test end...", c) }() var wg sync.WaitGroup for i := 0; i < c.cfg.Concurrency; i++ { @@ -372,8 +372,8 @@ func (c *testCase) execute(executeDDL ExecuteDDLFunc, exeDMLFunc ExecuteDMLFunc) return errors.Trace(err) } - glog.Infof("[ddl] [instance %d] Round completed", c.caseIndex) - glog.Infof("[ddl] [instance %d] Executing post round operations...", c.caseIndex) + glog.V(4).Infof("[ddl] [instance %d] Round completed", c.caseIndex) + glog.V(4).Infof("[ddl] [instance %d] Executing post round operations...", c.caseIndex) if !c.cfg.MySQLCompatible { err := c.executeAdminCheck() @@ -421,7 +421,7 @@ func (c *testCase) executeVerifyIntegrity() error { // execute opStart := time.Now() rows, err := db.Query(sql) - glog.Infof("[ddl] [instance %d] %s, elapsed time:%v, got table time:%v, selectID:%v", c.caseIndex, sql, time.Since(opStart).Seconds(), gotTableTime, uniqID) + glog.V(4).Infof("[ddl] [instance %d] %s, elapsed time:%v, got table time:%v, selectID:%v", c.caseIndex, sql, time.Since(opStart).Seconds(), gotTableTime, uniqID) if err == nil { defer rows.Close() } @@ -446,7 +446,7 @@ func (c *testCase) executeVerifyIntegrity() error { return errors.Trace(err) } - glog.Infof("[ddl] [instance %d] rows.Columns():%v, len(cols):%v, selectID:%v", c.caseIndex, cols, len(cols), uniqID) + glog.V(4).Infof("[ddl] [instance %d] rows.Columns():%v, len(cols):%v, selectID:%v", c.caseIndex, cols, len(cols), uniqID) // See https://stackoverflow.com/questions/14477941/read-select-columns-into-string-in-go rawResult := make([][]byte, len(cols)) @@ -514,14 +514,14 @@ func (c *testCase) executeVerifyIntegrity() error { if !ok { c.stopTest() err = fmt.Errorf("Expecting row %s in table `%s` but not found, sql: %s, selectID:%v, checkTime:%v, rowErr:%v, actualRowsMap:%#v\n%s", rowString, table.name, sql, uniqID, checkTime, rows.Err(), actualRowsMap, table.debugPrintToString()) - glog.Infof("err: %v", err) + glog.V(4).Infof("err: %v", err) return errors.Trace(err) } actualRowsMap[rowString]-- if actualRowsMap[rowString] < 0 { c.stopTest() err = fmt.Errorf("Expecting row %s in table `%s` but not found, sql: %s, selectID:%v, checkTime:%v, rowErr:%v, actualRowsMap:%#v\n%s", rowString, table.name, sql, uniqID, checkTime, rows.Err(), actualRowsMap, table.debugPrintToString()) - glog.Infof("err: %v", err) + glog.V(4).Infof("err: %v", err) return errors.Trace(err) } } @@ -529,7 +529,7 @@ func (c *testCase) executeVerifyIntegrity() error { if occurs > 0 { c.stopTest() err = fmt.Errorf("Unexpected row %s in table `%s`, sql: %s, selectID:%v, checkTime:%v, rowErr:%v, actualRowsMap:%#v\n%s", rowString, table.name, sql, uniqID, checkTime, rows.Err(), actualRowsMap, table.debugPrintToString()) - glog.Infof("err: %v", err) + glog.V(4).Infof("err: %v", err) return errors.Trace(err) } } @@ -572,7 +572,7 @@ func (c *testCase) executeAdminCheck() error { dbIdx := rand.Intn(len(c.dbs)) db := c.dbs[dbIdx] // execute - glog.Infof("[ddl] [instance %d] %s", c.caseIndex, sql) + glog.V(4).Infof("[ddl] [instance %d] %s", c.caseIndex, sql) _, err := db.Exec(sql) if err != nil { if ignore_error(err) { diff --git a/tests/pkg/workload/ddl/internal/ddl_ops.go b/tests/pkg/workload/ddl/internal/ddl_ops.go index 5ef4edc866..6f3a33852a 100644 --- a/tests/pkg/workload/ddl/internal/ddl_ops.go +++ b/tests/pkg/workload/ddl/internal/ddl_ops.go @@ -156,7 +156,7 @@ func (c *testCase) execParaDDLSQL(taskCh chan *ddlJobTask, num int) error { opStart := time.Now() db := c.dbs[0] _, err := db.Exec(task.sql) - glog.Infof("[ddl] [instance %d] TiDB execute %s , err %v, table_id %s, elapsed time:%v", c.caseIndex, task.sql, err, task.tblInfo.id, time.Since(opStart).Seconds()) + glog.V(4).Infof("[ddl] [instance %d] TiDB execute %s , err %v, table_id %s, elapsed time:%v", c.caseIndex, task.sql, err, task.tblInfo.id, time.Since(opStart).Seconds()) task.err = err }(task) } @@ -168,7 +168,7 @@ func (c *testCase) execParaDDLSQL(taskCh chan *ddlJobTask, num int) error { } for _, task := range SortTasks { err := c.updateTableInfo(task) - glog.Infof("[ddl] [instance %d] local execute %s, err %v , table_id %s, ddlID %v", c.caseIndex, task.sql, err, task.tblInfo.id, task.ddlID) + glog.V(4).Infof("[ddl] [instance %d] local execute %s, err %v , table_id %s, ddlID %v", c.caseIndex, task.sql, err, task.tblInfo.id, task.ddlID) if err == nil && task.err != nil || err != nil && task.err == nil { return fmt.Errorf("Error when executing SQL: %s\n, local err: %#v, remote tidb err: %#v\n%s\n", task.sql, err, task.err, task.tblInfo.debugPrintToString()) } @@ -185,7 +185,7 @@ func (c *testCase) execSerialDDLSQL(taskCh chan *ddlJobTask) error { db := c.dbs[0] opStart := time.Now() _, err := db.Exec(task.sql) - glog.Infof("[ddl] [instance %d] %s, elapsed time:%v", c.caseIndex, task.sql, time.Since(opStart).Seconds()) + glog.V(4).Infof("[ddl] [instance %d] %s, elapsed time:%v", c.caseIndex, task.sql, time.Since(opStart).Seconds()) if err != nil { return fmt.Errorf("Error when executing SQL: %s\n remote tidb Err: %#v\n%s\n", task.sql, err, task.tblInfo.debugPrintToString()) } @@ -702,7 +702,7 @@ func (c *testCase) getHistoryDDLJobs(db *sql.DB, tasks []*ddlJobTask) ([]*ddlJob // execute opStart := time.Now() rows, err := db.Query(sql) - glog.Infof("%s, elapsed time:%v", sql, time.Since(opStart).Seconds()) + glog.V(4).Infof("%s, elapsed time:%v", sql, time.Since(opStart).Seconds()) if err != nil { return nil, err } diff --git a/tests/pkg/workload/ddl/internal/dml_ops.go b/tests/pkg/workload/ddl/internal/dml_ops.go index 2f03fbf30c..742d80cc6e 100644 --- a/tests/pkg/workload/ddl/internal/dml_ops.go +++ b/tests/pkg/workload/ddl/internal/dml_ops.go @@ -84,7 +84,7 @@ func checkConflict(task *dmlJobTask) error { func (c *testCase) sendDMLRequest(ctx context.Context, conn *sql.Conn, task *dmlJobTask) error { _, err := conn.ExecContext(ctx, task.sql) task.err = err - glog.Infof("[ddl] [instance %d] %s, err: %v", c.caseIndex, task.sql, err) + glog.V(4).Infof("[ddl] [instance %d] %s, err: %v", c.caseIndex, task.sql, err) if err != nil { err2 := checkConflict(task) if err2 != nil { @@ -148,7 +148,7 @@ func (c *testCase) execDMLInTransactionSQL(taskCh chan *dmlJobTask) error { defer conn.Close() _, err = conn.ExecContext(ctx, "begin") - glog.Infof("[ddl] [instance %d] begin error: %v", c.caseIndex, err) + glog.V(4).Infof("[ddl] [instance %d] begin error: %v", c.caseIndex, err) if err != nil { return errors.Annotatef(err, "Error when executing SQL: %s", "begin") } @@ -164,7 +164,7 @@ func (c *testCase) execDMLInTransactionSQL(taskCh chan *dmlJobTask) error { } _, err = conn.ExecContext(ctx, "commit") - glog.Infof("[ddl] [instance %d] commit error: %v", c.caseIndex, err) + glog.V(4).Infof("[ddl] [instance %d] commit error: %v", c.caseIndex, err) if err != nil { if ignore_error(err) { return nil diff --git a/tests/pkg/workload/ddl/internal/run.go b/tests/pkg/workload/ddl/internal/run.go index 182b4b99d6..a55160cbb2 100644 --- a/tests/pkg/workload/ddl/internal/run.go +++ b/tests/pkg/workload/ddl/internal/run.go @@ -44,12 +44,12 @@ func openDB(dsn string, maxIdleConns int) (*sql.DB, error) { } db.SetMaxIdleConns(maxIdleConns) - glog.Info("DB opens successfully") + glog.V(4).Info("DB opens successfully") return db, nil } func Run(ctx context.Context, dbDSN string, concurrency int, tablesToCreate int, mysqlCompatible bool, testTp DDLTestType) { - glog.Infof("[ddl] Enable transaction test is: %v", enableTransactionTest) + glog.V(4).Infof("[ddl] Enable transaction test is: %v", enableTransactionTest) dbss := make([][]*sql.DB, 0, concurrency) for i := 0; i < concurrency; i++ { diff --git a/tests/util.go b/tests/util.go deleted file mode 100644 index ba3e11806b..0000000000 --- a/tests/util.go +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2018 PingCAP, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// See the License for the specific language governing permissions and -// limitations under the License.package spec - -package tests - -import ( - "time" - - "github.com/pingcap/tidb-operator/pkg/client/clientset/versioned" - "k8s.io/client-go/kubernetes" - "k8s.io/client-go/rest" -) - -func CreateKubeClient() (versioned.Interface, kubernetes.Interface, error) { - cfg, err := rest.InClusterConfig() - if err != nil { - return nil, nil, err - } - cfg.Timeout = 30 * time.Second - operatorCli, err := versioned.NewForConfig(cfg) - if err != nil { - return nil, nil, err - } - - kubeCli, err := kubernetes.NewForConfig(cfg) - if err != nil { - return nil, nil, err - } - - return operatorCli, kubeCli, nil -} From 5af1c2ad072cd6b242c24f2451299406266dd884 Mon Sep 17 00:00:00 2001 From: weekface Date: Wed, 3 Apr 2019 15:05:02 +0800 Subject: [PATCH 2/4] make go vet happy --- tests/cmd/stability/main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/cmd/stability/main.go b/tests/cmd/stability/main.go index 1f1fad29b9..b639d60c07 100644 --- a/tests/cmd/stability/main.go +++ b/tests/cmd/stability/main.go @@ -57,7 +57,7 @@ func main() { // create database and table and insert a column for test backup and restore initSql := `"create database record;use record;create table test(t char(32))"` - // two clusters in diffrent namespaces + // two clusters in different namespaces clusterName1 := "stability-cluster1" clusterName2 := "stability-cluster2" cluster1 := &tests.TidbClusterConfig{ From 85ee2214b76563c53beb29726f79f803580afa46 Mon Sep 17 00:00:00 2001 From: weekface Date: Wed, 3 Apr 2019 17:01:08 +0800 Subject: [PATCH 3/4] address comment --- tests/actions.go | 63 +++++++++++-------------------------- tests/cmd/stability/main.go | 13 +++----- tests/failover.go | 40 +++++++++++++++++++++++ 3 files changed, 63 insertions(+), 53 deletions(-) diff --git a/tests/actions.go b/tests/actions.go index 11bd9ac478..33082ccc3f 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -29,12 +29,6 @@ import ( "github.com/golang/glog" pingcapErrors "github.com/pingcap/errors" "github.com/pingcap/kvproto/pkg/metapb" - "github.com/pingcap/tidb-operator/pkg/apis/pingcap.com/v1alpha1" - "github.com/pingcap/tidb-operator/pkg/client/clientset/versioned" - "github.com/pingcap/tidb-operator/pkg/controller" - "github.com/pingcap/tidb-operator/pkg/label" - "github.com/pingcap/tidb-operator/tests/pkg/blockwriter" - "github.com/pingcap/tidb-operator/tests/pkg/util" "k8s.io/api/apps/v1beta1" batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" @@ -44,6 +38,13 @@ import ( "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/kubernetes" + + "github.com/pingcap/tidb-operator/pkg/apis/pingcap.com/v1alpha1" + "github.com/pingcap/tidb-operator/pkg/client/clientset/versioned" + "github.com/pingcap/tidb-operator/pkg/controller" + "github.com/pingcap/tidb-operator/pkg/label" + "github.com/pingcap/tidb-operator/tests/pkg/blockwriter" + "github.com/pingcap/tidb-operator/tests/pkg/util" ) const ( @@ -111,6 +112,8 @@ type OperatorActions interface { CheckFailoverPendingOrDie(clusters []*TidbClusterConfig, faultPoint *time.Time) CheckFailover(info *TidbClusterConfig, faultNode string) (bool, error) CheckFailoverOrDie(clusters []*TidbClusterConfig, faultNode string) + CheckRecover(cluster *TidbClusterConfig) (bool, error) + CheckRecoverOrDie(clusters []*TidbClusterConfig) } type operatorActions struct { @@ -293,9 +296,6 @@ func (oa *operatorActions) UpgradeOperator(info *OperatorConfig) error { func (oa *operatorActions) DeployTidbCluster(info *TidbClusterConfig) error { glog.Infof("begin to deploy tidb cluster cluster[%s] namespace[%s]", info.ClusterName, info.Namespace) - defer func() { - glog.Infof("deploy tidb cluster end cluster[%s] namespace[%s]", info.ClusterName, info.Namespace) - }() namespace := &corev1.Namespace{ ObjectMeta: metav1.ObjectMeta{ @@ -337,10 +337,8 @@ func (oa *operatorActions) DeployTidbClusterOrDie(info *TidbClusterConfig) { } func (oa *operatorActions) CleanTidbCluster(info *TidbClusterConfig) error { - glog.Infof("begin to clean tidb cluster[%s] namespace[%s]", info.ClusterName, info.Namespace) - defer func() { - glog.Infof("clean tidb cluster end cluster[%s] namespace[%s]", info.ClusterName, info.Namespace) - }() + glog.Infof("cleaning tidbcluster %s/%s", info.Namespace, info.ClusterName) + charts := []string{ info.ClusterName, fmt.Sprintf("%s-backup", info.ClusterName), @@ -356,7 +354,6 @@ func (oa *operatorActions) CleanTidbCluster(info *TidbClusterConfig) error { } err := oa.kubeCli.CoreV1().Pods(info.Namespace).Delete(getBackupDirPodName, &metav1.DeleteOptions{}) - if err != nil && !errors.IsNotFound(err) { return fmt.Errorf("failed to delete dir pod %v", err) } @@ -380,7 +377,7 @@ func (oa *operatorActions) CleanTidbCluster(info *TidbClusterConfig) error { patchPVCmd := fmt.Sprintf(`kubectl get pv -l %s=%s,%s=%s --output=name | xargs -I {} \ kubectl patch {} -p '{"spec":{"persistentVolumeReclaimPolicy":"Delete"}}'`, label.NamespaceLabelKey, info.Namespace, label.InstanceLabelKey, info.ClusterName) - glog.Info(patchPVCmd) + glog.V(4).Info(patchPVCmd) if res, err := exec.Command("/bin/sh", "-c", patchPVCmd).CombinedOutput(); err != nil { return fmt.Errorf("failed to patch pv: %v, %s", err, string(res)) } @@ -388,14 +385,14 @@ func (oa *operatorActions) CleanTidbCluster(info *TidbClusterConfig) error { pollFn := func() (bool, error) { if res, err := exec.Command("kubectl", "get", "po", "--output=name", "-n", info.Namespace, "-l", setStr). CombinedOutput(); err != nil || len(res) != 0 { - glog.Infof("waiting for tidbcluster: %s/%s pods deleting, %v, [%s]", + glog.V(4).Infof("waiting for tidbcluster: %s/%s pods deleting, %v, [%s]", info.Namespace, info.ClusterName, err, string(res)) return false, nil } pvCmd := fmt.Sprintf("kubectl get pv -l %s=%s,%s=%s 2>/dev/null|grep Released", label.NamespaceLabelKey, info.Namespace, label.InstanceLabelKey, info.ClusterName) - glog.Info(pvCmd) + glog.V(4).Info(pvCmd) if res, err := exec.Command("/bin/sh", "-c", pvCmd). CombinedOutput(); len(res) == 0 { } else if err != nil { @@ -416,9 +413,7 @@ func (oa *operatorActions) CleanTidbClusterOrDie(info *TidbClusterConfig) { func (oa *operatorActions) CheckTidbClusterStatus(info *TidbClusterConfig) error { glog.Infof("begin to check tidb cluster cluster[%s] namespace[%s]", info.ClusterName, info.Namespace) - defer func() { - glog.Infof("check tidb cluster end cluster[%s] namespace[%s]", info.ClusterName, info.Namespace) - }() + ns := info.Namespace tcName := info.ClusterName if err := wait.PollImmediate(DefaultPollInterval, DefaultPollTimeout, func() (bool, error) { @@ -1364,9 +1359,7 @@ func checkoutTag(tagName string) error { func (oa *operatorActions) DeployAdHocBackup(info *TidbClusterConfig) error { glog.Infof("begin to deploy adhoc backup cluster[%s] namespace[%s]", info.ClusterName, info.Namespace) - defer func() { - glog.Infof("deploy adhoc backup end cluster[%s] namespace[%s]", info.ClusterName, info.Namespace) - }() + sets := map[string]string{ "name": info.BackupPVC, "mode": "backup", @@ -1391,9 +1384,6 @@ func (oa *operatorActions) DeployAdHocBackup(info *TidbClusterConfig) error { func (oa *operatorActions) CheckAdHocBackup(info *TidbClusterConfig) error { glog.Infof("begin to clean adhoc backup cluster[%s] namespace[%s]", info.ClusterName, info.Namespace) - defer func() { - glog.Infof("deploy clean backup end cluster[%s] namespace[%s]", info.ClusterName, info.Namespace) - }() jobName := fmt.Sprintf("%s-%s", info.ClusterName, info.BackupPVC) fn := func() (bool, error) { @@ -1420,9 +1410,7 @@ func (oa *operatorActions) CheckAdHocBackup(info *TidbClusterConfig) error { func (oa *operatorActions) Restore(from *TidbClusterConfig, to *TidbClusterConfig) error { glog.Infof("begin to deploy restore cluster[%s] namespace[%s]", from.ClusterName, from.Namespace) - defer func() { - glog.Infof("deploy restore end cluster[%s] namespace[%s]", to.ClusterName, to.Namespace) - }() + sets := map[string]string{ "name": to.BackupPVC, "mode": "restore", @@ -1447,10 +1435,6 @@ func (oa *operatorActions) Restore(from *TidbClusterConfig, to *TidbClusterConfi func (oa *operatorActions) CheckRestore(from *TidbClusterConfig, to *TidbClusterConfig) error { glog.Infof("begin to check restore backup cluster[%s] namespace[%s]", from.ClusterName, from.Namespace) - defer func() { - glog.Infof("check restore end cluster[%s] namespace[%s]", to.ClusterName, to.Namespace) - }() - jobName := fmt.Sprintf("%s-restore-%s", to.ClusterName, from.BackupPVC) fn := func() (bool, error) { job, err := oa.kubeCli.BatchV1().Jobs(to.Namespace).Get(jobName, metav1.GetOptions{}) @@ -1570,9 +1554,6 @@ func releaseIsExist(err error) bool { func (oa *operatorActions) DeployScheduledBackup(info *TidbClusterConfig) error { glog.Infof("begin to deploy scheduled backup") - defer func() { - glog.Infof("deploy shceduled backup end") - }() cron := fmt.Sprintf("'*/1 * * * *'") sets := map[string]string{ @@ -1600,9 +1581,6 @@ func (oa *operatorActions) DeployScheduledBackup(info *TidbClusterConfig) error func (oa *operatorActions) CheckScheduledBackup(info *TidbClusterConfig) error { glog.Infof("begin to check scheduler backup cluster[%s] namespace[%s]", info.ClusterName, info.Namespace) - defer func() { - glog.Infof("deploy check scheduler end cluster[%s] namespace[%s]", info.ClusterName, info.Namespace) - }() jobName := fmt.Sprintf("%s-scheduled-backup", info.ClusterName) fn := func() (bool, error) { @@ -1764,9 +1742,7 @@ func (info *TidbClusterConfig) FullName() string { func (oa *operatorActions) DeployIncrementalBackup(from *TidbClusterConfig, to *TidbClusterConfig) error { glog.Infof("begin to deploy incremental backup cluster[%s] namespace[%s]", from.ClusterName, from.Namespace) - defer func() { - glog.Infof("deploy incremental backup end cluster[%s] namespace[%s]", to.ClusterName, to.Namespace) - }() + sets := map[string]string{ "binlog.pump.create": "true", "binlog.drainer.destDBType": "mysql", @@ -1791,9 +1767,6 @@ func (oa *operatorActions) DeployIncrementalBackup(from *TidbClusterConfig, to * func (oa *operatorActions) CheckIncrementalBackup(info *TidbClusterConfig) error { glog.Infof("begin to check incremental backup cluster[%s] namespace[%s]", info.ClusterName, info.Namespace) - defer func() { - glog.Infof("check incremental backup end cluster[%s] namespace[%s]", info.ClusterName, info.Namespace) - }() pumpStatefulSetName := fmt.Sprintf("%s-pump", info.ClusterName) fn := func() (bool, error) { diff --git a/tests/cmd/stability/main.go b/tests/cmd/stability/main.go index b639d60c07..e26177b88d 100644 --- a/tests/cmd/stability/main.go +++ b/tests/cmd/stability/main.go @@ -153,14 +153,10 @@ func main() { oa.CheckTidbClusterStatusOrDie(cluster1) oa.CheckTidbClusterStatusOrDie(cluster2) - // TODO insert data - //for _, clusterInfo := range clusterInfos { - // go func() { - // if err = oa.BeginInsertDataTo(clusterInfo); err != nil { - // glog.Fatal(err) - // } - // }() - //} + //go func() { + // oa.BeginInsertDataTo(cluster1) + // oa.BeginInsertDataTo(cluster2) + //}() // TODO add DDL //var workloads []workload.Workload @@ -213,6 +209,7 @@ func main() { oa.CheckFailoverOrDie(allClusters, node) time.Sleep(3 * time.Minute) fta.StartNodeOrDie(physicalNode, node) + oa.CheckRecoverOrDie(allClusters) for _, cluster := range allClusters { oa.CheckTidbClusterStatusOrDie(cluster) } diff --git a/tests/failover.go b/tests/failover.go index 1c8740255c..ea1631909d 100644 --- a/tests/failover.go +++ b/tests/failover.go @@ -145,6 +145,46 @@ func (oa *operatorActions) CheckFailoverOrDie(clusters []*TidbClusterConfig, fau } } +func (oa *operatorActions) CheckRecover(cluster *TidbClusterConfig) (bool, error) { + tc, err := oa.cli.PingcapV1alpha1().TidbClusters(cluster.Namespace).Get(cluster.ClusterName, metav1.GetOptions{}) + if err != nil { + return false, nil + } + + if tc.Status.PD.FailureMembers != nil && len(tc.Status.PD.FailureMembers) > 0 { + glog.Infof("cluster: [%s]'s pd FailureMembers is not nil, continue to wait", cluster.FullName()) + return false, nil + } + + if tc.Status.TiDB.FailureMembers != nil && len(tc.Status.TiDB.FailureMembers) > 0 { + glog.Infof("cluster: [%s]'s tidb FailureMembers is not nil, continue to wait", cluster.FullName()) + return false, nil + } + + return true, nil +} + +func (oa *operatorActions) CheckRecoverOrDie(clusters []*TidbClusterConfig) { + if err := wait.Poll(DefaultPollInterval, DefaultPollTimeout, func() (bool, error) { + var passes []bool + for i := range clusters { + pass, err := oa.CheckRecover(clusters[i]) + if err != nil { + return pass, err + } + passes = append(passes, pass) + } + for _, pass := range passes { + if !pass { + return false, nil + } + } + return true, nil + }); err != nil { + panic("failed to check recover") + } +} + func (oa *operatorActions) pdFailover(pod *corev1.Pod, tc *v1alpha1.TidbCluster) bool { failure := false for _, failureMember := range tc.Status.PD.FailureMembers { From fdd44a7097264c10e01b63e3a92920aad310f790 Mon Sep 17 00:00:00 2001 From: weekface Date: Thu, 4 Apr 2019 15:01:03 +0800 Subject: [PATCH 4/4] Update tests/actions.go --- tests/actions.go | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/actions.go b/tests/actions.go index 33082ccc3f..ddbc3774b6 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -107,7 +107,6 @@ type OperatorActions interface { CreateSecret(info *TidbClusterConfig) error GetPodUIDMap(info *TidbClusterConfig) (map[string]types.UID, error) GetNodeMap(info *TidbClusterConfig, component string) (map[string][]string, error) - getBackupDir(info *TidbClusterConfig) ([]string, error) CheckFailoverPending(info *TidbClusterConfig, faultPoint *time.Time) (bool, error) CheckFailoverPendingOrDie(clusters []*TidbClusterConfig, faultPoint *time.Time) CheckFailover(info *TidbClusterConfig, faultNode string) (bool, error)