From 01aeea3dc7981aa49faf7b2f279d4d82263e9987 Mon Sep 17 00:00:00 2001 From: Aylei Date: Tue, 16 Apr 2019 01:00:16 +0800 Subject: [PATCH 01/14] Support running stability test out of cluster Signed-off-by: Aylei --- tests/actions.go | 16 ++++++++-------- tests/cmd/stability/main.go | 11 +++++++++-- tests/config.go | 23 +++++++++++++++++++++++ tests/pkg/client/client.go | 30 ++++++++++++++++++++++-------- 4 files changed, 62 insertions(+), 18 deletions(-) diff --git a/tests/actions.go b/tests/actions.go index 35d6d52181..45f015d909 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -234,10 +234,10 @@ func (oi *OperatorConfig) OperatorHelmSetString(m map[string]string) string { func (oa *operatorActions) DeployOperator(info *OperatorConfig) error { if info.Tag != "e2e" { - if err := cloneOperatorRepo(); err != nil { + if err := oa.cloneOperatorRepo(); err != nil { return err } - if err := checkoutTag(info.Tag); err != nil { + if err := oa.checkoutTag(info.Tag); err != nil { return err } } @@ -280,7 +280,7 @@ func (oa *operatorActions) CleanOperatorOrDie(info *OperatorConfig) { } func (oa *operatorActions) UpgradeOperator(info *OperatorConfig) error { - if err := checkoutTag(info.Tag); err != nil { + if err := oa.checkoutTag(info.Tag); err != nil { return err } @@ -1329,8 +1329,8 @@ func releaseIsNotFound(err error) bool { return strings.Contains(err.Error(), "not found") } -func cloneOperatorRepo() error { - cmd := fmt.Sprintf("git clone https://github.com/pingcap/tidb-operator.git /tidb-operator") +func (oa *operatorActions) cloneOperatorRepo() error { + cmd := fmt.Sprintf("git clone https://github.com/pingcap/tidb-operator.git %s", oa.cfg.GitRepoDir) glog.Info(cmd) res, err := exec.Command("/bin/sh", "-c", cmd).CombinedOutput() if err != nil && !strings.Contains(string(res), "already exists") { @@ -1340,15 +1340,15 @@ func cloneOperatorRepo() error { return nil } -func checkoutTag(tagName string) error { - cmd := fmt.Sprintf(`cd /tidb-operator && +func (oa *operatorActions) checkoutTag(tagName string) error { + cmd := fmt.Sprintf(`cd %s && git stash -u && git checkout %s && mkdir -p /charts/%s && cp -rf charts/tidb-operator /charts/%s/tidb-operator && cp -rf charts/tidb-cluster /charts/%s/tidb-cluster && cp -rf charts/tidb-backup /charts/%s/tidb-backup`, - tagName, tagName, tagName, tagName, tagName) + oa.cfg.GitRepoDir, tagName, tagName, tagName, tagName, tagName) glog.Info(cmd) res, err := exec.Command("/bin/sh", "-c", cmd).CombinedOutput() if err != nil { diff --git a/tests/cmd/stability/main.go b/tests/cmd/stability/main.go index f3c105bf67..aba165e279 100644 --- a/tests/cmd/stability/main.go +++ b/tests/cmd/stability/main.go @@ -15,6 +15,8 @@ package main import ( "fmt" + "github.com/pingcap/tidb-operator/pkg/client/clientset/versioned" + "k8s.io/client-go/kubernetes" "net/http" _ "net/http/pprof" "time" @@ -31,13 +33,18 @@ import ( func main() { logs.InitLogs() defer logs.FlushLogs() - go func() { glog.Info(http.ListenAndServe("localhost:6060", nil)) }() conf := tests.ParseConfigOrDie() - cli, kubeCli := client.NewCliOrDie() + var cli versioned.Interface + var kubeCli kubernetes.Interface + if conf.OutOfCluster { + cli, kubeCli = client.NewOutOfClusterCliOrDie(conf.KubeConfigPath) + } else { + cli, kubeCli = client.NewCliOrDie() + } oa := tests.NewOperatorActions(cli, kubeCli, conf) fta := tests.NewFaultTriggerAction(cli, kubeCli, conf) fta.CheckAndRecoverEnvOrDie() diff --git a/tests/config.go b/tests/config.go index ff182258c7..ee98f02889 100644 --- a/tests/config.go +++ 
b/tests/config.go @@ -4,6 +4,8 @@ import ( "flag" "fmt" "io/ioutil" + "os" + "path/filepath" "strings" "github.com/golang/glog" @@ -22,6 +24,11 @@ type Config struct { Nodes []Nodes `yaml:"nodes" json:"nodes"` ETCDs []Nodes `yaml:"etcds" json:"etcds"` APIServers []Nodes `yaml:"apiservers" json:"apiservers"` + + // For local test + GitRepoDir string `yaml:"git_repo_dir" json:"gir_repo_dir"` + OutOfCluster bool `yaml:"out_of_cluster" json:"out_of_cluster"` + KubeConfigPath string `yaml:"kube_config_path" json:"kube_config_path"` } // Nodes defines a series of nodes that belong to the same physical node. @@ -39,6 +46,15 @@ func NewConfig() *Config { flag.StringVar(&cfg.TidbVersions, "tidb-versions", "v2.1.3,v2.1.4", "tidb versions") flag.StringVar(&cfg.OperatorTag, "operator-tag", "master", "operator tag used to choose charts") flag.StringVar(&cfg.OperatorImage, "operator-image", "pingcap/tidb-operator:latest", "operator image") + flag.StringVar(&cfg.GitRepoDir, "git-repo-dir", "/tidb-operator", "local directory to which tidb-operator cloned") + if home := homeDir(); home != "" { + flag.StringVar(&cfg.KubeConfigPath, "kubeconfig", filepath.Join(home, ".kube", "config"), "absolute path to the kubeconfig file") + } else { + // cannot get homedir, kube config file path must be provided explicitly + flag.StringVar(&cfg.KubeConfigPath, "kubeconfig", "", "absolute path to the kubeconfig file") + } + flag.BoolVar(&cfg.OutOfCluster, "out-of-cluster", false, "whether stability test runs out of cluster.") + flag.Parse() return cfg } @@ -115,3 +131,10 @@ func (c *Config) GetUpgradeTidbVersionsOrDie() []string { return versions } + +func homeDir() string { + if h := os.Getenv("HOME"); h != "" { + return h + } + return os.Getenv("USERPROFILE") // for windows +} diff --git a/tests/pkg/client/client.go b/tests/pkg/client/client.go index 9c9e6f4354..80be376ed1 100644 --- a/tests/pkg/client/client.go +++ b/tests/pkg/client/client.go @@ -17,18 +17,17 @@ func NewCliOrDie() (versioned.Interface, kubernetes.Interface) { panic(err) } - cfg.Timeout = 30 * time.Second - cli, err := versioned.NewForConfig(cfg) - if err != nil { - panic(err) - } + return buildClientsOrDie(cfg) +} - kubeCli, err := kubernetes.NewForConfig(cfg) +func NewOutOfClusterCliOrDie(kubeconfig string) (versioned.Interface, kubernetes.Interface) { + // TODO: support context selection, current context will be used now + cfg, err := clientcmd.BuildConfigFromFlags("", kubeconfig) if err != nil { - panic(err) + panic(err.Error()) } - return cli, kubeCli + return buildClientsOrDie(cfg) } var ( @@ -74,3 +73,18 @@ func LoadConfig() (*rest.Config, error) { cfg, err := clientcmd.BuildConfigFromFlags(masterUrl, kubeconfigPath) return cfg, errors.Trace(err) } + +func buildClientsOrDie(cfg *rest.Config) (versioned.Interface, kubernetes.Interface) { + cfg.Timeout = 30 * time.Second + cli, err := versioned.NewForConfig(cfg) + if err != nil { + panic(err) + } + + kubeCli, err := kubernetes.NewForConfig(cfg) + if err != nil { + panic(err) + } + + return cli, kubeCli +} From 3524ce1d5bd175213171ee563c8ed273b9119bdb Mon Sep 17 00:00:00 2001 From: Aylei Date: Tue, 16 Apr 2019 12:01:43 +0800 Subject: [PATCH 02/14] Gofmt Signed-off-by: Aylei --- tests/config.go | 6 +++--- tests/fault.go | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/config.go b/tests/config.go index ee98f02889..080107a4be 100644 --- a/tests/config.go +++ b/tests/config.go @@ -26,9 +26,9 @@ type Config struct { APIServers []Nodes `yaml:"apiservers" json:"apiservers"` // For 
local test - GitRepoDir string `yaml:"git_repo_dir" json:"gir_repo_dir"` - OutOfCluster bool `yaml:"out_of_cluster" json:"out_of_cluster"` - KubeConfigPath string `yaml:"kube_config_path" json:"kube_config_path"` + GitRepoDir string `yaml:"git_repo_dir" json:"gir_repo_dir"` + OutOfCluster bool `yaml:"out_of_cluster" json:"out_of_cluster"` + KubeConfigPath string `yaml:"kube_config_path" json:"kube_config_path"` } // Nodes defines a series of nodes that belong to the same physical node. diff --git a/tests/fault.go b/tests/fault.go index 0f97efdf26..d1b2862507 100644 --- a/tests/fault.go +++ b/tests/fault.go @@ -110,7 +110,7 @@ func (fa *faultTriggerActions) CheckAndRecoverEnv() error { } func (fa *faultTriggerActions) CheckAndRecoverEnvOrDie() { - if err:=fa.CheckAndRecoverEnv();err!=nil{ + if err := fa.CheckAndRecoverEnv(); err != nil { glog.Fatal(err) } } From 539475c7b1c15fab8a2c0cea93c0c1e254eb7304 Mon Sep 17 00:00:00 2001 From: Aylei Date: Tue, 16 Apr 2019 14:07:03 +0800 Subject: [PATCH 03/14] Address comments Signed-off-by: Aylei --- tests/actions.go | 4 +-- tests/cmd/stability/main.go | 12 ++------- tests/config.go | 24 +++-------------- tests/pkg/client/client.go | 51 ++++++++++++++++++++++++++++--------- 4 files changed, 46 insertions(+), 45 deletions(-) diff --git a/tests/actions.go b/tests/actions.go index a03a3b4a2f..02af71ef7a 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -1319,7 +1319,7 @@ func releaseIsNotFound(err error) bool { } func (oa *operatorActions) cloneOperatorRepo() error { - cmd := fmt.Sprintf("git clone https://github.com/pingcap/tidb-operator.git %s", oa.cfg.GitRepoDir) + cmd := fmt.Sprintf("git clone https://github.com/pingcap/tidb-operator.git %s", oa.cfg.OperatorRepoDir) glog.Info(cmd) res, err := exec.Command("/bin/sh", "-c", cmd).CombinedOutput() if err != nil && !strings.Contains(string(res), "already exists") { @@ -1337,7 +1337,7 @@ func (oa *operatorActions) checkoutTag(tagName string) error { cp -rf charts/tidb-operator /charts/%s/tidb-operator && cp -rf charts/tidb-cluster /charts/%s/tidb-cluster && cp -rf charts/tidb-backup /charts/%s/tidb-backup`, - oa.cfg.GitRepoDir, tagName, tagName, tagName, tagName, tagName) + oa.cfg.OperatorRepoDir, tagName, tagName, tagName, tagName, tagName) glog.Info(cmd) res, err := exec.Command("/bin/sh", "-c", cmd).CombinedOutput() if err != nil { diff --git a/tests/cmd/stability/main.go b/tests/cmd/stability/main.go index aba165e279..e9a5733eef 100644 --- a/tests/cmd/stability/main.go +++ b/tests/cmd/stability/main.go @@ -15,19 +15,17 @@ package main import ( "fmt" - "github.com/pingcap/tidb-operator/pkg/client/clientset/versioned" - "k8s.io/client-go/kubernetes" "net/http" _ "net/http/pprof" "time" "github.com/golang/glog" "github.com/jinzhu/copier" + "github.com/pingcap/tidb-operator/tests/pkg/client" "k8s.io/apiserver/pkg/util/logs" "github.com/pingcap/tidb-operator/tests" "github.com/pingcap/tidb-operator/tests/backup" - "github.com/pingcap/tidb-operator/tests/pkg/client" ) func main() { @@ -38,13 +36,7 @@ func main() { }() conf := tests.ParseConfigOrDie() - var cli versioned.Interface - var kubeCli kubernetes.Interface - if conf.OutOfCluster { - cli, kubeCli = client.NewOutOfClusterCliOrDie(conf.KubeConfigPath) - } else { - cli, kubeCli = client.NewCliOrDie() - } + cli, kubeCli := client.NewCliOrDie() oa := tests.NewOperatorActions(cli, kubeCli, conf) fta := tests.NewFaultTriggerAction(cli, kubeCli, conf) fta.CheckAndRecoverEnvOrDie() diff --git a/tests/config.go b/tests/config.go index 
080107a4be..57e5352299 100644 --- a/tests/config.go +++ b/tests/config.go @@ -4,12 +4,10 @@ import ( "flag" "fmt" "io/ioutil" - "os" - "path/filepath" "strings" "github.com/golang/glog" - yaml "gopkg.in/yaml.v2" + "gopkg.in/yaml.v2" ) // Config defines the config of operator tests @@ -26,9 +24,7 @@ type Config struct { APIServers []Nodes `yaml:"apiservers" json:"apiservers"` // For local test - GitRepoDir string `yaml:"git_repo_dir" json:"gir_repo_dir"` - OutOfCluster bool `yaml:"out_of_cluster" json:"out_of_cluster"` - KubeConfigPath string `yaml:"kube_config_path" json:"kube_config_path"` + OperatorRepoDir string `yaml:"git_repo_dir" json:"gir_repo_dir"` } // Nodes defines a series of nodes that belong to the same physical node. @@ -46,14 +42,7 @@ func NewConfig() *Config { flag.StringVar(&cfg.TidbVersions, "tidb-versions", "v2.1.3,v2.1.4", "tidb versions") flag.StringVar(&cfg.OperatorTag, "operator-tag", "master", "operator tag used to choose charts") flag.StringVar(&cfg.OperatorImage, "operator-image", "pingcap/tidb-operator:latest", "operator image") - flag.StringVar(&cfg.GitRepoDir, "git-repo-dir", "/tidb-operator", "local directory to which tidb-operator cloned") - if home := homeDir(); home != "" { - flag.StringVar(&cfg.KubeConfigPath, "kubeconfig", filepath.Join(home, ".kube", "config"), "absolute path to the kubeconfig file") - } else { - // cannot get homedir, kube config file path must be provided explicitly - flag.StringVar(&cfg.KubeConfigPath, "kubeconfig", "", "absolute path to the kubeconfig file") - } - flag.BoolVar(&cfg.OutOfCluster, "out-of-cluster", false, "whether stability test runs out of cluster.") + flag.StringVar(&cfg.OperatorRepoDir, "operator-repo-dir", "/tidb-operator", "local directory to which tidb-operator cloned") flag.Parse() return cfg @@ -131,10 +120,3 @@ func (c *Config) GetUpgradeTidbVersionsOrDie() []string { return versions } - -func homeDir() string { - if h := os.Getenv("HOME"); h != "" { - return h - } - return os.Getenv("USERPROFILE") // for windows -} diff --git a/tests/pkg/client/client.go b/tests/pkg/client/client.go index 80be376ed1..9a662e633c 100644 --- a/tests/pkg/client/client.go +++ b/tests/pkg/client/client.go @@ -1,6 +1,11 @@ package client import ( + "flag" + "fmt" + "os" + "os/user" + "path/filepath" "time" "github.com/juju/errors" @@ -11,8 +16,21 @@ import ( "k8s.io/client-go/tools/clientcmd" ) +var ( + masterUrl string + kubeconfigPath string +) + +func init() { + flag.StringVar(&kubeconfigPath, "kubeconfig", "", + "path to a kubeconfig. Only required if out-of-cluster.") + flag.StringVar(&masterUrl, "master", "", + "address of the Kubernetes API server. Overrides any value in kubeconfig. 
"+ + "Only required if out-of-cluster.") +} + func NewCliOrDie() (versioned.Interface, kubernetes.Interface) { - cfg, err := rest.InClusterConfig() + cfg, err := GetConfig() if err != nil { panic(err) } @@ -20,21 +38,30 @@ func NewCliOrDie() (versioned.Interface, kubernetes.Interface) { return buildClientsOrDie(cfg) } -func NewOutOfClusterCliOrDie(kubeconfig string) (versioned.Interface, kubernetes.Interface) { - // TODO: support context selection, current context will be used now - cfg, err := clientcmd.BuildConfigFromFlags("", kubeconfig) - if err != nil { - panic(err.Error()) +func GetConfig() (*rest.Config, error) { + // If kubeconfigPath provided, use that + if len(kubeconfigPath) > 0 { + return clientcmd.BuildConfigFromFlags(masterUrl, kubeconfigPath) + } + // If an env variable is specified with the config locaiton, use that + if len(os.Getenv("KUBECONFIG")) > 0 { + return clientcmd.BuildConfigFromFlags(masterUrl, os.Getenv("KUBECONFIG")) + } + // If no explicit location, try the in-cluster config + if c, err := rest.InClusterConfig(); err == nil { + return c, nil + } + // If no in-cluster config, try the default location in the user's home directory + if usr, err := user.Current(); err == nil { + if c, err := clientcmd.BuildConfigFromFlags( + "", filepath.Join(usr.HomeDir, ".kube", "config")); err == nil { + return c, nil + } } - return buildClientsOrDie(cfg) + return nil, fmt.Errorf("could not locate a kubeconfig") } -var ( - masterUrl string - kubeconfigPath string -) - type Client interface { kubernetes.Interface PingcapV1alpha1() v1alpha1.PingcapV1alpha1Interface From bd63d8035d7c10965ccad7c19e8dd5b3be3ba0f7 Mon Sep 17 00:00:00 2001 From: Aylei Date: Tue, 16 Apr 2019 14:12:20 +0800 Subject: [PATCH 04/14] Address review comments Signed-off-by: Aylei --- tests/config.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/config.go b/tests/config.go index 57e5352299..7af7be0a73 100644 --- a/tests/config.go +++ b/tests/config.go @@ -24,7 +24,7 @@ type Config struct { APIServers []Nodes `yaml:"apiservers" json:"apiservers"` // For local test - OperatorRepoDir string `yaml:"git_repo_dir" json:"gir_repo_dir"` + OperatorRepoDir string `yaml:"operator_repo_dir" json:"operator_repo_dir"` } // Nodes defines a series of nodes that belong to the same physical node. From 2233ceef37cbe9b9059f1f8943c316d7119f2fb4 Mon Sep 17 00:00:00 2001 From: Aylei Date: Tue, 16 Apr 2019 14:23:01 +0800 Subject: [PATCH 05/14] Add local statbility test doc Signed-off-by: Aylei --- docs/local-stability-test.md | 39 ++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 docs/local-stability-test.md diff --git a/docs/local-stability-test.md b/docs/local-stability-test.md new file mode 100644 index 0000000000..4ed84d1187 --- /dev/null +++ b/docs/local-stability-test.md @@ -0,0 +1,39 @@ +# How to: run stability test in your local environment + +Deploy & witness flow can be tedious when developing stability-test, this PR introduce the ability to run stability-test out of the cluster(your local machine, usually) while still operating the remote cluster. + +### TL;DR: +```shell +$ telepresence --new-deployment ${POD_NAME} +$ go build -o stability ./tests/cmd/stability/main.go +$ ./stability --operator-repo-dir=${ABITRARY_EMPTY_DIR_TO_CLONE_OPERATOR_REPO} +``` + +### Explained + +Generally we have three problems to solve: + +1. 
+    * if `kubeconfig` command line option provided, use it
+    * if `KUBECONFIG` env variable set, use it
+    * try loading `InClusterConfig()`
+    * if no `InClusterConfig()` provided, try loading kubeconfig file from default location (`~/.kube/config`)
+so you will typically get the right client config if you have a proper `~/.kube/config` in your local environment.
+2. **Privilege issue**: If you don't want to or cannot run the stability test with root privileges, change the working dir or create it in advance:
+    * the operator repo dir can be overridden by the option `--operator-repo-dir=xxxx`, but the helm dir must be created manually.
+```shell
+# helm dir
+$ mkdir /charts
+$ chmod 777 /charts
+# operator repo dir if you don't set the command line option
+$ mkdir /tidb-operator
+$ chmod 777 /tidb-operator
+```
+3. **DNS and network issue**: Two-way proxying using Telepresence. We cannot easily resolve cluster DNS names or access cluster IPs from outside, so `telepresence` helps: it creates a proxy pod in the cluster and opens a VPN connection to the Kubernetes cluster via this pod. Just run ([full documentation](https://www.telepresence.io/reference/install)):
+```shell
+$ brew cask install osxfuse
+$ brew install datawire/blackbird/telepresence
+$ telepresence --new-deployment ${POD_NAME}
+```
+**PS**: If you cannot resolve cluster DNS names after setup, try clearing the DNS cache.
+**PPS**: Typically you can't use Telepresence VPN mode with other VPNs (of course SSR is ok).
\ No newline at end of file
From 1c2fc0483f704152adc4309b359482edc71497af Mon Sep 17 00:00:00 2001
From: Aylei
Date: Tue, 16 Apr 2019 14:24:29 +0800
Subject: [PATCH 06/14] Refine documents

Signed-off-by: Aylei

---
 docs/local-stability-test.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/local-stability-test.md b/docs/local-stability-test.md
index 4ed84d1187..dec4ee3880 100644
--- a/docs/local-stability-test.md
+++ b/docs/local-stability-test.md
@@ -1,6 +1,6 @@
 # How to: run stability test in your local environment
 
-Deploy & witness flow can be tedious when developing stability-test, this PR introduces the ability to run stability-test out of the cluster (usually your local machine) while still operating the remote cluster.
+Deploy & witness flow can be tedious when developing stability-test, this document introduces how to run stability-test out of the cluster (usually your local machine) while still operating the remote cluster.
 
### TL;DR: ```shell From 906aa58ac62a43fca6acc254b115d78689af3fe4 Mon Sep 17 00:00:00 2001 From: Aylei Date: Tue, 16 Apr 2019 16:14:35 +0800 Subject: [PATCH 07/14] Temp --- tests/actions.go | 16 ++++++++++++++++ tests/backup/backupcase.go | 23 ++++++++++++----------- tests/cmd/stability/main.go | 16 ++++++++++++---- 3 files changed, 40 insertions(+), 15 deletions(-) diff --git a/tests/actions.go b/tests/actions.go index 02af71ef7a..8ffae00088 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -87,7 +87,9 @@ type OperatorActions interface { CheckTidbClusterStatus(info *TidbClusterConfig) error CheckTidbClusterStatusOrDie(info *TidbClusterConfig) BeginInsertDataTo(info *TidbClusterConfig) error + BeginInsertDataToOrDie(info *TidbClusterConfig) StopInsertDataTo(info *TidbClusterConfig) error + StopInsertDataToOrDie(info *TidbClusterConfig) ScaleTidbCluster(info *TidbClusterConfig) error ScaleTidbClusterOrDie(info *TidbClusterConfig) CheckScaleInSafely(info *TidbClusterConfig) error @@ -490,11 +492,25 @@ func (oa *operatorActions) BeginInsertDataTo(info *TidbClusterConfig) error { return info.blockWriter.Start(db) } +func (oa *operatorActions) BeginInsertDataToOrDie(info *TidbClusterConfig) { + err := oa.BeginInsertDataTo(info) + if err != nil { + panic(err) + } +} + func (oa *operatorActions) StopInsertDataTo(info *TidbClusterConfig) error { info.blockWriter.Stop() return nil } +func (oa *operatorActions) StopInsertDataToOrDie(info *TidbClusterConfig) { + err := oa.StopInsertDataTo(info) + if err != nil { + panic(err) + } +} + func chartPath(name string, tag string) string { return "/charts/" + tag + "/" + name } diff --git a/tests/backup/backupcase.go b/tests/backup/backupcase.go index 71b13c6b9c..eb335504d5 100644 --- a/tests/backup/backupcase.go +++ b/tests/backup/backupcase.go @@ -36,13 +36,14 @@ func NewBackupCase(operator tests.OperatorActions, srcCluster *tests.TidbCluster } func (bc *BackupCase) Run() error { - //err := bc.operator.StopInsertDataTo(bc.srcCluster) - //if err != nil { - // glog.Errorf("cluster:[%s] stop insert data failed,error: %v", bc.srcCluster.ClusterName, err) - // return err - //} - err := bc.operator.DeployAdHocBackup(bc.srcCluster) + err := bc.operator.StopInsertDataTo(bc.srcCluster) + if err != nil { + glog.Errorf("cluster:[%s] stop insert data failed,error: %v", bc.srcCluster.ClusterName, err) + return err + } + + err = bc.operator.DeployAdHocBackup(bc.srcCluster) if err != nil { glog.Errorf("cluster:[%s] deploy happen error: %v", bc.srcCluster.ClusterName, err) return err @@ -119,11 +120,11 @@ func (bc *BackupCase) Run() error { return fmt.Errorf("cluster:[%s] the src cluster data[%d] is not equals des cluster data[%d]", bc.srcCluster.FullName(), srcCount, desCount) } - //err = bc.operator.BeginInsertDataTo(bc.srcCluster) - //if err != nil { - // glog.Errorf("cluster:[%s] begin insert data failed,error: %v", bc.srcCluster.ClusterName, err) - // return err - //} + err = bc.operator.BeginInsertDataTo(bc.srcCluster) + if err != nil { + glog.Errorf("cluster:[%s] begin insert data failed,error: %v", bc.srcCluster.ClusterName, err) + return err + } return nil } diff --git a/tests/cmd/stability/main.go b/tests/cmd/stability/main.go index e9a5733eef..9c8c4eefde 100644 --- a/tests/cmd/stability/main.go +++ b/tests/cmd/stability/main.go @@ -154,10 +154,18 @@ func main() { oa.CheckTidbClusterStatusOrDie(cluster1) oa.CheckTidbClusterStatusOrDie(cluster2) - //go func() { - // oa.BeginInsertDataTo(cluster1) - // oa.BeginInsertDataTo(cluster2) - //}() + defer func() { 
+ if err := oa.StopInsertDataTo(cluster1); err != nil {
+ glog.Errorf("cluster:[%s] stop insert data failed,error: %v", cluster1.ClusterName, err)
+ }
+ if err := oa.StopInsertDataTo(cluster2); err != nil {
+ glog.Errorf("cluster:[%s] stop insert data failed,error: %v", cluster2.ClusterName, err)
+ }
+ }()
+ go func() {
+ oa.BeginInsertDataToOrDie(cluster1)
+ oa.BeginInsertDataToOrDie(cluster2)
+ }()

 // TODO add DDL
 //var workloads []workload.Workload
From 13135e2ff3b910a2c07e8fe483c7b3fd04abddc2 Mon Sep 17 00:00:00 2001
From: Aylei
Date: Tue, 16 Apr 2019 21:01:41 +0800
Subject: [PATCH 08/14] Enable block writer write pressure in stability test

Signed-off-by: Aylei

---
 tests/actions.go | 17 +++++------------
 tests/cmd/stability/main.go | 6 ++----
 tests/config.go | 20 +++++++++++++++++++-
 tests/pkg/blockwriter/blockwriter.go | 26 ++++++++++++++------------
 4 files changed, 40 insertions(+), 29 deletions(-)

diff --git a/tests/actions.go b/tests/actions.go
index 8ffae00088..c8566ae348 100644
--- a/tests/actions.go
+++ b/tests/actions.go
@@ -48,11 +48,6 @@ import (
 )

 const (
- defaultTableNum int = 64
- defaultConcurrency = 512
- defaultBatchSize = 100
- defaultRawSize = 100
-
 period = 5 * time.Minute
 )

@@ -323,12 +318,8 @@ func (oa *operatorActions) DeployTidbCluster(info *TidbClusterConfig) error {
 }

 // init blockWriter case
- info.blockWriter = blockwriter.NewBlockWriterCase(blockwriter.Config{
- TableNum: defaultTableNum,
- Concurrency: defaultConcurrency,
- BatchSize: defaultBatchSize,
- RawSize: defaultRawSize,
- })
+ info.blockWriter = blockwriter.NewBlockWriterCase(oa.cfg.BlockWriter)
+ info.blockWriter.ClusterName = info.ClusterName

 return nil
 }
@@ -484,7 +475,9 @@ func (oa *operatorActions) CheckTidbClusterStatusOrDie(info *TidbClusterConfig)

 func (oa *operatorActions) BeginInsertDataTo(info *TidbClusterConfig) error {
 dsn := getDSN(info.Namespace, info.ClusterName, "test", info.Password)
- db, err := util.OpenDB(dsn, defaultConcurrency)
+ glog.Infof("[%s] [%s] open TiDB connections, concurrency: %d",
+ info.blockWriter, info.ClusterName, oa.cfg.BlockWriter.Concurrency)
+ db, err := util.OpenDB(dsn, oa.cfg.BlockWriter.Concurrency)
 if err != nil {
 return err
 }
diff --git a/tests/cmd/stability/main.go b/tests/cmd/stability/main.go
index 9c8c4eefde..74ad75b42d 100644
--- a/tests/cmd/stability/main.go
+++ b/tests/cmd/stability/main.go
@@ -162,10 +162,8 @@ func main() {
 glog.Errorf("cluster:[%s] stop insert data failed,error: %v", cluster2.ClusterName, err)
 }
 }()
- go func() {
- oa.BeginInsertDataToOrDie(cluster1)
- oa.BeginInsertDataToOrDie(cluster2)
- }()
+ go oa.BeginInsertDataToOrDie(cluster1)
+ go oa.BeginInsertDataToOrDie(cluster2)

 // TODO add DDL
 //var workloads []workload.Workload
diff --git a/tests/config.go b/tests/config.go
index 7af7be0a73..e66b844572 100644
--- a/tests/config.go
+++ b/tests/config.go
@@ -3,6 +3,7 @@ package tests
 import (
 "flag"
 "fmt"
+ "github.com/pingcap/tidb-operator/tests/pkg/blockwriter"
 "io/ioutil"
 "strings"

@@ -10,6 +11,13 @@ import (
 "gopkg.in/yaml.v2"
 )

+const (
+ defaultTableNum int = 64
+ defaultConcurrency = 512
+ defaultBatchSize = 100
+ defaultRawSize = 100
+)
+
 // Config defines the config of operator tests
 type Config struct {
 configFile string
@@ -23,6 +31,9 @@ type Config struct {
 ETCDs []Nodes `yaml:"etcds" json:"etcds"`
 APIServers []Nodes `yaml:"apiservers" json:"apiservers"`

+ // Block writer
+ BlockWriter blockwriter.Config `yaml:"block_writer,omitempty"`
+
 // For local test
 OperatorRepoDir string `yaml:"operator_repo_dir" json:"operator_repo_dir"`
json:"operator_repo_dir"` } @@ -35,7 +46,14 @@ type Nodes struct { // NewConfig creates a new config. func NewConfig() *Config { - cfg := &Config{} + cfg := &Config{ + BlockWriter: blockwriter.Config{ + TableNum: defaultTableNum, + Concurrency: defaultConcurrency, + BatchSize: defaultBatchSize, + RawSize: defaultRawSize, + }, + } flag.StringVar(&cfg.configFile, "config", "", "Config file") flag.StringVar(&cfg.LogDir, "log-dir", "/logDir", "log directory") flag.IntVar(&cfg.FaultTriggerPort, "fault-trigger-port", 23332, "the http port of fault trigger service") diff --git a/tests/pkg/blockwriter/blockwriter.go b/tests/pkg/blockwriter/blockwriter.go index f7604dc9b4..7331a480ea 100644 --- a/tests/pkg/blockwriter/blockwriter.go +++ b/tests/pkg/blockwriter/blockwriter.go @@ -41,15 +41,17 @@ type BlockWriterCase struct { isInit uint32 stopChan chan struct{} + ClusterName string + sync.RWMutex } // Config defines the config of BlockWriterCase type Config struct { - TableNum int - Concurrency int - BatchSize int - RawSize int + TableNum int `yaml:"table_num" json:"table_num"` + Concurrency int `yaml:"concurrency" json:"concurrency"` + BatchSize int `yaml:"batch_size" json:"batch_size"` + RawSize int `yaml:"raw_size" json:"raw_size"` } type blockWriter struct { @@ -90,7 +92,7 @@ func (c *BlockWriterCase) newBlockWriter() *blockWriter { func (c *BlockWriterCase) generateQuery(ctx context.Context, queryChan chan []string, wg *sync.WaitGroup) { defer func() { - glog.Infof("[%s] [action: generate Query] stopped", c) + glog.Infof("[%s] [%s] [action: generate Query] stopped", c, c.ClusterName) wg.Done() }() @@ -121,7 +123,7 @@ func (c *BlockWriterCase) generateQuery(ctx context.Context, queryChan chan []st if len(queryChan) < queryChanSize { queryChan <- querys } else { - glog.Infof("[%s] [action: generate Query] query channel is full, sleep 10 seconds", c) + glog.Infof("[%s] [%s] [action: generate Query] query channel is full, sleep 10 seconds", c, c.ClusterName) util.Sleep(ctx, 10*time.Second) } } @@ -131,7 +133,7 @@ func (c *BlockWriterCase) generateQuery(ctx context.Context, queryChan chan []st func (bw *blockWriter) batchExecute(db *sql.DB, query string) error { _, err := db.Exec(query) if err != nil { - glog.V(4).Infof("[block_writer] exec sql [%s] failed, err: %v", query, err) + glog.V(4).Infof("[%s] exec sql [%s] failed, err: %v", query, err) return err } @@ -169,10 +171,10 @@ func (bw *blockWriter) run(ctx context.Context, db *sql.DB, queryChan chan []str // Initialize inits case func (c *BlockWriterCase) initialize(db *sql.DB) error { - glog.Infof("[%s] start to init...", c) + glog.Infof("[%s] [%s] start to init...", c, c.ClusterName) defer func() { atomic.StoreUint32(&c.isInit, 1) - glog.Infof("[%s] init end...", c) + glog.Infof("[%s] [%s] init end...", c, c.ClusterName) }() for i := 0; i < c.cfg.TableNum; i++ { @@ -210,14 +212,14 @@ func (c *BlockWriterCase) initialize(db *sql.DB) error { // Start starts to run cases func (c *BlockWriterCase) Start(db *sql.DB) error { if !atomic.CompareAndSwapUint32(&c.isRunning, 0, 1) { - err := fmt.Errorf("[%s] is running, you can't start it again", c) + err := fmt.Errorf("[%s] [%s] is running, you can't start it again", c, c.ClusterName) glog.Error(err) return nil } defer func() { c.RLock() - glog.Infof("[%s] stopped", c) + glog.Infof("[%s] [%s] stopped", c, c.ClusterName) atomic.SwapUint32(&c.isRunning, 0) }() @@ -227,7 +229,7 @@ func (c *BlockWriterCase) Start(db *sql.DB) error { } } - glog.Infof("[%s] start to execute case...", c) + glog.Infof("[%s] 

 var wg sync.WaitGroup
From f3921951b66ca6f547a066222770bfe1362555fa Mon Sep 17 00:00:00 2001
From: Aylei
Date: Tue, 16 Apr 2019 21:06:39 +0800
Subject: [PATCH 09/14] Remove error merged file

Signed-off-by: Aylei

---
 docs/local-stability-test.md | 39 ------------------------------------
 1 file changed, 39 deletions(-)
 delete mode 100644 docs/local-stability-test.md

diff --git a/docs/local-stability-test.md b/docs/local-stability-test.md
deleted file mode 100644
index dec4ee3880..0000000000
--- a/docs/local-stability-test.md
+++ /dev/null
@@ -1,39 +0,0 @@
-# How to: run stability test in your local environment
-
-Deploy & witness flow can be tedious when developing stability-test, this document introduces how to run stability-test out of the cluster (usually your local machine) while still operating the remote cluster.
-
-### TL;DR:
-```shell
-$ telepresence --new-deployment ${POD_NAME}
-$ go build -o stability ./tests/cmd/stability/main.go
-$ ./stability --operator-repo-dir=${ARBITRARY_EMPTY_DIR_TO_CLONE_OPERATOR_REPO}
-```
-
-### Explained
-
-Generally we have three problems to solve:
-
-1. **Out of cluster client**: We now try to load configs in the following order:
-    * if `kubeconfig` command line option provided, use it
-    * if `KUBECONFIG` env variable set, use it
-    * try loading `InClusterConfig()`
-    * if no `InClusterConfig()` provided, try loading kubeconfig file from default location (`~/.kube/config`)
-so you will typically get the right client config if you have a proper `~/.kube/config` in your local environment.
-2. **Privilege issue**: If you don't want to or cannot run the stability test with root privileges, change the working dir or create it in advance:
-    * the operator repo dir can be overridden by the option `--operator-repo-dir=xxxx`, but the helm dir must be created manually.
-```shell
-# helm dir
-$ mkdir /charts
-$ chmod 777 /charts
-# operator repo dir if you don't set the command line option
-$ mkdir /tidb-operator
-$ chmod 777 /tidb-operator
-```
-3. **DNS and network issue**: Two-way proxying using Telepresence. We cannot easily resolve cluster DNS names or access cluster IPs from outside, so `telepresence` helps: it creates a proxy pod in the cluster and opens a VPN connection to the Kubernetes cluster via this pod. Just run ([full documentation](https://www.telepresence.io/reference/install)):
-```shell
-$ brew cask install osxfuse
-$ brew install datawire/blackbird/telepresence
-$ telepresence --new-deployment ${POD_NAME}
-```
-**PS**: If you cannot resolve cluster DNS names after setup, try clearing the DNS cache.
-**PPS**: Typically you can't use Telepresence VPN mode with other VPNs (of course SSR is ok).
\ No newline at end of file
From 153f831207425ceef0d0140314f6fe72ade301e8 Mon Sep 17 00:00:00 2001
From: Aylei
Date: Tue, 16 Apr 2019 22:23:06 +0800
Subject: [PATCH 10/14] Reduce default concurrency

Signed-off-by: Aylei

---
 tests/backup/backupcase.go | 14 ++++++++------
 tests/config.go | 2 +-
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/tests/backup/backupcase.go b/tests/backup/backupcase.go
index eb335504d5..f123cb3c49 100644
--- a/tests/backup/backupcase.go
+++ b/tests/backup/backupcase.go
@@ -37,6 +37,14 @@ func NewBackupCase(operator tests.OperatorActions, srcCluster *tests.TidbCluster

 func (bc *BackupCase) Run() error {

+ // pause the write pressure during backup and resume it when the backup is done
+ defer func() {
+ go func() {
+ if err := bc.operator.BeginInsertDataTo(bc.srcCluster); err != nil {
+ glog.Errorf("cluster:[%s] begin insert data failed,error: %v", bc.srcCluster.ClusterName, err)
+ }
+ }()
+ }()
 err := bc.operator.StopInsertDataTo(bc.srcCluster)
 if err != nil {
 glog.Errorf("cluster:[%s] stop insert data failed,error: %v", bc.srcCluster.ClusterName, err)
@@ -120,12 +128,6 @@ func (bc *BackupCase) Run() error {
 return fmt.Errorf("cluster:[%s] the src cluster data[%d] is not equals des cluster data[%d]", bc.srcCluster.FullName(), srcCount, desCount)
 }

- err = bc.operator.BeginInsertDataTo(bc.srcCluster)
- if err != nil {
- glog.Errorf("cluster:[%s] begin insert data failed,error: %v", bc.srcCluster.ClusterName, err)
- return err
- }
-
 return nil
 }

diff --git a/tests/config.go b/tests/config.go
index e66b844572..b2e28b58bd 100644
--- a/tests/config.go
+++ b/tests/config.go
@@ -13,7 +13,7 @@ import (

 const (
 defaultTableNum int = 64
- defaultConcurrency = 512
+ defaultConcurrency = 128
 defaultBatchSize = 100
 defaultRawSize = 100
 )
From e0717e8a2c1bf5824b08421f6daa5422bcdc16b8 Mon Sep 17 00:00:00 2001
From: Aylei
Date: Tue, 16 Apr 2019 23:41:00 +0800
Subject: [PATCH 11/14] Post stability test events annotation to grafana

Signed-off-by: Aylei

---
 tests/actions.go | 54 ++++++++++++++++++++++++++++
 tests/failover.go | 1 +
 tests/fault.go | 2 --
 tests/pkg/metrics/annotation_util.go | 12 +++----
 4 files changed, 61 insertions(+), 8 deletions(-)

diff --git a/tests/actions.go b/tests/actions.go
index c8566ae348..78b612a79f 100644
--- a/tests/actions.go
+++ b/tests/actions.go
@@ -17,6 +17,7 @@ import (
 "database/sql"
 "encoding/json"
 "fmt"
+ "github.com/pingcap/tidb-operator/tests/pkg/metrics"
 "io/ioutil"
 "net/http"
 "net/url"
@@ -66,6 +67,8 @@ const (
 getBackupDirPodName = "get-backup-dir"
 grafanaUsername = "admin"
 grafanaPassword = "admin"
+ metricsPort = 8090
+ stabilityTestTag = "stability"
 )

 type OperatorActions interface {
@@ -119,6 +122,8 @@ type operatorActions struct {
 kubeCli kubernetes.Interface
 pdControl controller.PDControlInterface
 cfg *Config
+
+ grafanaClient *metrics.Client
 }

 var _ = OperatorActions(&operatorActions{})
@@ -248,6 +253,7 @@ func (oa *operatorActions) DeployOperator(info *OperatorConfig) error {
 info.Namespace,
 info.OperatorHelmSetString(nil))
 glog.Info(cmd)
+
 res, err := exec.Command("/bin/sh", "-c", cmd).CombinedOutput()
 if err != nil {
 return fmt.Errorf("failed to deploy operator: %v, %s", err, string(res))
@@ -294,6 +300,7 @@ func (oa *operatorActions) UpgradeOperator(info *OperatorConfig) error {

 func (oa *operatorActions) DeployTidbCluster(info *TidbClusterConfig) error {
 glog.Infof("begin to deploy tidb cluster cluster[%s] namespace[%s]", info.ClusterName, info.Namespace)
+ oa.emitEvent(info, "DeployTidbCluster")

 namespace := &corev1.Namespace{
 ObjectMeta: 
metav1.ObjectMeta{ @@ -332,6 +339,7 @@ func (oa *operatorActions) DeployTidbClusterOrDie(info *TidbClusterConfig) { func (oa *operatorActions) CleanTidbCluster(info *TidbClusterConfig) error { glog.Infof("cleaning tidbcluster %s/%s", info.Namespace, info.ClusterName) + oa.emitEvent(info, "CleanTidbCluster") charts := []string{ info.ClusterName, @@ -474,6 +482,8 @@ func (oa *operatorActions) CheckTidbClusterStatusOrDie(info *TidbClusterConfig) } func (oa *operatorActions) BeginInsertDataTo(info *TidbClusterConfig) error { + oa.emitEvent(info, fmt.Sprintf("BeginInsertData: concurrency: %d", oa.cfg.BlockWriter.Concurrency)) + dsn := getDSN(info.Namespace, info.ClusterName, "test", info.Password) glog.Infof("[%s] [%s] open TiDB connections, concurrency: %d", info.blockWriter, info.ClusterName, oa.cfg.BlockWriter.Concurrency) @@ -493,6 +503,8 @@ func (oa *operatorActions) BeginInsertDataToOrDie(info *TidbClusterConfig) { } func (oa *operatorActions) StopInsertDataTo(info *TidbClusterConfig) error { + oa.emitEvent(info, "StopInsertData") + info.blockWriter.Stop() return nil } @@ -509,6 +521,8 @@ func chartPath(name string, tag string) string { } func (oa *operatorActions) ScaleTidbCluster(info *TidbClusterConfig) error { + oa.emitEvent(info, fmt.Sprintf("ScaleTidbCluster")) + cmd := fmt.Sprintf("helm upgrade %s %s --set-string %s", info.ClusterName, chartPath("tidb-cluster", info.OperatorTag), info.TidbClusterHelmSetString(nil)) glog.Info("[SCALE] " + cmd) @@ -583,6 +597,8 @@ func (oa *operatorActions) CheckScaledCorrectly(info *TidbClusterConfig, podUIDs } func (oa *operatorActions) UpgradeTidbCluster(info *TidbClusterConfig) error { + oa.emitEvent(info, "UpgradeTidbCluster") + cmd := fmt.Sprintf("helm upgrade %s %s --set-string %s", info.ClusterName, chartPath("tidb-cluster", info.OperatorTag), info.TidbClusterHelmSetString(nil)) glog.Info("[UPGRADE] " + cmd) @@ -1316,6 +1332,16 @@ func (oa *operatorActions) checkGrafanaData(clusterInfo *TidbClusterConfig) erro if data.Status != "success" || len(data.Data.Result) < 1 { return fmt.Errorf("invalid response: status: %s, result: %v", data.Status, data.Data.Result) } + + // Grafana ready, init grafana client + if oa.grafanaClient == nil { + grafanaUrl := fmt.Sprintf("http://%s.%s:3000", svcName, ns) + client, err := metrics.NewClient(grafanaUrl, grafanaUsername, grafanaPassword, metricsPort) + if err != nil { + return err + } + oa.grafanaClient = client + } return nil } @@ -1357,6 +1383,7 @@ func (oa *operatorActions) checkoutTag(tagName string) error { } func (oa *operatorActions) DeployAdHocBackup(info *TidbClusterConfig) error { + oa.emitEvent(info, "DeployAdHocBackup") glog.Infof("begin to deploy adhoc backup cluster[%s] namespace[%s]", info.ClusterName, info.Namespace) sets := map[string]string{ @@ -1408,6 +1435,7 @@ func (oa *operatorActions) CheckAdHocBackup(info *TidbClusterConfig) error { } func (oa *operatorActions) Restore(from *TidbClusterConfig, to *TidbClusterConfig) error { + oa.emitEvent(to, fmt.Sprintf("RestoreBackup: source: %s", from.ClusterName)) glog.Infof("begin to deploy restore cluster[%s] namespace[%s]", from.ClusterName, from.Namespace) sets := map[string]string{ @@ -1552,6 +1580,7 @@ func releaseIsExist(err error) bool { } func (oa *operatorActions) DeployScheduledBackup(info *TidbClusterConfig) error { + oa.emitEvent(info, "DeploySchedulerBackup") glog.Infof("begin to deploy scheduled backup") cron := fmt.Sprintf("'*/1 * * * *'") @@ -1740,6 +1769,7 @@ func (info *TidbClusterConfig) FullName() string { } func (oa 
*operatorActions) DeployIncrementalBackup(from *TidbClusterConfig, to *TidbClusterConfig) error {
+ oa.emitEvent(from, fmt.Sprintf("DeployIncrementalBackup: slave: %s", to.ClusterName))
 glog.Infof("begin to deploy incremental backup cluster[%s] namespace[%s]", from.ClusterName, from.Namespace)

 sets := map[string]string{
@@ -1906,3 +1936,27 @@ func (oa *operatorActions) drainerHealth(info *TidbClusterConfig, hostName strin
 }
 return len(healths.PumpPos) > 0 && healths.Synced
 }
+
+func (oa *operatorActions) emitEvent(info *TidbClusterConfig, event string) {
+ if oa.grafanaClient == nil {
+ glog.V(4).Infof("cluster:[%s] grafana client not ready, skip recording event %s.",
+ info.ClusterName, event)
+ return
+ }
+
+ anno := metrics.Annotation{
+ Text: event,
+ TimestampInMilliSec: time.Now().UnixNano() / int64(time.Millisecond),
+
+ Tags: []string{
+ stabilityTestTag,
+ fmt.Sprintf("cluster-%s", info.ClusterName),
+ fmt.Sprintf("ns-%s", info.Namespace),
+ },
+ }
+ go func() {
+ if err := oa.grafanaClient.AddAnnotation(anno); err != nil {
+ glog.Errorf("cluster:[%s] error recording event %s, reason: %v", info.ClusterName, event, err)
+ }
+ }()
+}
diff --git a/tests/failover.go b/tests/failover.go
index 20d7538ce4..ec9e45190c 100644
--- a/tests/failover.go
+++ b/tests/failover.go
@@ -89,6 +89,7 @@ func (oa *operatorActions) TruncateSSTFileThenCheckFailover(info *TidbClusterCon
 return errors.New("failed to get container status from tikv pod")
 }

+ oa.emitEvent(info, fmt.Sprintf("TruncateSSTFile: tikv: %s", store.PodName))
 // restart tikv to ensure sst files
 err = tikvOps.KillProcess(info.Namespace, store.PodName, "tikv", 1, syscall.SIGTERM)
 if err != nil {
diff --git a/tests/fault.go b/tests/fault.go
index d1b2862507..9857c899bd 100644
--- a/tests/fault.go
+++ b/tests/fault.go
@@ -165,8 +165,6 @@ func (fa *faultTriggerActions) StartNode(physicalNode string, node string) error
 return err
 }

- glog.Infof("%+v", vms)
-
 for _, vm := range vms {
 if vm.IP == node && vm.Status == "running" {
 return nil
diff --git a/tests/pkg/metrics/annotation_util.go b/tests/pkg/metrics/annotation_util.go
index 661ffcc121..b45d801a88 100644
--- a/tests/pkg/metrics/annotation_util.go
+++ b/tests/pkg/metrics/annotation_util.go
@@ -28,7 +28,7 @@ import (
 )

 //Client request grafana API on a set of resource paths.
-type client struct {
+type Client struct {
 // base is the root URL for all invocations of the client
 baseUrl url.URL
 client *http.Client
@@ -52,7 +52,7 @@ type AnnotationOptions struct {

 //NewClient creats a new grafanaClient. This client performs rest functions
 //such as Get, Post on specified paths.
-func NewClient(grafanaUrl string, userName string, password string, prometheusExporterPort int) (*client, error) {
+func NewClient(grafanaUrl string, userName string, password string, prometheusExporterPort int) (*Client, error) {
 u, err := url.Parse(grafanaUrl)
 if err != nil {
 return nil, err
@@ -60,7 +60,7 @@ func NewClient(grafanaUrl string, userName string, password string, prometheusEx
 initFunc(prometheusExporterPort)
 u.User = url.UserPassword(userName, password)
- return &client{
+ return &Client{
 baseUrl: *u,
 client: &http.Client{},
 }, nil
@@ -110,7 +110,7 @@ func initErrorMetric() prometheus.Counter {

 //IncreErrorCountWithAnno increments the errorcount by 1,
 //and add the annotation to grafanan. 
-func (cli *client) AddAnnotation(annotation Annotation) error {
+func (cli *Client) AddAnnotation(annotation Annotation) error {
 body, err := annotation.getBody()
 if err != nil {
 return fmt.Errorf("create request body faield, %v", err)
@@ -136,11 +136,11 @@ func (cli *client) AddAnnotation(annotation Annotation) error {
 return nil
 }

-func (cli *client) IncrErrorCount() {
+func (cli *Client) IncrErrorCount() {
 counterMetric.Inc()
 }

-func (cli *client) getAnnotationPath() string {
+func (cli *Client) getAnnotationPath() string {
 u := cli.baseUrl
 u.Path = path.Join(cli.baseUrl.Path, annotationSubPath)
 return u.String()
From ada75fdfad177af7bd43c898914d7312f4f59491 Mon Sep 17 00:00:00 2001
From: Aylei
Date: Wed, 17 Apr 2019 16:42:15 +0800
Subject: [PATCH 12/14] Check specific store status in TruncateTiKVSST case

Signed-off-by: Aylei

---
 tests/failover.go | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/failover.go b/tests/failover.go
index ec9e45190c..6e2eb2f84e 100644
--- a/tests/failover.go
+++ b/tests/failover.go
@@ -44,7 +44,6 @@ func (oa *operatorActions) TruncateSSTFileThenCheckFailover(info *TidbClusterCon
 return cnt
 }

- origFailures := len(tc.Status.TiKV.FailureStores)
 origUps := countUpStores(tc)

 // checkout pd config
@@ -135,8 +134,9 @@ func (oa *operatorActions) TruncateSSTFileThenCheckFailover(info *TidbClusterCon
 return tikvOps.PollTiDBCluster(info.Namespace, info.ClusterName,
 func(tc *v1alpha1.TidbCluster, err error) (bool, error) {
- glog.Infof("check failure stores: current=%d origin=%d", len(tc.Status.TiKV.FailureStores), origFailures)
- if len(tc.Status.TiKV.FailureStores) <= origFailures {
+ _, ok := tc.Status.TiKV.FailureStores[store.ID]
+ glog.Infof("check if target store failed: %t", ok)
+ if !ok {
 return false, nil
 }
 ups := countUpStores(tc)
From c1c0787550dfe5b520d3145a9090d3c824bb6f7d Mon Sep 17 00:00:00 2001
From: Aylei
Date: Mon, 22 Apr 2019 17:05:58 +0800
Subject: [PATCH 13/14] Move grafana client to per tidb cluster context

Signed-off-by: Aylei

---
 tests/actions.go | 17 ++++++++---------
 tests/pkg/metrics/annotation_util.go | 18 ++++++++++++------
 2 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/tests/actions.go b/tests/actions.go
index a6b8cc1638..cdb23f8a0a 100644
--- a/tests/actions.go
+++ b/tests/actions.go
@@ -121,8 +121,6 @@ type operatorActions struct {
 kubeCli kubernetes.Interface
 pdControl controller.PDControlInterface
 cfg *Config
-
- grafanaClient *metrics.Client
 }

 var _ = OperatorActions(&operatorActions{})
@@ -159,6 +157,7 @@ type TidbClusterConfig struct {
 BackupSecretName string

 BlockWriteConfig blockwriter.Config
+ GrafanaClient *metrics.Client
 }

 func (tc *TidbClusterConfig) BackupHelmSetString(m map[string]string) string {
@@ -1329,14 +1328,14 @@ func (oa *operatorActions) checkGrafanaData(clusterInfo *TidbClusterConfig) erro
 return fmt.Errorf("invalid response: status: %s, result: %v", data.Status, data.Data.Result)
 }

- // Grafana ready, init grafana client
- if oa.grafanaClient == nil {
+ // Grafana ready, init grafana client, no more sync logic because race condition is okay here
+ if clusterInfo.GrafanaClient == nil {
 grafanaUrl := fmt.Sprintf("http://%s.%s:3000", svcName, ns)
 client, err := metrics.NewClient(grafanaUrl, grafanaUsername, grafanaPassword, metricsPort)
 if err != nil {
 return err
 }
- oa.grafanaClient = client
+ clusterInfo.GrafanaClient = client
 }
 return nil
 }
@@ -1934,7 +1933,7 @@ func (oa *operatorActions) drainerHealth(info *TidbClusterConfig, hostName strin
 }

 func (oa 
*operatorActions) emitEvent(info *TidbClusterConfig, event string) {
- if oa.grafanaClient == nil {
+ if info.GrafanaClient == nil {
 glog.V(4).Infof("cluster:[%s] grafana client not ready, skip recording event %s.",
 info.ClusterName, event)
 return
@@ -1950,9 +1949,9 @@ func (oa *operatorActions) emitEvent(info *TidbClusterConfig, event string) {
 fmt.Sprintf("ns-%s", info.Namespace),
 },
 }
- go func() {
- if err := oa.grafanaClient.AddAnnotation(anno); err != nil {
+ go func(anno metrics.Annotation) {
+ if err := info.GrafanaClient.AddAnnotation(anno); err != nil {
 glog.Errorf("cluster:[%s] error recording event %s, reason: %v", info.ClusterName, event, err)
 }
- }()
+ }(anno)
 }
diff --git a/tests/pkg/metrics/annotation_util.go b/tests/pkg/metrics/annotation_util.go
index b45d801a88..d7ea217beb 100644
--- a/tests/pkg/metrics/annotation_util.go
+++ b/tests/pkg/metrics/annotation_util.go
@@ -19,6 +19,7 @@ import (
 "fmt"
 "github.com/prometheus/client_golang/prometheus"
 "github.com/prometheus/client_golang/prometheus/promhttp"
+ "io/ioutil"
 "net"
 "net/http"
 "net/url"
@@ -44,10 +45,10 @@ type Annotation struct {

 //AnnotationOptions is the query options to a standard REST list call.
 type AnnotationOptions struct {
- DashboardId int `json:"dashboardId, omitempty"`
- PanelId int `json:"panelId, omitempty"`
- IsRegin bool `json:"isRegion, omitempty"`
- TimeEnd int64 `json:"timeEnd, omitempty"`
+ DashboardId int `json:"dashboardId,omitempty"`
+ PanelId int `json:"panelId,omitempty"`
+ IsRegin bool `json:"isRegion,omitempty"`
+ TimeEnd int64 `json:"timeEnd,omitempty"`
 }

 //NewClient creats a new grafanaClient. This client performs rest functions
@@ -123,8 +124,8 @@ func (cli *Client) AddAnnotation(annotation Annotation) error {
 req.Header.Add("Accept", "application/json, text/plain, */*")
 req.Header.Add("Content-Type", "application/json;charset=UTF-8")

- resp, error := cli.client.Do(req)
- if error != nil {
+ resp, err := cli.client.Do(req)
+ if err != nil {
 return fmt.Errorf("add annotation faield, %v", err)
 }
 defer resp.Body.Close()

 if resp.StatusCode != http.StatusOK {
 return fmt.Errorf("add annotation faield, statusCode=%v", resp.Status)
 }
+ all, err := ioutil.ReadAll(resp.Body)
+ if err != nil {
+ return err
+ }
+ fmt.Println(string(all))
 return nil
 }
From 54d0a23c868830b6afdd9c83fa569851ff25acf7 Mon Sep 17 00:00:00 2001
From: Aylei
Date: Mon, 22 Apr 2019 17:35:37 +0800
Subject: [PATCH 14/14] Annotation events documentation

Signed-off-by: Aylei

---
 docs/stability-test-cookbook.md | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/docs/stability-test-cookbook.md b/docs/stability-test-cookbook.md
index 612dfa10b4..3b4b4133aa 100644
--- a/docs/stability-test-cookbook.md
+++ b/docs/stability-test-cookbook.md
@@ -2,6 +2,8 @@

 > Important notes: this guide is under heavy development and has complicated environment prerequisites; things are likely to change in the future. 
+## Run stability test
+
 The following commands assume you are in the `tidb-operator` working directory:
 ```shell
 # image will be tagged as YOUR_DOCKER_REGISTRY/pingcap/tidb-operator-stability-test:latest
 $ vi ./tests/manifests/stability/stability.yaml
 $ kubectl apply -f ./tests/manifests/stability/stability.yaml
 ```

+## Get test report
+
+```shell
+$ kubectl -n tidb-operator-stability logs tidb-operator-stability
+```
+
+## Inspect overall cluster stats under various operations
+
+It is useful to inspect how the cluster performs under various kinds of operations or faults; you can access such information from the Grafana dashboard of each cluster:
+
+```shell
+$ kubectl port-forward -n ${CLUSTER_NAMESPACE} svc/${CLUSTER_GRAFANA_SERVICE} 3000:3000
+```
+
+Navigate to [localhost:3000](http://localhost:3000) to view the dashboards.
+
+Optionally, you can view the event annotations like `scale cluster`, `upgrade cluster`, `vm crash` by querying annotations in Grafana to get a better understanding of the system. Follow this step-by-step guide:
+
+1. click "Dashboard Setting" in the navigation bar
+2. click the big "Make Editable" button
+3. click "Annotations" in the sidebar
+4. click "Add Annotation Query"
+5. enter a name you like
+6. switch "Match Any" on
+7. add "stability" tag
+8. click "add"
+9. go back to the dashboard and you will see the annotations marking the cluster events
+
+
 ## Alternative: run stability test in your local environment

 Deploy & witness flow can be tedious when developing stability-test, this document introduces how to run stability-test out of the cluster (usually your local machine) while still operating the remote cluster.