diff --git a/hack/e2e.sh b/hack/e2e.sh index 30cc4af4aa..12d617113e 100755 --- a/hack/e2e.sh +++ b/hack/e2e.sh @@ -73,6 +73,7 @@ Environments: AWS_ACCESS_KEY_ID (eks only) the aws access key id AWS_SECRET_ACCESS_KEY (eks only) the aws secret access key AWS_REGION (eks only) the aws region + AWS_ZONE (eks only) the aws zone GINKGO_NODES ginkgo nodes to run specs, defaults: 1 GINKGO_PARALLEL if set to `y`, will run specs in parallel, the number of nodes will be the number of cpus GINKGO_NO_COLOR if set to `y`, suppress color output in default reporter @@ -197,6 +198,7 @@ GCP_MACHINE_TYPE=${GCP_MACHINE_TYPE:-n1-standard-4} AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID:-} AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY:-} AWS_REGION=${AWS_REGION:-} +AWS_ZONE=${AWS_ZONE:-} KUBE_VERSION=${KUBE_VERSION:-v1.12.10} KUBE_WORKERS=${KUBE_WORKERS:-3} DOCKER_IO_MIRROR=${DOCKER_IO_MIRROR:-} @@ -223,6 +225,7 @@ echo "GCP_ZONE: $GCP_ZONE" # echo "AWS_ACCESS_KEY_ID: $AWS_ACCESS_KEY_ID" # echo "AWS_SECRET_ACCESS_KEY: $AWS_SECRET_ACCESS_KEY" echo "AWS_REGION: $AWS_REGION" +echo "AWS_ZONE: $AWS_ZONE" echo "KUBE_VERSION: $KUBE_VERSION" echo "KUBE_WORKERS: $KUBE_WORKERS" echo "DOCKER_IO_MIRROR: $DOCKER_IO_MIRROR" @@ -465,10 +468,14 @@ EOF ) fi elif [ "$PROVIDER" == "eks" ]; then + export KUBE_SSH_USER=ec2-user hack::ensure_aws_k8s_tester if [ -n "$AWS_REGION" ]; then aws configure set default.region "$AWS_REGION" fi + if [ -z "$AWS_ZONE" ]; then + AWS_ZONE=${AWS_REGION}a + fi if [ -n "$AWS_ACCESS_KEY_ID" ]; then aws configure set aws_access_key_id "$AWS_ACCESS_KEY_ID" fi @@ -505,6 +512,10 @@ export GCP_PROJECT export GCP_REGION export GCP_ZONE export GCP_CREDENTIALS +export AWS_ACCESS_KEY_ID +export AWS_SECRET_ACCESS_KEY +export AWS_REGION +export AWS_ZONE export IMAGE_TAG export SKIP_GINKGO export SKIP_IMAGE_LOAD diff --git a/hack/run-e2e.sh b/hack/run-e2e.sh index 0258eb1e39..7542cba0ef 100755 --- a/hack/run-e2e.sh +++ b/hack/run-e2e.sh @@ -29,6 +29,7 @@ GCP_REGION=${GCP_REGION:-} GCP_ZONE=${GCP_ZONE:-} GCP_CREDENTIALS=${GCP_CREDENTIALS:-} GCP_SDK=${GCP_SDK:-/google-cloud-sdk} +KUBE_SSH_USER=${KUBE_SSH_USER:-vagrant} IMAGE_TAG=${IMAGE_TAG:-} SKIP_IMAGE_LOAD=${SKIP_IMAGE_LOAD:-} TIDB_OPERATOR_IMAGE=${TIDB_OPERATOR_IMAGE:-localhost:5000/pingcap/tidb-operator:latest} @@ -51,6 +52,7 @@ if [ -z "$KUBECONFIG" ]; then exit 1 fi +echo "KUBE_SSH_USER: $KUBE_SSH_USER" echo "TIDB_OPERATOR_IMAGE: $TIDB_OPERATOR_IMAGE" echo "TIDB_BACKUP_MANAGER_IMAGE: $TIDB_BACKUP_MANAGER_IMAGE" echo "E2E_IMAGE: $E2E_IMAGE" @@ -123,67 +125,15 @@ for ((i = 1; i <= 32; i++)) { EOF done elif [ "$PROVIDER" == "gke" ]; then - # disks are created under /mnt/stateful_partition directory - # https://cloud.google.com/container-optimized-os/docs/concepts/disks-and-filesystem - for n in $($KUBECTL_BIN --context "$KUBECONTEXT" get nodes -ojsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}'); do - gcloud compute ssh e2e@$n --command 'sudo bash -c '"'"' -test -d /mnt/stateful_partition/disks || mkdir -p /mnt/stateful_partition/disks -df -h /mnt/stateful_partition/disks -test -d /mnt/disks || mkdir -p /mnt/disks -cd /mnt/disks -for ((i = 1; i <= 32; i++)) { - if [ ! -d vol$i ]; then - mkdir vol$i - fi - if ! mountpoint vol$i &>/dev/null; then - if [ ! 
-d /mnt/stateful_partition/disks/vol$i ]; then - mkdir /mnt/stateful_partition/disks/vol$i - fi - mount --bind /mnt/stateful_partition/disks/vol$i vol$i - fi -} -'"'" - done + echo "info: provider is $PROVIDER, skipped" elif [ "$PROVIDER" == "eks" ]; then - while IFS=$'\n' read -r line; do - read -r id dns <<< $line - echo "info: prepare disks on $dns" - ssh -T -o "StrictHostKeyChecking no" -i ~/.ssh/kube_aws_rsa ec2-user@$dns <<'EOF' -sudo bash -c ' -test -d /mnt/disks || mkdir -p /mnt/disks -df -h /mnt/disks -if mountpoint /mnt/disks &>/dev/null; then - echo "info: /mnt/disks is a mountpoint" -else - echo "info: /mnt/disks is not a mountpoint, creating local volumes on the rootfs" -fi -cd /mnt/disks -for ((i = 1; i <= 32; i++)) { - if [ ! -d vol$i ]; then - mkdir vol$i - fi - if ! mountpoint vol$i &>/dev/null; then - mount --bind vol$i vol$i - fi -} -echo "info: increase max open files for containers" -if ! grep -qF "OPTIONS" /etc/sysconfig/docker; then - echo 'OPTIONS="--default-ulimit nofile=1024000:1024000"' >> /etc/sysconfig/docker -fi -systemctl restart docker -' -EOF - done <<< "$(e2e::__eks_instances)" + echo "info: provider is $PROVIDER, skipped" fi echo "info: installing local-volume-provisioner" $KUBECTL_BIN --context $KUBECONTEXT apply -f ${ROOT}/manifests/local-dind/local-volume-provisioner.yaml e2e::__wait_for_ds kube-system local-volume-provisioner } -function e2e::__eks_instances() { - aws ec2 describe-instances --filter Name=tag:eks:cluster-name,Values=$CLUSTER --query 'Reservations[*].Instances[*].{InstanceId:InstanceId,PublicDnsName:PublicDnsName}' --output text -} - function e2e::__ecr_url() { local account_id=$(aws sts get-caller-identity --output text | awk '{print $1}') local region=$(aws configure get region) @@ -211,6 +161,13 @@ function e2e::setup_helm_server() { $HELM_BIN version } +# Used by non-kind providers to tag image with its id. This can force our e2e +# process to pull correct image even if IfNotPresent is used in an existing +# environment, e.g. testing in the same cluster. 
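+#
+# A minimal illustration (hypothetical image ID; real digests differ):
+#   $ docker image inspect -f '{{.Id}}' localhost:5000/pingcap/tidb-operator:latest
+#   sha256:9b7c3f0e12d4...
+# e2e::image_id_tag would print "9b7c3f0e12", so the retagged image becomes e.g.
+#   gcr.io/$GCP_PROJECT/tidb-operator:$CLUSTER-9b7c3f0e12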
+function e2e::image_id_tag() { + docker image inspect -f '{{.Id}}' "$1" | cut -d ':' -f 2 | head -c 10 +} + function e2e::image_load() { local images=( $TIDB_OPERATOR_IMAGE @@ -226,9 +183,9 @@ function e2e::image_load() { elif [ "$PROVIDER" == "gke" ]; then unset DOCKER_CONFIG # We don't need this and it may be read-only and fail the command to fail gcloud auth configure-docker - GCP_TIDB_OPERATOR_IMAGE=gcr.io/$GCP_PROJECT/tidb-operator:$CLUSTER-$IMAGE_TAG - GCP_TIDB_BACKUP_MANAGER_IMAGE=gcr.io/$GCP_PROJECT/tidb-backup-image:$CLUSTER-$IMAGE_TAG - GCP_E2E_IMAGE=gcr.io/$GCP_PROJECT/tidb-operator-e2e:$CLUSTER-$IMAGE_TAG + GCP_TIDB_OPERATOR_IMAGE=gcr.io/$GCP_PROJECT/tidb-operator:$CLUSTER-$(e2e::image_id_tag $TIDB_OPERATOR_IMAGE) + GCP_TIDB_BACKUP_MANAGER_IMAGE=gcr.io/$GCP_PROJECT/tidb-backup-image:$CLUSTER-$(e2e::image_id_tag $TIDB_BACKUP_MANAGER_IMAGE) + GCP_E2E_IMAGE=gcr.io/$GCP_PROJECT/tidb-operator-e2e:$CLUSTER-$(e2e::image_id_tag $E2E_IMAGE) docker tag $TIDB_OPERATOR_IMAGE $GCP_TIDB_OPERATOR_IMAGE docker tag $E2E_IMAGE $GCP_E2E_IMAGE docker tag $TIDB_BACKUP_MANAGER_IMAGE $GCP_TIDB_BACKUP_MANAGER_IMAGE @@ -253,9 +210,9 @@ function e2e::image_load() { local ecrURL=$(e2e::__ecr_url) echo "info: logging in $ecrURL" aws ecr get-login-password | docker login --username AWS --password-stdin $ecrURL - AWS_TIDB_OPERATOR_IMAGE=$ecrURL/e2e/tidb-operator:$CLUSTER-$IMAGE_TAG - AWS_TIDB_BACKUP_MANAGER_IMAGE=$ecrURL/e2e/tidb-backup-manager:$CLUSTER-$IMAGE_TAG - AWS_E2E_IMAGE=$ecrURL/e2e/tidb-operator-e2e:$CLUSTER-$IMAGE_TAG + AWS_TIDB_OPERATOR_IMAGE=$ecrURL/e2e/tidb-operator:$CLUSTER-$(e2e::image_id_tag $TIDB_OPERATOR_IMAGE) + AWS_TIDB_BACKUP_MANAGER_IMAGE=$ecrURL/e2e/tidb-backup-manager:$CLUSTER-$(e2e::image_id_tag $TIDB_BACKUP_MANAGER_IMAGE) + AWS_E2E_IMAGE=$ecrURL/e2e/tidb-operator-e2e:$CLUSTER-$(e2e::image_id_tag $E2E_IMAGE) docker tag $TIDB_OPERATOR_IMAGE $AWS_TIDB_OPERATOR_IMAGE docker tag $TIDB_BACKUP_MANAGER_IMAGE $AWS_TIDB_BACKUP_MANAGER_IMAGE docker tag $E2E_IMAGE $AWS_E2E_IMAGE @@ -363,16 +320,19 @@ docker_args=( -v $KUBECONFIG:/etc/kubernetes/admin.conf:ro --env KUBECONFIG=/etc/kubernetes/admin.conf --env KUBECONTEXT=$KUBECONTEXT + --env KUBE_SSH_USER=$KUBE_SSH_USER ) if [ "$PROVIDER" == "eks" ]; then e2e_args+=( --provider=aws - --gce-zone="${AWS_REGION}" + --gce-zone="${AWS_ZONE}" # reuse gce-zone to configure aws zone ) - # aws credential is required to get token for EKS docker_args+=( + # aws credential is required to get token for EKS -v $HOME/.aws:/root/.aws + # ~/.ssh/kube_aws_rsa must be mounted into e2e container to run ssh + -v $HOME/.ssh/kube_aws_rsa:/root/.ssh/kube_aws_rsa ) elif [ "$PROVIDER" == "gke" ]; then e2e_args+=( @@ -393,6 +353,8 @@ elif [ "$PROVIDER" == "gke" ]; then fi docker_args+=( -v ${GCP_SDK}:/google-cloud-sdk + # ~/.ssh/google_compute_engine must be mounted into e2e container to run ssh + -v $HOME/.ssh/google_compute_engine:/root/.ssh/google_compute_engine ) else e2e_args+=( diff --git a/manifests/gke/local-ssd-provision/local-ssd-provision.yaml b/manifests/gke/local-ssd-provision/local-ssd-provision.yaml index 18229b223d..0798ba0413 100644 --- a/manifests/gke/local-ssd-provision/local-ssd-provision.yaml +++ b/manifests/gke/local-ssd-provision/local-ssd-provision.yaml @@ -12,6 +12,7 @@ metadata: name: local-provisioner-config namespace: kube-system data: + setPVOwnerRef: "true" nodeLabelsForPV: | - kubernetes.io/hostname storageClassMap: | diff --git a/manifests/local-dind/local-volume-provisioner.yaml b/manifests/local-dind/local-volume-provisioner.yaml index 
bd8129ac17..ffc3c28342 100644
--- a/manifests/local-dind/local-volume-provisioner.yaml
+++ b/manifests/local-dind/local-volume-provisioner.yaml
@@ -12,6 +12,7 @@ metadata:
   name: local-provisioner-config
   namespace: kube-system
 data:
+  setPVOwnerRef: "true"
   nodeLabelsForPV: |
     - kubernetes.io/hostname
   storageClassMap: |
diff --git a/pkg/pdapi/pdapi.go b/pkg/pdapi/pdapi.go
index 1a3788079e..ab9bdb32c8 100644
--- a/pkg/pdapi/pdapi.go
+++ b/pkg/pdapi/pdapi.go
@@ -155,6 +155,8 @@ type PDClient interface {
 	SetStoreLabels(storeID uint64, labels map[string]string) (bool, error)
 	// DeleteStore deletes a TiKV store from cluster
 	DeleteStore(storeID uint64) error
+	// SetStoreState sets the store to the specified state.
+	SetStoreState(storeID uint64, state string) error
 	// DeleteMember deletes a PD member from cluster
 	DeleteMember(name string) error
 	// DeleteMemberByID deletes a PD member from cluster
@@ -403,6 +405,30 @@ func (pc *pdClient) DeleteStore(storeID uint64) error {
 	return fmt.Errorf("failed to delete store %d: %v", storeID, string(body))
 }
 
+// SetStoreState sets the store to the specified state.
+func (pc *pdClient) SetStoreState(storeID uint64, state string) error {
+	apiURL := fmt.Sprintf("%s/%s/%d/state?state=%s", pc.url, storePrefix, storeID, state)
+	req, err := http.NewRequest("POST", apiURL, nil)
+	if err != nil {
+		return err
+	}
+	res, err := pc.httpClient.Do(req)
+	if err != nil {
+		return err
+	}
+	defer httputil.DeferClose(res.Body)
+
+	if res.StatusCode == http.StatusOK || res.StatusCode == http.StatusNotFound {
+		return nil
+	}
+	body, err := ioutil.ReadAll(res.Body)
+	if err != nil {
+		return err
+	}
+
+	return fmt.Errorf("failed to set state of store %d: %v", storeID, string(body))
+}
+
 func (pc *pdClient) DeleteMemberByID(memberID uint64) error {
 	var exist bool
 	members, err := pc.GetMembers()
@@ -666,6 +692,7 @@ const (
 	GetTombStoneStoresActionType ActionType = "GetTombStoneStores"
 	GetStoreActionType           ActionType = "GetStore"
 	DeleteStoreActionType        ActionType = "DeleteStore"
+	SetStoreStateActionType      ActionType = "SetStoreState"
 	DeleteMemberByIDActionType   ActionType = "DeleteMemberByID"
 	DeleteMemberActionType       ActionType = "DeleteMember "
 	SetStoreLabelsActionType     ActionType = "SetStoreLabels"
@@ -790,6 +817,15 @@ func (pc *FakePDClient) DeleteStore(id uint64) error {
 	return nil
 }
 
+func (pc *FakePDClient) SetStoreState(id uint64, state string) error {
+	if reaction, ok := pc.reactions[SetStoreStateActionType]; ok {
+		action := &Action{ID: id}
+		_, err := reaction(action)
+		return err
+	}
+	return nil
+}
+
 func (pc *FakePDClient) DeleteMemberByID(id uint64) error {
 	if reaction, ok := pc.reactions[DeleteMemberByIDActionType]; ok {
 		action := &Action{ID: id}
diff --git a/tests/actions.go b/tests/actions.go
index a5c797a6fa..1b33a81572 100644
--- a/tests/actions.go
+++ b/tests/actions.go
@@ -294,6 +294,7 @@ type OperatorConfig struct {
 	ValidatingEnabled  bool
 	Cabundle           string
 	BackupImage        string
+	AutoFailover       *bool
 }
 
 type TidbClusterConfig struct {
@@ -408,7 +409,6 @@ func (oi *OperatorConfig) OperatorHelmSetString(m map[string]string) string {
 	set := map[string]string{
 		"operatorImage":                  oi.Image,
 		"tidbBackupManagerImage":         oi.BackupImage,
-		"controllerManager.autoFailover": "true",
 		"scheduler.logLevel":             "4",
 		"testMode":                       strconv.FormatBool(oi.TestMode),
 		"admissionWebhook.cabundle":      oi.Cabundle,
@@ -442,6 +442,9 @@
 	if oi.Enabled(features.AdvancedStatefulSet) {
 		set["advancedStatefulset.create"] = "true"
 	}
+	if oi.AutoFailover != nil {
+ set["controllerManager.autoFailover"] = strconv.FormatBool(*oi.AutoFailover) + } arr := make([]string, 0, len(set)) for k, v := range set { diff --git a/tests/e2e/e2e.go b/tests/e2e/e2e.go index 35831e8315..7df4fccf10 100644 --- a/tests/e2e/e2e.go +++ b/tests/e2e/e2e.go @@ -34,6 +34,7 @@ import ( "github.com/pingcap/tidb-operator/tests" e2econfig "github.com/pingcap/tidb-operator/tests/e2e/config" utilimage "github.com/pingcap/tidb-operator/tests/e2e/util/image" + utilnode "github.com/pingcap/tidb-operator/tests/e2e/util/node" v1 "k8s.io/api/core/v1" storagev1 "k8s.io/api/storage/v1" apiextensionsclientset "k8s.io/apiextensions-apiserver/pkg/client/clientset/clientset" @@ -122,6 +123,14 @@ func setupSuite() { e2elog.Logf("WARNING: Waiting for all daemonsets to be ready failed: %v", err) } + ginkgo.By("Initializing all nodes") + nodeList, err := c.CoreV1().Nodes().List(metav1.ListOptions{}) + framework.ExpectNoError(err) + for _, node := range nodeList.Items { + framework.Logf("Initializing node %q", node.Name) + framework.ExpectNoError(utilnode.InitNode(&node)) + } + // By using default storage class in GKE/EKS (aws), network attached storage // which be used and we must clean them later. // We set local-storage class as default for simplicity. diff --git a/tests/e2e/tidbcluster/stability.go b/tests/e2e/tidbcluster/stability.go index bb829a3830..d56659d85a 100644 --- a/tests/e2e/tidbcluster/stability.go +++ b/tests/e2e/tidbcluster/stability.go @@ -20,26 +20,35 @@ import ( "time" "github.com/onsi/ginkgo" + "github.com/onsi/gomega" asclientset "github.com/pingcap/advanced-statefulset/pkg/client/clientset/versioned" + "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1" "github.com/pingcap/tidb-operator/pkg/client/clientset/versioned" "github.com/pingcap/tidb-operator/pkg/label" "github.com/pingcap/tidb-operator/pkg/scheme" "github.com/pingcap/tidb-operator/tests" e2econfig "github.com/pingcap/tidb-operator/tests/e2e/config" utilimage "github.com/pingcap/tidb-operator/tests/e2e/util/image" + utilnode "github.com/pingcap/tidb-operator/tests/e2e/util/node" utilpod "github.com/pingcap/tidb-operator/tests/e2e/util/pod" "github.com/pingcap/tidb-operator/tests/e2e/util/portforward" + "github.com/pingcap/tidb-operator/tests/e2e/util/proxiedpdclient" utiltidb "github.com/pingcap/tidb-operator/tests/e2e/util/tidb" + utiltikv "github.com/pingcap/tidb-operator/tests/e2e/util/tikv" "github.com/pingcap/tidb-operator/tests/pkg/fixture" v1 "k8s.io/api/core/v1" apiextensionsclientset "k8s.io/apiextensions-apiserver/pkg/client/clientset/clientset" + apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/wait" clientset "k8s.io/client-go/kubernetes" restclient "k8s.io/client-go/rest" aggregatorclient "k8s.io/kube-aggregator/pkg/client/clientset_generated/clientset" "k8s.io/kubernetes/test/e2e/framework" + e2enode "k8s.io/kubernetes/test/e2e/framework/node" + "k8s.io/utils/pointer" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -89,7 +98,6 @@ var _ = ginkgo.Describe("[tidb-operator][Stability]", func() { } }) - // TODO generate more contexts for different operator values ginkgo.Context("operator with default values", func() { var ocfg *tests.OperatorConfig var oa tests.OperatorActions @@ -119,8 +127,6 @@ var _ = ginkgo.Describe("[tidb-operator][Stability]", func() { ginkgo.AfterEach(func() { ginkgo.By("Uninstall tidb-operator") oa.CleanOperatorOrDie(ocfg) - 
ginkgo.By("Uninstalling CRDs") - oa.CleanCRDOrDie() }) testCases := []struct { @@ -183,6 +189,329 @@ var _ = ginkgo.Describe("[tidb-operator][Stability]", func() { framework.ExpectEqual(err, wait.ErrWaitTimeout, "TiDB cluster is not affeteced") }) } + }) + + ginkgo.Context("operator with auto-failover disabled", func() { + var ocfg *tests.OperatorConfig + var oa tests.OperatorActions + var genericCli client.Client + + ginkgo.BeforeEach(func() { + ocfg = &tests.OperatorConfig{ + Namespace: ns, + ReleaseName: "operator", + Image: cfg.OperatorImage, + Tag: cfg.OperatorTag, + LogLevel: "4", + TestMode: true, + AutoFailover: pointer.BoolPtr(false), + } + oa = tests.NewOperatorActions(cli, c, asCli, aggrCli, apiExtCli, tests.DefaultPollInterval, ocfg, e2econfig.TestConfig, nil, fw, f) + ginkgo.By("Installing CRDs") + oa.CleanCRDOrDie() + oa.InstallCRDOrDie(ocfg) + ginkgo.By("Installing tidb-operator") + oa.CleanOperatorOrDie(ocfg) + oa.DeployOperatorOrDie(ocfg) + var err error + genericCli, err = client.New(config, client.Options{Scheme: scheme.Scheme}) + framework.ExpectNoError(err, "failed to create clientset") + }) + + ginkgo.AfterEach(func() { + ginkgo.By("Uninstall tidb-operator") + oa.CleanOperatorOrDie(ocfg) + }) + + // In this test, we demonstrate and verify the recover process when a + // node (and local storage on it) is permanently gone. + // + // In cloud, a node can be deleted manually or reclaimed by a + // controller (e.g. auto scaling group if ReplaceUnhealthy not + // suspended). Local storage on it will be permanently unaccessible. + // Manual intervention is required to recover from this situation. + // Basic steps will be: + // + // - for TiKV, delete associated store ID in PD + // - because we use network identity as store address, if we want to + // recover in place, we should delete the previous store at the same + // address. This requires us to set it to tombstone directly because + // the data is permanent lost, there is no way to delete it gracefully. + // - optionally, Advnaced StatefulSet can be used to recover with + // different network identity + // - for PD, like TiKV we must delete its member from the cluster + // - (EKS only) delete pvcs of failed pods + // - in EKS, failed pods on deleted node will be recreated because + // the node object is gone too (old pods is recycled by pod gc). But + // the newly created pods will be stuck at Pending state because + // associated PVs are invalid now. Pods will be recreated by + // tidb-operator again when we delete associated PVCs. New PVCs will + // be created by statefulset controller and pods will be scheduled to + // feasible nodes. + // - it's highly recommended to enable `setPVOwnerRef` in + // local-volume-provisioner, then orphan PVs will be garbaged + // collected and will not cause problem even if the name of deleted + // node is used again in the future. + // - (GKE only, fixed path) nothing need to do + // - Because the node name does not change, old PVs can be used. Note + // that `setPVOwnerRef` cannot be enabled because the node object + // could get deleted if it takes too long for the instance to + // recreate. + // - Optionally, you can deleted failed pods to make them to start + // soon. This is due to exponential crash loop back off. + // - (GKE only, unique paths) delete failed pods and associated PVCs/PVs + // - This is because even if the node name does not change, old PVs + // are invalid because unique volume paths are used. 
We must delete + // them all and wait for Kubernetes to rcreate and run again. + // - PVs must be deleted because the PVs are invalid and should not + // exist anymore. We can configure `setPVOwnerRef` to clean unused + // PVs when the node object is deleted, but the node object will not + // get deleted if the instance is recreated soon. + // + // Note that: + // - We assume local storage is used, otherwise PV can be re-attached + // the new node without problem. + // - PD and TiKV must have at least 3 replicas, otherwise one node + // deletion will cause permanent data loss and the cluster will be unrecoverable. + // - Of course, this process can be automated by implementing a + // controller integrated with cloud providers. It's outside the scope + // of tidb-operator now. + // - The same process can apply in bare-metal environment too when a + // machine or local storage is permanently gone. + // + // Differences between EKS and GKE: + // + // - In EKS, a new node object with different name will be created for + // the new machine. + // - In GKE (1.11+), the node object are no longer recreated on + // upgrade/repair even though the underlying instance is recreated and + // local disks are wiped. However, the node object could get deleted by + // cloud-controller-manager if it takes too long for the instance to + // recreate. + // + // Related issues: + // - https://github.com/pingcap/tidb-operator/issues/1546 + // - https://github.com/pingcap/tidb-operator/issues/408 + ginkgo.It("recover tidb cluster from node deletion", func() { + supportedProviders := sets.NewString("aws", "gke") + if !supportedProviders.Has(framework.TestContext.Provider) { + framework.Skipf("current provider is not supported list %v, skipping", supportedProviders.List()) + } + + ginkgo.By("Wait for all nodes are schedulable") + framework.ExpectNoError(framework.WaitForAllNodesSchedulable(c, framework.TestContext.NodeSchedulableTimeout)) + + ginkgo.By("Make sure we have at least 3 schedulable nodes") + nodeList := framework.GetReadySchedulableNodesOrDie(f.ClientSet) + gomega.Expect(len(nodeList.Items)).To(gomega.BeNumerically(">=", 3)) + + ginkgo.By("Deploy a test cluster with 3 pd and tikv replicas") + clusterName := "test" + tc := fixture.GetTidbCluster(ns, clusterName, utilimage.TiDBV3Version) + tc.Spec.PD.Replicas = 3 + tc.Spec.TiDB.Replicas = 1 + tc.Spec.TiKV.Replicas = 3 + err := genericCli.Create(context.TODO(), tc) + framework.ExpectNoError(err) + err = oa.WaitForTidbClusterReady(tc, 30*time.Minute, 15*time.Second) + framework.ExpectNoError(err) + + ginkgo.By("By using tidb-scheduler, 3 TiKV/PD replicas should be on different nodes") + allNodes := make(map[string]v1.Node) + for _, node := range nodeList.Items { + allNodes[node.Name] = node + } + allTiKVNodes := make(map[string]v1.Node) + allPDNodes := make(map[string]v1.Node) + listOptions := metav1.ListOptions{ + LabelSelector: labels.SelectorFromSet(label.New().Instance(clusterName).Labels()).String(), + } + podList, err := c.CoreV1().Pods(ns).List(listOptions) + framework.ExpectNoError(err) + for _, pod := range podList.Items { + if v, ok := pod.Labels[label.ComponentLabelKey]; !ok { + framework.Failf("pod %s/%s does not have component label key %q", pod.Namespace, pod.Name, label.ComponentLabelKey) + } else if v == label.PDLabelVal { + allPDNodes[pod.Name] = allNodes[pod.Spec.NodeName] + } else if v == label.TiKVLabelVal { + allTiKVNodes[pod.Name] = allNodes[pod.Spec.NodeName] + } else { + continue + } + } + 
gomega.Expect(len(allPDNodes)).To(gomega.BeNumerically("==", 3), "the number of pd nodes should be 3") + gomega.Expect(len(allTiKVNodes)).To(gomega.BeNumerically("==", 3), "the number of tikv nodes should be 3") + + ginkgo.By("Deleting a node") + var nodeToDelete *v1.Node + for _, node := range allTiKVNodes { + if nodeToDelete == nil { + nodeToDelete = &node + break + } + } + gomega.Expect(nodeToDelete).NotTo(gomega.BeNil()) + var pdPodsOnDeletedNode []v1.Pod + var tikvPodsOnDeletedNode []v1.Pod + var pvcNamesOnDeletedNode []string + for _, pod := range podList.Items { + if pod.Spec.NodeName == nodeToDelete.Name { + if v, ok := pod.Labels[label.ComponentLabelKey]; ok { + if v == label.PDLabelVal { + pdPodsOnDeletedNode = append(pdPodsOnDeletedNode, pod) + } else if v == label.TiKVLabelVal { + tikvPodsOnDeletedNode = append(tikvPodsOnDeletedNode, pod) + } + } + for _, volume := range pod.Spec.Volumes { + if volume.PersistentVolumeClaim != nil { + pvcNamesOnDeletedNode = append(pvcNamesOnDeletedNode, volume.PersistentVolumeClaim.ClaimName) + } + } + } + } + gomega.Expect(len(tikvPodsOnDeletedNode)).To(gomega.BeNumerically(">=", 1), "the number of affected tikvs must be equal or greater than 1") + err = framework.DeleteNodeOnCloudProvider(nodeToDelete) + framework.ExpectNoError(err, fmt.Sprintf("failed to delete node %q", nodeToDelete.Name)) + framework.Logf("Node %q deleted", nodeToDelete.Name) + + if framework.TestContext.Provider == "aws" { + // The node object will be gone with physical machine. + ginkgo.By(fmt.Sprintf("[AWS/EKS] Wait for the node object %q to be deleted", nodeToDelete.Name)) + err = wait.PollImmediate(time.Second*5, time.Minute*5, func() (bool, error) { + _, err = c.CoreV1().Nodes().Get(nodeToDelete.Name, metav1.GetOptions{}) + if err == nil || !apierrors.IsNotFound(err) { + return false, nil + } + return true, nil + }) + framework.ExpectNoError(err) + + ginkgo.By("[AWS/EKS] New instance will be created and join the cluster") + _, err := e2enode.CheckReady(c, len(nodeList.Items), 5*time.Minute) + framework.ExpectNoError(err) + + ginkgo.By("[AWS/EKS] Initialize newly created node") + nodeList, err = c.CoreV1().Nodes().List(metav1.ListOptions{}) + framework.ExpectNoError(err) + initialized := 0 + for _, node := range nodeList.Items { + if _, ok := allNodes[node.Name]; !ok { + framework.ExpectNoError(utilnode.InitNode(&node)) + initialized++ + } + } + gomega.Expect(initialized).To(gomega.BeNumerically("==", 1), "must have a node initialized") + } else if framework.TestContext.Provider == "gke" { + instanceIDAnn := "container.googleapis.com/instance_id" + oldInstanceID, ok := nodeToDelete.Annotations[instanceIDAnn] + if !ok { + framework.Failf("instance label %q not found on node object %q", instanceIDAnn, nodeToDelete.Name) + } + + ginkgo.By("[GCP/GKE] Wait for instance ID to be updated") + err = wait.PollImmediate(time.Second*5, time.Minute*10, func() (bool, error) { + node, err := c.CoreV1().Nodes().Get(nodeToDelete.Name, metav1.GetOptions{}) + if err != nil { + return false, nil + } + instanceID, ok := node.Annotations[instanceIDAnn] + if !ok { + return false, nil + } + if instanceID == oldInstanceID { + return false, nil + } + framework.Logf("instance ID of node %q changed from %q to %q", nodeToDelete.Name, oldInstanceID, instanceID) + return true, nil + }) + framework.ExpectNoError(err) + + ginkgo.By("[GCP/GKE] Wait for the node to be ready") + e2enode.WaitForNodeToBeReady(c, nodeToDelete.Name, time.Minute*5) + + ginkgo.By(fmt.Sprintf("[GCP/GKE] Initialize underlying 
machine of node %s", nodeToDelete.Name))
+				node, err := c.CoreV1().Nodes().Get(nodeToDelete.Name, metav1.GetOptions{})
+				framework.ExpectNoError(err)
+				framework.ExpectNoError(utilnode.InitNode(node))
+			}
+
+			ginkgo.By("Mark stores of failed tikv pods as tombstone")
+			pdClient, cancel, err := proxiedpdclient.NewProxiedPDClient(c, fw, ns, clusterName, false, nil)
+			framework.ExpectNoError(err)
+			defer func() {
+				if cancel != nil {
+					cancel()
+				}
+			}()
+			for _, pod := range tikvPodsOnDeletedNode {
+				framework.Logf("Mark tikv store of pod %s/%s as Tombstone", ns, pod.Name)
+				err = wait.PollImmediate(time.Second*3, time.Minute, func() (bool, error) {
+					storeID, err := utiltikv.GetStoreIDByPodName(cli, ns, clusterName, pod.Name)
+					if err != nil {
+						return false, nil
+					}
+					err = pdClient.SetStoreState(storeID, v1alpha1.TiKVStateTombstone)
+					if err != nil {
+						return false, nil
+					}
+					return true, nil
+				})
+				framework.ExpectNoError(err)
+			}
+			ginkgo.By("Delete pd members")
+			for _, pod := range pdPodsOnDeletedNode {
+				framework.Logf("Delete pd member of pod %s/%s", ns, pod.Name)
+				err = wait.PollImmediate(time.Second*3, time.Minute, func() (bool, error) {
+					err = pdClient.DeleteMember(pod.Name)
+					if err != nil {
+						return false, nil
+					}
+					return true, nil
+				})
+				framework.ExpectNoError(err)
+			}
+			cancel()
+			cancel = nil
+
+			if framework.TestContext.Provider == "aws" {
+				// Local storage is gone with the node and local PVs on the deleted
+				// node will be unusable.
+				// If `setPVOwnerRef` is enabled in local-volume-provisioner,
+				// local PVs will be deleted when the node object is deleted
+				// and permanently gone in apiserver when associated PVCs are
+				// deleted here.
+				ginkgo.By("[AWS/EKS] Delete associated PVCs if they are bound with local PVs")
+				localPVs := make([]string, 0)
+				for _, pvcName := range pvcNamesOnDeletedNode {
+					pvc, err := c.CoreV1().PersistentVolumeClaims(ns).Get(pvcName, metav1.GetOptions{})
+					if err != nil && !apierrors.IsNotFound(err) {
+						framework.Failf("apiserver error: %v", err)
+					}
+					if apierrors.IsNotFound(err) {
+						continue
+					}
+					if pvc.Spec.StorageClassName != nil && *pvc.Spec.StorageClassName == "local-storage" {
+						localPVs = append(localPVs, pvc.Spec.VolumeName)
+						err = c.CoreV1().PersistentVolumeClaims(ns).Delete(pvc.Name, &metav1.DeleteOptions{})
+						framework.ExpectNoError(err)
+					}
+				}
+			} else if framework.TestContext.Provider == "gke" {
+				framework.Logf("We are using fixed paths in local PVs in our e2e. PVs of the deleted node are usable though the underlying storage is empty now")
+				// Because of pod exponential crash loop back off, we can
+				// delete the failed pods to make them start sooner.
+				// Note that this is optional.
+				ginkgo.By("Deleting the failed pods")
+				for _, pod := range append(tikvPodsOnDeletedNode, pdPodsOnDeletedNode...) {
+					framework.ExpectNoError(c.CoreV1().Pods(ns).Delete(pod.Name, &metav1.DeleteOptions{}))
+				}
+			}
+
+			ginkgo.By("Waiting for tidb cluster to be fully ready")
+			err = oa.WaitForTidbClusterReady(tc, 5*time.Minute, 15*time.Second)
+			framework.ExpectNoError(err)
+		})
 	})
diff --git a/tests/e2e/util/node/node.go b/tests/e2e/util/node/node.go
new file mode 100644
index 0000000000..b8a873706f
--- /dev/null
+++ b/tests/e2e/util/node/node.go
@@ -0,0 +1,82 @@
+// Copyright 2020 PingCAP, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. + +package node + +import ( + v1 "k8s.io/api/core/v1" + "k8s.io/kubernetes/test/e2e/framework" + "k8s.io/kubernetes/test/e2e/framework/ssh" +) + +var ( + awsNodeInitCmd = ` +sudo bash -c ' +test -d /mnt/disks || mkdir -p /mnt/disks +df -h /mnt/disks +if mountpoint /mnt/disks &>/dev/null; then + echo "info: /mnt/disks is a mountpoint" +else + echo "info: /mnt/disks is not a mountpoint, creating local volumes on the rootfs" +fi +cd /mnt/disks +for ((i = 1; i <= 32; i++)) { + if [ ! -d vol$i ]; then + mkdir vol$i + fi + if ! mountpoint vol$i &>/dev/null; then + mount --bind vol$i vol$i + fi +} +echo "info: increase max open files for containers" +if ! grep -qF "OPTIONS" /etc/sysconfig/docker; then + echo 'OPTIONS="--default-ulimit nofile=1024000:1024000"' >> /etc/sysconfig/docker +fi +systemctl restart docker +' +` + // disks are created under /mnt/stateful_partition directory + // https://cloud.google.com/container-optimized-os/docs/concepts/disks-and-filesystem + gkeNodeInitCmd = ` +sudo bash -c ' +test -d /mnt/stateful_partition/disks || mkdir -p /mnt/stateful_partition/disks +df -h /mnt/stateful_partition/disks +test -d /mnt/disks || mkdir -p /mnt/disks +cd /mnt/disks +for ((i = 1; i <= 32; i++)) { + if [ ! -d vol$i ]; then + mkdir vol$i + fi + if ! mountpoint vol$i &>/dev/null; then + if [ ! -d /mnt/stateful_partition/disks/vol$i ]; then + mkdir /mnt/stateful_partition/disks/vol$i + fi + mount --bind /mnt/stateful_partition/disks/vol$i vol$i + fi +} +' +` +) + +func InitNode(node *v1.Node) error { + var initNodeCmd string + if framework.TestContext.Provider == "aws" { + initNodeCmd = awsNodeInitCmd + } else if framework.TestContext.Provider == "gke" { + initNodeCmd = gkeNodeInitCmd + } else { + framework.Logf("Unknown provider %q, skipped", framework.TestContext.Provider) + return nil + } + return ssh.IssueSSHCommand(initNodeCmd, framework.TestContext.Provider, node) +} diff --git a/tests/e2e/util/tikv/tikv.go b/tests/e2e/util/tikv/tikv.go new file mode 100644 index 0000000000..1ecbf0d6db --- /dev/null +++ b/tests/e2e/util/tikv/tikv.go @@ -0,0 +1,35 @@ +// Copyright 2020 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package tikv + +import ( + "fmt" + "strconv" + + "github.com/pingcap/tidb-operator/pkg/client/clientset/versioned" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func GetStoreIDByPodName(c versioned.Interface, ns, clusterName, podName string) (uint64, error) { + tc, err := c.PingcapV1alpha1().TidbClusters(ns).Get(clusterName, metav1.GetOptions{}) + if err != nil { + return 0, err + } + for _, store := range tc.Status.TiKV.Stores { + if store.PodName == podName { + return strconv.ParseUint(store.ID, 10, 64) + } + } + return 0, fmt.Errorf("tikv store of pod %q not found in cluster %s/%s", podName, ns, clusterName) +}
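+
+// Illustrative usage (a sketch, not part of this package's API): combined with
+// the new PDClient.SetStoreState, this helper lets a test tombstone the store
+// that belonged to a failed pod, as the stability e2e test above does:
+//
+//	storeID, err := tikv.GetStoreIDByPodName(cli, ns, clusterName, podName)
+//	if err != nil {
+//		return err
+//	}
+//	return pdClient.SetStoreState(storeID, v1alpha1.TiKVStateTombstone)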