diff --git a/hack/e2e.sh b/hack/e2e.sh index 30cc4af4aa..12d617113e 100755 --- a/hack/e2e.sh +++ b/hack/e2e.sh @@ -73,6 +73,7 @@ Environments: AWS_ACCESS_KEY_ID (eks only) the aws access key id AWS_SECRET_ACCESS_KEY (eks only) the aws secret access key AWS_REGION (eks only) the aws region + AWS_ZONE (eks only) the aws zone GINKGO_NODES ginkgo nodes to run specs, defaults: 1 GINKGO_PARALLEL if set to `y`, will run specs in parallel, the number of nodes will be the number of cpus GINKGO_NO_COLOR if set to `y`, suppress color output in default reporter @@ -197,6 +198,7 @@ GCP_MACHINE_TYPE=${GCP_MACHINE_TYPE:-n1-standard-4} AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID:-} AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY:-} AWS_REGION=${AWS_REGION:-} +AWS_ZONE=${AWS_ZONE:-} KUBE_VERSION=${KUBE_VERSION:-v1.12.10} KUBE_WORKERS=${KUBE_WORKERS:-3} DOCKER_IO_MIRROR=${DOCKER_IO_MIRROR:-} @@ -223,6 +225,7 @@ echo "GCP_ZONE: $GCP_ZONE" # echo "AWS_ACCESS_KEY_ID: $AWS_ACCESS_KEY_ID" # echo "AWS_SECRET_ACCESS_KEY: $AWS_SECRET_ACCESS_KEY" echo "AWS_REGION: $AWS_REGION" +echo "AWS_ZONE: $AWS_ZONE" echo "KUBE_VERSION: $KUBE_VERSION" echo "KUBE_WORKERS: $KUBE_WORKERS" echo "DOCKER_IO_MIRROR: $DOCKER_IO_MIRROR" @@ -465,10 +468,14 @@ EOF ) fi elif [ "$PROVIDER" == "eks" ]; then + export KUBE_SSH_USER=ec2-user hack::ensure_aws_k8s_tester if [ -n "$AWS_REGION" ]; then aws configure set default.region "$AWS_REGION" fi + if [ -z "$AWS_ZONE" ]; then + AWS_ZONE=${AWS_REGION}a + fi if [ -n "$AWS_ACCESS_KEY_ID" ]; then aws configure set aws_access_key_id "$AWS_ACCESS_KEY_ID" fi @@ -505,6 +512,10 @@ export GCP_PROJECT export GCP_REGION export GCP_ZONE export GCP_CREDENTIALS +export AWS_ACCESS_KEY_ID +export AWS_SECRET_ACCESS_KEY +export AWS_REGION +export AWS_ZONE export IMAGE_TAG export SKIP_GINKGO export SKIP_IMAGE_LOAD diff --git a/hack/run-e2e.sh b/hack/run-e2e.sh index 0258eb1e39..7542cba0ef 100755 --- a/hack/run-e2e.sh +++ b/hack/run-e2e.sh @@ -29,6 +29,7 @@ GCP_REGION=${GCP_REGION:-} GCP_ZONE=${GCP_ZONE:-} GCP_CREDENTIALS=${GCP_CREDENTIALS:-} GCP_SDK=${GCP_SDK:-/google-cloud-sdk} +KUBE_SSH_USER=${KUBE_SSH_USER:-vagrant} IMAGE_TAG=${IMAGE_TAG:-} SKIP_IMAGE_LOAD=${SKIP_IMAGE_LOAD:-} TIDB_OPERATOR_IMAGE=${TIDB_OPERATOR_IMAGE:-localhost:5000/pingcap/tidb-operator:latest} @@ -51,6 +52,7 @@ if [ -z "$KUBECONFIG" ]; then exit 1 fi +echo "KUBE_SSH_USER: $KUBE_SSH_USER" echo "TIDB_OPERATOR_IMAGE: $TIDB_OPERATOR_IMAGE" echo "TIDB_BACKUP_MANAGER_IMAGE: $TIDB_BACKUP_MANAGER_IMAGE" echo "E2E_IMAGE: $E2E_IMAGE" @@ -123,67 +125,15 @@ for ((i = 1; i <= 32; i++)) { EOF done elif [ "$PROVIDER" == "gke" ]; then - # disks are created under /mnt/stateful_partition directory - # https://cloud.google.com/container-optimized-os/docs/concepts/disks-and-filesystem - for n in $($KUBECTL_BIN --context "$KUBECONTEXT" get nodes -ojsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}'); do - gcloud compute ssh e2e@$n --command 'sudo bash -c '"'"' -test -d /mnt/stateful_partition/disks || mkdir -p /mnt/stateful_partition/disks -df -h /mnt/stateful_partition/disks -test -d /mnt/disks || mkdir -p /mnt/disks -cd /mnt/disks -for ((i = 1; i <= 32; i++)) { - if [ ! -d vol$i ]; then - mkdir vol$i - fi - if ! mountpoint vol$i &>/dev/null; then - if [ ! 
-d /mnt/stateful_partition/disks/vol$i ]; then - mkdir /mnt/stateful_partition/disks/vol$i - fi - mount --bind /mnt/stateful_partition/disks/vol$i vol$i - fi -} -'"'" - done + echo "info: provider is $PROVIDER, skipped" elif [ "$PROVIDER" == "eks" ]; then - while IFS=$'\n' read -r line; do - read -r id dns <<< $line - echo "info: prepare disks on $dns" - ssh -T -o "StrictHostKeyChecking no" -i ~/.ssh/kube_aws_rsa ec2-user@$dns <<'EOF' -sudo bash -c ' -test -d /mnt/disks || mkdir -p /mnt/disks -df -h /mnt/disks -if mountpoint /mnt/disks &>/dev/null; then - echo "info: /mnt/disks is a mountpoint" -else - echo "info: /mnt/disks is not a mountpoint, creating local volumes on the rootfs" -fi -cd /mnt/disks -for ((i = 1; i <= 32; i++)) { - if [ ! -d vol$i ]; then - mkdir vol$i - fi - if ! mountpoint vol$i &>/dev/null; then - mount --bind vol$i vol$i - fi -} -echo "info: increase max open files for containers" -if ! grep -qF "OPTIONS" /etc/sysconfig/docker; then - echo 'OPTIONS="--default-ulimit nofile=1024000:1024000"' >> /etc/sysconfig/docker -fi -systemctl restart docker -' -EOF - done <<< "$(e2e::__eks_instances)" + echo "info: provider is $PROVIDER, skipped" fi echo "info: installing local-volume-provisioner" $KUBECTL_BIN --context $KUBECONTEXT apply -f ${ROOT}/manifests/local-dind/local-volume-provisioner.yaml e2e::__wait_for_ds kube-system local-volume-provisioner } -function e2e::__eks_instances() { - aws ec2 describe-instances --filter Name=tag:eks:cluster-name,Values=$CLUSTER --query 'Reservations[*].Instances[*].{InstanceId:InstanceId,PublicDnsName:PublicDnsName}' --output text -} - function e2e::__ecr_url() { local account_id=$(aws sts get-caller-identity --output text | awk '{print $1}') local region=$(aws configure get region) @@ -211,6 +161,13 @@ function e2e::setup_helm_server() { $HELM_BIN version } +# Used by non-kind providers to tag image with its id. This can force our e2e +# process to pull correct image even if IfNotPresent is used in an existing +# environment, e.g. testing in the same cluster. 
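+#
+# A minimal illustration (hypothetical image ID; real digests differ):
+#   $ docker image inspect -f '{{.Id}}' localhost:5000/pingcap/tidb-operator:latest
+#   sha256:9b7c3f0e12d4...
+# e2e::image_id_tag would print "9b7c3f0e12", so the retagged image becomes e.g.
+#   gcr.io/$GCP_PROJECT/tidb-operator:$CLUSTER-9b7c3f0e12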
+function e2e::image_id_tag() { + docker image inspect -f '{{.Id}}' "$1" | cut -d ':' -f 2 | head -c 10 +} + function e2e::image_load() { local images=( $TIDB_OPERATOR_IMAGE @@ -226,9 +183,9 @@ function e2e::image_load() { elif [ "$PROVIDER" == "gke" ]; then unset DOCKER_CONFIG # We don't need this and it may be read-only and fail the command to fail gcloud auth configure-docker - GCP_TIDB_OPERATOR_IMAGE=gcr.io/$GCP_PROJECT/tidb-operator:$CLUSTER-$IMAGE_TAG - GCP_TIDB_BACKUP_MANAGER_IMAGE=gcr.io/$GCP_PROJECT/tidb-backup-image:$CLUSTER-$IMAGE_TAG - GCP_E2E_IMAGE=gcr.io/$GCP_PROJECT/tidb-operator-e2e:$CLUSTER-$IMAGE_TAG + GCP_TIDB_OPERATOR_IMAGE=gcr.io/$GCP_PROJECT/tidb-operator:$CLUSTER-$(e2e::image_id_tag $TIDB_OPERATOR_IMAGE) + GCP_TIDB_BACKUP_MANAGER_IMAGE=gcr.io/$GCP_PROJECT/tidb-backup-image:$CLUSTER-$(e2e::image_id_tag $TIDB_BACKUP_MANAGER_IMAGE) + GCP_E2E_IMAGE=gcr.io/$GCP_PROJECT/tidb-operator-e2e:$CLUSTER-$(e2e::image_id_tag $E2E_IMAGE) docker tag $TIDB_OPERATOR_IMAGE $GCP_TIDB_OPERATOR_IMAGE docker tag $E2E_IMAGE $GCP_E2E_IMAGE docker tag $TIDB_BACKUP_MANAGER_IMAGE $GCP_TIDB_BACKUP_MANAGER_IMAGE @@ -253,9 +210,9 @@ function e2e::image_load() { local ecrURL=$(e2e::__ecr_url) echo "info: logging in $ecrURL" aws ecr get-login-password | docker login --username AWS --password-stdin $ecrURL - AWS_TIDB_OPERATOR_IMAGE=$ecrURL/e2e/tidb-operator:$CLUSTER-$IMAGE_TAG - AWS_TIDB_BACKUP_MANAGER_IMAGE=$ecrURL/e2e/tidb-backup-manager:$CLUSTER-$IMAGE_TAG - AWS_E2E_IMAGE=$ecrURL/e2e/tidb-operator-e2e:$CLUSTER-$IMAGE_TAG + AWS_TIDB_OPERATOR_IMAGE=$ecrURL/e2e/tidb-operator:$CLUSTER-$(e2e::image_id_tag $TIDB_OPERATOR_IMAGE) + AWS_TIDB_BACKUP_MANAGER_IMAGE=$ecrURL/e2e/tidb-backup-manager:$CLUSTER-$(e2e::image_id_tag $TIDB_BACKUP_MANAGER_IMAGE) + AWS_E2E_IMAGE=$ecrURL/e2e/tidb-operator-e2e:$CLUSTER-$(e2e::image_id_tag $E2E_IMAGE) docker tag $TIDB_OPERATOR_IMAGE $AWS_TIDB_OPERATOR_IMAGE docker tag $TIDB_BACKUP_MANAGER_IMAGE $AWS_TIDB_BACKUP_MANAGER_IMAGE docker tag $E2E_IMAGE $AWS_E2E_IMAGE @@ -363,16 +320,19 @@ docker_args=( -v $KUBECONFIG:/etc/kubernetes/admin.conf:ro --env KUBECONFIG=/etc/kubernetes/admin.conf --env KUBECONTEXT=$KUBECONTEXT + --env KUBE_SSH_USER=$KUBE_SSH_USER ) if [ "$PROVIDER" == "eks" ]; then e2e_args+=( --provider=aws - --gce-zone="${AWS_REGION}" + --gce-zone="${AWS_ZONE}" # reuse gce-zone to configure aws zone ) - # aws credential is required to get token for EKS docker_args+=( + # aws credential is required to get token for EKS -v $HOME/.aws:/root/.aws + # ~/.ssh/kube_aws_rsa must be mounted into e2e container to run ssh + -v $HOME/.ssh/kube_aws_rsa:/root/.ssh/kube_aws_rsa ) elif [ "$PROVIDER" == "gke" ]; then e2e_args+=( @@ -393,6 +353,8 @@ elif [ "$PROVIDER" == "gke" ]; then fi docker_args+=( -v ${GCP_SDK}:/google-cloud-sdk + # ~/.ssh/google_compute_engine must be mounted into e2e container to run ssh + -v $HOME/.ssh/google_compute_engine:/root/.ssh/google_compute_engine ) else e2e_args+=( diff --git a/manifests/gke/local-ssd-provision/local-ssd-provision.yaml b/manifests/gke/local-ssd-provision/local-ssd-provision.yaml index 18229b223d..0798ba0413 100644 --- a/manifests/gke/local-ssd-provision/local-ssd-provision.yaml +++ b/manifests/gke/local-ssd-provision/local-ssd-provision.yaml @@ -12,6 +12,7 @@ metadata: name: local-provisioner-config namespace: kube-system data: + setPVOwnerRef: "true" nodeLabelsForPV: | - kubernetes.io/hostname storageClassMap: | diff --git a/manifests/local-dind/local-volume-provisioner.yaml b/manifests/local-dind/local-volume-provisioner.yaml index 
bd8129ac17..ffc3c28342 100644
--- a/manifests/local-dind/local-volume-provisioner.yaml
+++ b/manifests/local-dind/local-volume-provisioner.yaml
@@ -12,6 +12,7 @@ metadata:
   name: local-provisioner-config
   namespace: kube-system
 data:
+  setPVOwnerRef: "true"
   nodeLabelsForPV: |
     - kubernetes.io/hostname
   storageClassMap: |
diff --git a/pkg/pdapi/pdapi.go b/pkg/pdapi/pdapi.go
index 1a3788079e..ab9bdb32c8 100644
--- a/pkg/pdapi/pdapi.go
+++ b/pkg/pdapi/pdapi.go
@@ -155,6 +155,8 @@ type PDClient interface {
 	SetStoreLabels(storeID uint64, labels map[string]string) (bool, error)
 	// DeleteStore deletes a TiKV store from cluster
 	DeleteStore(storeID uint64) error
+	// SetStoreState sets the store to the specified state.
+	SetStoreState(storeID uint64, state string) error
 	// DeleteMember deletes a PD member from cluster
 	DeleteMember(name string) error
 	// DeleteMemberByID deletes a PD member from cluster
@@ -403,6 +405,30 @@ func (pc *pdClient) DeleteStore(storeID uint64) error {
 	return fmt.Errorf("failed to delete store %d: %v", storeID, string(body))
 }
 
+// SetStoreState sets the store to the specified state.
+func (pc *pdClient) SetStoreState(storeID uint64, state string) error {
+	apiURL := fmt.Sprintf("%s/%s/%d/state?state=%s", pc.url, storePrefix, storeID, state)
+	req, err := http.NewRequest("POST", apiURL, nil)
+	if err != nil {
+		return err
+	}
+	res, err := pc.httpClient.Do(req)
+	if err != nil {
+		return err
+	}
+	defer httputil.DeferClose(res.Body)
+
+	if res.StatusCode == http.StatusOK || res.StatusCode == http.StatusNotFound {
+		return nil
+	}
+	body, err := ioutil.ReadAll(res.Body)
+	if err != nil {
+		return err
+	}
+
+	return fmt.Errorf("failed to set state of store %d: %v", storeID, string(body))
+}
+
 func (pc *pdClient) DeleteMemberByID(memberID uint64) error {
 	var exist bool
 	members, err := pc.GetMembers()
@@ -666,6 +692,7 @@ const (
 	GetTombStoneStoresActionType ActionType = "GetTombStoneStores"
 	GetStoreActionType           ActionType = "GetStore"
 	DeleteStoreActionType        ActionType = "DeleteStore"
+	SetStoreStateActionType      ActionType = "SetStoreState"
 	DeleteMemberByIDActionType   ActionType = "DeleteMemberByID"
 	DeleteMemberActionType       ActionType = "DeleteMember "
 	SetStoreLabelsActionType     ActionType = "SetStoreLabels"
@@ -790,6 +817,15 @@ func (pc *FakePDClient) DeleteStore(id uint64) error {
 	return nil
 }
 
+func (pc *FakePDClient) SetStoreState(id uint64, state string) error {
+	if reaction, ok := pc.reactions[SetStoreStateActionType]; ok {
+		action := &Action{ID: id}
+		_, err := reaction(action)
+		return err
+	}
+	return nil
+}
+
 func (pc *FakePDClient) DeleteMemberByID(id uint64) error {
 	if reaction, ok := pc.reactions[DeleteMemberByIDActionType]; ok {
 		action := &Action{ID: id}
diff --git a/tests/actions.go b/tests/actions.go
index a5c797a6fa..1b33a81572 100644
--- a/tests/actions.go
+++ b/tests/actions.go
@@ -294,6 +294,7 @@ type OperatorConfig struct {
 	ValidatingEnabled  bool
 	Cabundle           string
 	BackupImage        string
+	AutoFailover       *bool
 }
 
 type TidbClusterConfig struct {
@@ -408,7 +409,6 @@ func (oi *OperatorConfig) OperatorHelmSetString(m map[string]string) string {
 	set := map[string]string{
 		"operatorImage":                  oi.Image,
 		"tidbBackupManagerImage":         oi.BackupImage,
-		"controllerManager.autoFailover": "true",
 		"scheduler.logLevel":             "4",
 		"testMode":                       strconv.FormatBool(oi.TestMode),
 		"admissionWebhook.cabundle":      oi.Cabundle,
@@ -442,6 +442,9 @@
 	if oi.Enabled(features.AdvancedStatefulSet) {
 		set["advancedStatefulset.create"] = "true"
 	}
+	if oi.AutoFailover != nil {
+ set["controllerManager.autoFailover"] = strconv.FormatBool(*oi.AutoFailover) + } arr := make([]string, 0, len(set)) for k, v := range set { diff --git a/tests/e2e/e2e.go b/tests/e2e/e2e.go index 35831e8315..7df4fccf10 100644 --- a/tests/e2e/e2e.go +++ b/tests/e2e/e2e.go @@ -34,6 +34,7 @@ import ( "github.com/pingcap/tidb-operator/tests" e2econfig "github.com/pingcap/tidb-operator/tests/e2e/config" utilimage "github.com/pingcap/tidb-operator/tests/e2e/util/image" + utilnode "github.com/pingcap/tidb-operator/tests/e2e/util/node" v1 "k8s.io/api/core/v1" storagev1 "k8s.io/api/storage/v1" apiextensionsclientset "k8s.io/apiextensions-apiserver/pkg/client/clientset/clientset" @@ -122,6 +123,14 @@ func setupSuite() { e2elog.Logf("WARNING: Waiting for all daemonsets to be ready failed: %v", err) } + ginkgo.By("Initializing all nodes") + nodeList, err := c.CoreV1().Nodes().List(metav1.ListOptions{}) + framework.ExpectNoError(err) + for _, node := range nodeList.Items { + framework.Logf("Initializing node %q", node.Name) + framework.ExpectNoError(utilnode.InitNode(&node)) + } + // By using default storage class in GKE/EKS (aws), network attached storage // which be used and we must clean them later. // We set local-storage class as default for simplicity. diff --git a/tests/e2e/tidbcluster/stability.go b/tests/e2e/tidbcluster/stability.go index bb829a3830..d56659d85a 100644 --- a/tests/e2e/tidbcluster/stability.go +++ b/tests/e2e/tidbcluster/stability.go @@ -20,26 +20,35 @@ import ( "time" "github.com/onsi/ginkgo" + "github.com/onsi/gomega" asclientset "github.com/pingcap/advanced-statefulset/pkg/client/clientset/versioned" + "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1" "github.com/pingcap/tidb-operator/pkg/client/clientset/versioned" "github.com/pingcap/tidb-operator/pkg/label" "github.com/pingcap/tidb-operator/pkg/scheme" "github.com/pingcap/tidb-operator/tests" e2econfig "github.com/pingcap/tidb-operator/tests/e2e/config" utilimage "github.com/pingcap/tidb-operator/tests/e2e/util/image" + utilnode "github.com/pingcap/tidb-operator/tests/e2e/util/node" utilpod "github.com/pingcap/tidb-operator/tests/e2e/util/pod" "github.com/pingcap/tidb-operator/tests/e2e/util/portforward" + "github.com/pingcap/tidb-operator/tests/e2e/util/proxiedpdclient" utiltidb "github.com/pingcap/tidb-operator/tests/e2e/util/tidb" + utiltikv "github.com/pingcap/tidb-operator/tests/e2e/util/tikv" "github.com/pingcap/tidb-operator/tests/pkg/fixture" v1 "k8s.io/api/core/v1" apiextensionsclientset "k8s.io/apiextensions-apiserver/pkg/client/clientset/clientset" + apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/wait" clientset "k8s.io/client-go/kubernetes" restclient "k8s.io/client-go/rest" aggregatorclient "k8s.io/kube-aggregator/pkg/client/clientset_generated/clientset" "k8s.io/kubernetes/test/e2e/framework" + e2enode "k8s.io/kubernetes/test/e2e/framework/node" + "k8s.io/utils/pointer" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -89,7 +98,6 @@ var _ = ginkgo.Describe("[tidb-operator][Stability]", func() { } }) - // TODO generate more contexts for different operator values ginkgo.Context("operator with default values", func() { var ocfg *tests.OperatorConfig var oa tests.OperatorActions @@ -119,8 +127,6 @@ var _ = ginkgo.Describe("[tidb-operator][Stability]", func() { ginkgo.AfterEach(func() { ginkgo.By("Uninstall tidb-operator") oa.CleanOperatorOrDie(ocfg) - 
ginkgo.By("Uninstalling CRDs") - oa.CleanCRDOrDie() }) testCases := []struct { @@ -183,6 +189,329 @@ var _ = ginkgo.Describe("[tidb-operator][Stability]", func() { framework.ExpectEqual(err, wait.ErrWaitTimeout, "TiDB cluster is not affeteced") }) } + }) + + ginkgo.Context("operator with auto-failover disabled", func() { + var ocfg *tests.OperatorConfig + var oa tests.OperatorActions + var genericCli client.Client + + ginkgo.BeforeEach(func() { + ocfg = &tests.OperatorConfig{ + Namespace: ns, + ReleaseName: "operator", + Image: cfg.OperatorImage, + Tag: cfg.OperatorTag, + LogLevel: "4", + TestMode: true, + AutoFailover: pointer.BoolPtr(false), + } + oa = tests.NewOperatorActions(cli, c, asCli, aggrCli, apiExtCli, tests.DefaultPollInterval, ocfg, e2econfig.TestConfig, nil, fw, f) + ginkgo.By("Installing CRDs") + oa.CleanCRDOrDie() + oa.InstallCRDOrDie(ocfg) + ginkgo.By("Installing tidb-operator") + oa.CleanOperatorOrDie(ocfg) + oa.DeployOperatorOrDie(ocfg) + var err error + genericCli, err = client.New(config, client.Options{Scheme: scheme.Scheme}) + framework.ExpectNoError(err, "failed to create clientset") + }) + + ginkgo.AfterEach(func() { + ginkgo.By("Uninstall tidb-operator") + oa.CleanOperatorOrDie(ocfg) + }) + + // In this test, we demonstrate and verify the recover process when a + // node (and local storage on it) is permanently gone. + // + // In cloud, a node can be deleted manually or reclaimed by a + // controller (e.g. auto scaling group if ReplaceUnhealthy not + // suspended). Local storage on it will be permanently unaccessible. + // Manual intervention is required to recover from this situation. + // Basic steps will be: + // + // - for TiKV, delete associated store ID in PD + // - because we use network identity as store address, if we want to + // recover in place, we should delete the previous store at the same + // address. This requires us to set it to tombstone directly because + // the data is permanent lost, there is no way to delete it gracefully. + // - optionally, Advnaced StatefulSet can be used to recover with + // different network identity + // - for PD, like TiKV we must delete its member from the cluster + // - (EKS only) delete pvcs of failed pods + // - in EKS, failed pods on deleted node will be recreated because + // the node object is gone too (old pods is recycled by pod gc). But + // the newly created pods will be stuck at Pending state because + // associated PVs are invalid now. Pods will be recreated by + // tidb-operator again when we delete associated PVCs. New PVCs will + // be created by statefulset controller and pods will be scheduled to + // feasible nodes. + // - it's highly recommended to enable `setPVOwnerRef` in + // local-volume-provisioner, then orphan PVs will be garbaged + // collected and will not cause problem even if the name of deleted + // node is used again in the future. + // - (GKE only, fixed path) nothing need to do + // - Because the node name does not change, old PVs can be used. Note + // that `setPVOwnerRef` cannot be enabled because the node object + // could get deleted if it takes too long for the instance to + // recreate. + // - Optionally, you can deleted failed pods to make them to start + // soon. This is due to exponential crash loop back off. + // - (GKE only, unique paths) delete failed pods and associated PVCs/PVs + // - This is because even if the node name does not change, old PVs + // are invalid because unique volume paths are used. 
We must delete + // them all and wait for Kubernetes to rcreate and run again. + // - PVs must be deleted because the PVs are invalid and should not + // exist anymore. We can configure `setPVOwnerRef` to clean unused + // PVs when the node object is deleted, but the node object will not + // get deleted if the instance is recreated soon. + // + // Note that: + // - We assume local storage is used, otherwise PV can be re-attached + // the new node without problem. + // - PD and TiKV must have at least 3 replicas, otherwise one node + // deletion will cause permanent data loss and the cluster will be unrecoverable. + // - Of course, this process can be automated by implementing a + // controller integrated with cloud providers. It's outside the scope + // of tidb-operator now. + // - The same process can apply in bare-metal environment too when a + // machine or local storage is permanently gone. + // + // Differences between EKS and GKE: + // + // - In EKS, a new node object with different name will be created for + // the new machine. + // - In GKE (1.11+), the node object are no longer recreated on + // upgrade/repair even though the underlying instance is recreated and + // local disks are wiped. However, the node object could get deleted by + // cloud-controller-manager if it takes too long for the instance to + // recreate. + // + // Related issues: + // - https://github.com/pingcap/tidb-operator/issues/1546 + // - https://github.com/pingcap/tidb-operator/issues/408 + ginkgo.It("recover tidb cluster from node deletion", func() { + supportedProviders := sets.NewString("aws", "gke") + if !supportedProviders.Has(framework.TestContext.Provider) { + framework.Skipf("current provider is not supported list %v, skipping", supportedProviders.List()) + } + + ginkgo.By("Wait for all nodes are schedulable") + framework.ExpectNoError(framework.WaitForAllNodesSchedulable(c, framework.TestContext.NodeSchedulableTimeout)) + + ginkgo.By("Make sure we have at least 3 schedulable nodes") + nodeList := framework.GetReadySchedulableNodesOrDie(f.ClientSet) + gomega.Expect(len(nodeList.Items)).To(gomega.BeNumerically(">=", 3)) + + ginkgo.By("Deploy a test cluster with 3 pd and tikv replicas") + clusterName := "test" + tc := fixture.GetTidbCluster(ns, clusterName, utilimage.TiDBV3Version) + tc.Spec.PD.Replicas = 3 + tc.Spec.TiDB.Replicas = 1 + tc.Spec.TiKV.Replicas = 3 + err := genericCli.Create(context.TODO(), tc) + framework.ExpectNoError(err) + err = oa.WaitForTidbClusterReady(tc, 30*time.Minute, 15*time.Second) + framework.ExpectNoError(err) + + ginkgo.By("By using tidb-scheduler, 3 TiKV/PD replicas should be on different nodes") + allNodes := make(map[string]v1.Node) + for _, node := range nodeList.Items { + allNodes[node.Name] = node + } + allTiKVNodes := make(map[string]v1.Node) + allPDNodes := make(map[string]v1.Node) + listOptions := metav1.ListOptions{ + LabelSelector: labels.SelectorFromSet(label.New().Instance(clusterName).Labels()).String(), + } + podList, err := c.CoreV1().Pods(ns).List(listOptions) + framework.ExpectNoError(err) + for _, pod := range podList.Items { + if v, ok := pod.Labels[label.ComponentLabelKey]; !ok { + framework.Failf("pod %s/%s does not have component label key %q", pod.Namespace, pod.Name, label.ComponentLabelKey) + } else if v == label.PDLabelVal { + allPDNodes[pod.Name] = allNodes[pod.Spec.NodeName] + } else if v == label.TiKVLabelVal { + allTiKVNodes[pod.Name] = allNodes[pod.Spec.NodeName] + } else { + continue + } + } + 
gomega.Expect(len(allPDNodes)).To(gomega.BeNumerically("==", 3), "the number of pd nodes should be 3") + gomega.Expect(len(allTiKVNodes)).To(gomega.BeNumerically("==", 3), "the number of tikv nodes should be 3") + + ginkgo.By("Deleting a node") + var nodeToDelete *v1.Node + for _, node := range allTiKVNodes { + if nodeToDelete == nil { + nodeToDelete = &node + break + } + } + gomega.Expect(nodeToDelete).NotTo(gomega.BeNil()) + var pdPodsOnDeletedNode []v1.Pod + var tikvPodsOnDeletedNode []v1.Pod + var pvcNamesOnDeletedNode []string + for _, pod := range podList.Items { + if pod.Spec.NodeName == nodeToDelete.Name { + if v, ok := pod.Labels[label.ComponentLabelKey]; ok { + if v == label.PDLabelVal { + pdPodsOnDeletedNode = append(pdPodsOnDeletedNode, pod) + } else if v == label.TiKVLabelVal { + tikvPodsOnDeletedNode = append(tikvPodsOnDeletedNode, pod) + } + } + for _, volume := range pod.Spec.Volumes { + if volume.PersistentVolumeClaim != nil { + pvcNamesOnDeletedNode = append(pvcNamesOnDeletedNode, volume.PersistentVolumeClaim.ClaimName) + } + } + } + } + gomega.Expect(len(tikvPodsOnDeletedNode)).To(gomega.BeNumerically(">=", 1), "the number of affected tikvs must be equal or greater than 1") + err = framework.DeleteNodeOnCloudProvider(nodeToDelete) + framework.ExpectNoError(err, fmt.Sprintf("failed to delete node %q", nodeToDelete.Name)) + framework.Logf("Node %q deleted", nodeToDelete.Name) + + if framework.TestContext.Provider == "aws" { + // The node object will be gone with physical machine. + ginkgo.By(fmt.Sprintf("[AWS/EKS] Wait for the node object %q to be deleted", nodeToDelete.Name)) + err = wait.PollImmediate(time.Second*5, time.Minute*5, func() (bool, error) { + _, err = c.CoreV1().Nodes().Get(nodeToDelete.Name, metav1.GetOptions{}) + if err == nil || !apierrors.IsNotFound(err) { + return false, nil + } + return true, nil + }) + framework.ExpectNoError(err) + + ginkgo.By("[AWS/EKS] New instance will be created and join the cluster") + _, err := e2enode.CheckReady(c, len(nodeList.Items), 5*time.Minute) + framework.ExpectNoError(err) + + ginkgo.By("[AWS/EKS] Initialize newly created node") + nodeList, err = c.CoreV1().Nodes().List(metav1.ListOptions{}) + framework.ExpectNoError(err) + initialized := 0 + for _, node := range nodeList.Items { + if _, ok := allNodes[node.Name]; !ok { + framework.ExpectNoError(utilnode.InitNode(&node)) + initialized++ + } + } + gomega.Expect(initialized).To(gomega.BeNumerically("==", 1), "must have a node initialized") + } else if framework.TestContext.Provider == "gke" { + instanceIDAnn := "container.googleapis.com/instance_id" + oldInstanceID, ok := nodeToDelete.Annotations[instanceIDAnn] + if !ok { + framework.Failf("instance label %q not found on node object %q", instanceIDAnn, nodeToDelete.Name) + } + + ginkgo.By("[GCP/GKE] Wait for instance ID to be updated") + err = wait.PollImmediate(time.Second*5, time.Minute*10, func() (bool, error) { + node, err := c.CoreV1().Nodes().Get(nodeToDelete.Name, metav1.GetOptions{}) + if err != nil { + return false, nil + } + instanceID, ok := node.Annotations[instanceIDAnn] + if !ok { + return false, nil + } + if instanceID == oldInstanceID { + return false, nil + } + framework.Logf("instance ID of node %q changed from %q to %q", nodeToDelete.Name, oldInstanceID, instanceID) + return true, nil + }) + framework.ExpectNoError(err) + + ginkgo.By("[GCP/GKE] Wait for the node to be ready") + e2enode.WaitForNodeToBeReady(c, nodeToDelete.Name, time.Minute*5) + + ginkgo.By(fmt.Sprintf("[GCP/GKE] Initialize underlying 
machine of node %s", nodeToDelete.Name))
+				node, err := c.CoreV1().Nodes().Get(nodeToDelete.Name, metav1.GetOptions{})
+				framework.ExpectNoError(err)
+				framework.ExpectNoError(utilnode.InitNode(node))
+			}
+
+			ginkgo.By("Mark stores of failed tikv pods as tombstone")
+			pdClient, cancel, err := proxiedpdclient.NewProxiedPDClient(c, fw, ns, clusterName, false, nil)
+			framework.ExpectNoError(err)
+			defer func() {
+				if cancel != nil {
+					cancel()
+				}
+			}()
+			for _, pod := range tikvPodsOnDeletedNode {
+				framework.Logf("Mark tikv store of pod %s/%s as Tombstone", ns, pod.Name)
+				err = wait.PollImmediate(time.Second*3, time.Minute, func() (bool, error) {
+					storeID, err := utiltikv.GetStoreIDByPodName(cli, ns, clusterName, pod.Name)
+					if err != nil {
+						return false, nil
+					}
+					err = pdClient.SetStoreState(storeID, v1alpha1.TiKVStateTombstone)
+					if err != nil {
+						return false, nil
+					}
+					return true, nil
+				})
+				framework.ExpectNoError(err)
+			}
+			ginkgo.By("Delete pd members")
+			for _, pod := range pdPodsOnDeletedNode {
+				framework.Logf("Delete pd member of pod %s/%s", ns, pod.Name)
+				err = wait.PollImmediate(time.Second*3, time.Minute, func() (bool, error) {
+					err = pdClient.DeleteMember(pod.Name)
+					if err != nil {
+						return false, nil
+					}
+					return true, nil
+				})
+				framework.ExpectNoError(err)
+			}
+			cancel()
+			cancel = nil
+
+			if framework.TestContext.Provider == "aws" {
+				// Local storage is gone with the node and local PVs on the deleted
+				// node will be unusable.
+				// If `setPVOwnerRef` is enabled in local-volume-provisioner,
+				// local PVs will be deleted when the node object is deleted
+				// and permanently gone in apiserver when associated PVCs are
+				// deleted here.
+				ginkgo.By("[AWS/EKS] Delete associated PVCs if they are bound with local PVs")
+				localPVs := make([]string, 0)
+				for _, pvcName := range pvcNamesOnDeletedNode {
+					pvc, err := c.CoreV1().PersistentVolumeClaims(ns).Get(pvcName, metav1.GetOptions{})
+					if err != nil && !apierrors.IsNotFound(err) {
+						framework.Failf("apiserver error: %v", err)
+					}
+					if apierrors.IsNotFound(err) {
+						continue
+					}
+					if pvc.Spec.StorageClassName != nil && *pvc.Spec.StorageClassName == "local-storage" {
+						localPVs = append(localPVs, pvc.Spec.VolumeName)
+						err = c.CoreV1().PersistentVolumeClaims(ns).Delete(pvc.Name, &metav1.DeleteOptions{})
+						framework.ExpectNoError(err)
+					}
+				}
+			} else if framework.TestContext.Provider == "gke" {
+				framework.Logf("We are using fixed paths in local PVs in our e2e. PVs of the deleted node are usable though the underlying storage is empty now")
+				// Because of pod exponential crash loop back off, we can
+				// delete the failed pods to make them start sooner.
+				// Note that this is optional.
+				ginkgo.By("Deleting the failed pods")
+				for _, pod := range append(tikvPodsOnDeletedNode, pdPodsOnDeletedNode...) {
+					framework.ExpectNoError(c.CoreV1().Pods(ns).Delete(pod.Name, &metav1.DeleteOptions{}))
+				}
+			}
+
+			ginkgo.By("Waiting for tidb cluster to be fully ready")
+			err = oa.WaitForTidbClusterReady(tc, 5*time.Minute, 15*time.Second)
+			framework.ExpectNoError(err)
+		})
 	})
diff --git a/tests/e2e/util/node/node.go b/tests/e2e/util/node/node.go
new file mode 100644
index 0000000000..b8a873706f
--- /dev/null
+++ b/tests/e2e/util/node/node.go
@@ -0,0 +1,82 @@
+// Copyright 2020 PingCAP, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. + +package node + +import ( + v1 "k8s.io/api/core/v1" + "k8s.io/kubernetes/test/e2e/framework" + "k8s.io/kubernetes/test/e2e/framework/ssh" +) + +var ( + awsNodeInitCmd = ` +sudo bash -c ' +test -d /mnt/disks || mkdir -p /mnt/disks +df -h /mnt/disks +if mountpoint /mnt/disks &>/dev/null; then + echo "info: /mnt/disks is a mountpoint" +else + echo "info: /mnt/disks is not a mountpoint, creating local volumes on the rootfs" +fi +cd /mnt/disks +for ((i = 1; i <= 32; i++)) { + if [ ! -d vol$i ]; then + mkdir vol$i + fi + if ! mountpoint vol$i &>/dev/null; then + mount --bind vol$i vol$i + fi +} +echo "info: increase max open files for containers" +if ! grep -qF "OPTIONS" /etc/sysconfig/docker; then + echo 'OPTIONS="--default-ulimit nofile=1024000:1024000"' >> /etc/sysconfig/docker +fi +systemctl restart docker +' +` + // disks are created under /mnt/stateful_partition directory + // https://cloud.google.com/container-optimized-os/docs/concepts/disks-and-filesystem + gkeNodeInitCmd = ` +sudo bash -c ' +test -d /mnt/stateful_partition/disks || mkdir -p /mnt/stateful_partition/disks +df -h /mnt/stateful_partition/disks +test -d /mnt/disks || mkdir -p /mnt/disks +cd /mnt/disks +for ((i = 1; i <= 32; i++)) { + if [ ! -d vol$i ]; then + mkdir vol$i + fi + if ! mountpoint vol$i &>/dev/null; then + if [ ! -d /mnt/stateful_partition/disks/vol$i ]; then + mkdir /mnt/stateful_partition/disks/vol$i + fi + mount --bind /mnt/stateful_partition/disks/vol$i vol$i + fi +} +' +` +) + +func InitNode(node *v1.Node) error { + var initNodeCmd string + if framework.TestContext.Provider == "aws" { + initNodeCmd = awsNodeInitCmd + } else if framework.TestContext.Provider == "gke" { + initNodeCmd = gkeNodeInitCmd + } else { + framework.Logf("Unknown provider %q, skipped", framework.TestContext.Provider) + return nil + } + return ssh.IssueSSHCommand(initNodeCmd, framework.TestContext.Provider, node) +} diff --git a/tests/e2e/util/tikv/tikv.go b/tests/e2e/util/tikv/tikv.go new file mode 100644 index 0000000000..1ecbf0d6db --- /dev/null +++ b/tests/e2e/util/tikv/tikv.go @@ -0,0 +1,35 @@ +// Copyright 2020 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package tikv + +import ( + "fmt" + "strconv" + + "github.com/pingcap/tidb-operator/pkg/client/clientset/versioned" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func GetStoreIDByPodName(c versioned.Interface, ns, clusterName, podName string) (uint64, error) { + tc, err := c.PingcapV1alpha1().TidbClusters(ns).Get(clusterName, metav1.GetOptions{}) + if err != nil { + return 0, err + } + for _, store := range tc.Status.TiKV.Stores { + if store.PodName == podName { + return strconv.ParseUint(store.ID, 10, 64) + } + } + return 0, fmt.Errorf("tikv store of pod %q not found in cluster %s/%s", podName, ns, clusterName) +}
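+
+// Illustrative usage (a sketch, not part of this package's API): combined with
+// the new PDClient.SetStoreState, this helper lets a test tombstone the store
+// that belonged to a failed pod, as the stability e2e test above does:
+//
+//	storeID, err := tikv.GetStoreIDByPodName(cli, ns, clusterName, podName)
+//	if err != nil {
+//		return err
+//	}
+//	return pdClient.SetStoreState(storeID, v1alpha1.TiKVStateTombstone)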