add a recovery test on node deletion for eks/gke #2119

Merged
merged 5 commits into from
Apr 7, 2020
11 changes: 11 additions & 0 deletions hack/e2e.sh
@@ -73,6 +73,7 @@ Environments:
AWS_ACCESS_KEY_ID (eks only) the aws access key id
AWS_SECRET_ACCESS_KEY (eks only) the aws secret access key
AWS_REGION (eks only) the aws region
AWS_ZONE (eks only) the aws zone
GINKGO_NODES ginkgo nodes to run specs, defaults: 1
GINKGO_PARALLEL if set to `y`, will run specs in parallel, the number of nodes will be the number of cpus
GINKGO_NO_COLOR if set to `y`, suppress color output in default reporter
@@ -197,6 +198,7 @@ GCP_MACHINE_TYPE=${GCP_MACHINE_TYPE:-n1-standard-4}
AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID:-}
AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY:-}
AWS_REGION=${AWS_REGION:-}
AWS_ZONE=${AWS_ZONE:-}
KUBE_VERSION=${KUBE_VERSION:-v1.12.10}
KUBE_WORKERS=${KUBE_WORKERS:-3}
DOCKER_IO_MIRROR=${DOCKER_IO_MIRROR:-}
@@ -223,6 +225,7 @@ echo "GCP_ZONE: $GCP_ZONE"
# echo "AWS_ACCESS_KEY_ID: $AWS_ACCESS_KEY_ID"
# echo "AWS_SECRET_ACCESS_KEY: $AWS_SECRET_ACCESS_KEY"
echo "AWS_REGION: $AWS_REGION"
echo "AWS_ZONE: $AWS_ZONE"
echo "KUBE_VERSION: $KUBE_VERSION"
echo "KUBE_WORKERS: $KUBE_WORKERS"
echo "DOCKER_IO_MIRROR: $DOCKER_IO_MIRROR"
@@ -465,10 +468,14 @@ EOF
)
fi
elif [ "$PROVIDER" == "eks" ]; then
export KUBE_SSH_USER=ec2-user
hack::ensure_aws_k8s_tester
if [ -n "$AWS_REGION" ]; then
aws configure set default.region "$AWS_REGION"
fi
if [ -z "$AWS_ZONE" ]; then
AWS_ZONE=${AWS_REGION}a
fi
if [ -n "$AWS_ACCESS_KEY_ID" ]; then
aws configure set aws_access_key_id "$AWS_ACCESS_KEY_ID"
fi
@@ -505,6 +512,10 @@ export GCP_PROJECT
export GCP_REGION
export GCP_ZONE
export GCP_CREDENTIALS
export AWS_ACCESS_KEY_ID
export AWS_SECRET_ACCESS_KEY
export AWS_REGION
export AWS_ZONE
export IMAGE_TAG
export SKIP_GINKGO
export SKIP_IMAGE_LOAD
86 changes: 24 additions & 62 deletions hack/run-e2e.sh
@@ -29,6 +29,7 @@ GCP_REGION=${GCP_REGION:-}
GCP_ZONE=${GCP_ZONE:-}
GCP_CREDENTIALS=${GCP_CREDENTIALS:-}
GCP_SDK=${GCP_SDK:-/google-cloud-sdk}
KUBE_SSH_USER=${KUBE_SSH_USER:-vagrant}
IMAGE_TAG=${IMAGE_TAG:-}
SKIP_IMAGE_LOAD=${SKIP_IMAGE_LOAD:-}
TIDB_OPERATOR_IMAGE=${TIDB_OPERATOR_IMAGE:-localhost:5000/pingcap/tidb-operator:latest}
@@ -51,6 +52,7 @@ if [ -z "$KUBECONFIG" ]; then
exit 1
fi

echo "KUBE_SSH_USER: $KUBE_SSH_USER"
echo "TIDB_OPERATOR_IMAGE: $TIDB_OPERATOR_IMAGE"
echo "TIDB_BACKUP_MANAGER_IMAGE: $TIDB_BACKUP_MANAGER_IMAGE"
echo "E2E_IMAGE: $E2E_IMAGE"
@@ -123,67 +125,15 @@ for ((i = 1; i <= 32; i++)) {
EOF
done
elif [ "$PROVIDER" == "gke" ]; then
# disks are created under /mnt/stateful_partition directory
# https://cloud.google.com/container-optimized-os/docs/concepts/disks-and-filesystem
for n in $($KUBECTL_BIN --context "$KUBECONTEXT" get nodes -ojsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}'); do
gcloud compute ssh e2e@$n --command 'sudo bash -c '"'"'
test -d /mnt/stateful_partition/disks || mkdir -p /mnt/stateful_partition/disks
df -h /mnt/stateful_partition/disks
test -d /mnt/disks || mkdir -p /mnt/disks
cd /mnt/disks
for ((i = 1; i <= 32; i++)) {
if [ ! -d vol$i ]; then
mkdir vol$i
fi
if ! mountpoint vol$i &>/dev/null; then
if [ ! -d /mnt/stateful_partition/disks/vol$i ]; then
mkdir /mnt/stateful_partition/disks/vol$i
fi
mount --bind /mnt/stateful_partition/disks/vol$i vol$i
fi
}
'"'"
done
echo "info: provider is $PROVIDER, skipped"
elif [ "$PROVIDER" == "eks" ]; then
while IFS=$'\n' read -r line; do
read -r id dns <<< $line
echo "info: prepare disks on $dns"
ssh -T -o "StrictHostKeyChecking no" -i ~/.ssh/kube_aws_rsa ec2-user@$dns <<'EOF'
sudo bash -c '
test -d /mnt/disks || mkdir -p /mnt/disks
df -h /mnt/disks
if mountpoint /mnt/disks &>/dev/null; then
echo "info: /mnt/disks is a mountpoint"
else
echo "info: /mnt/disks is not a mountpoint, creating local volumes on the rootfs"
fi
cd /mnt/disks
for ((i = 1; i <= 32; i++)) {
if [ ! -d vol$i ]; then
mkdir vol$i
fi
if ! mountpoint vol$i &>/dev/null; then
mount --bind vol$i vol$i
fi
}
echo "info: increase max open files for containers"
if ! grep -qF "OPTIONS" /etc/sysconfig/docker; then
echo 'OPTIONS="--default-ulimit nofile=1024000:1024000"' >> /etc/sysconfig/docker
fi
systemctl restart docker
'
EOF
done <<< "$(e2e::__eks_instances)"
echo "info: provider is $PROVIDER, skipped"
fi
echo "info: installing local-volume-provisioner"
$KUBECTL_BIN --context $KUBECONTEXT apply -f ${ROOT}/manifests/local-dind/local-volume-provisioner.yaml
e2e::__wait_for_ds kube-system local-volume-provisioner
}

function e2e::__eks_instances() {
aws ec2 describe-instances --filter Name=tag:eks:cluster-name,Values=$CLUSTER --query 'Reservations[*].Instances[*].{InstanceId:InstanceId,PublicDnsName:PublicDnsName}' --output text
}

function e2e::__ecr_url() {
local account_id=$(aws sts get-caller-identity --output text | awk '{print $1}')
local region=$(aws configure get region)
@@ -211,6 +161,13 @@ function e2e::setup_helm_server() {
$HELM_BIN version
}

# Used by non-kind providers to tag the image with its ID. This forces our e2e
# process to pull the correct image even if IfNotPresent is used in an existing
# environment, e.g. when testing in the same cluster.
function e2e::image_id_tag() {
docker image inspect -f '{{.Id}}' "$1" | cut -d ':' -f 2 | head -c 10
}

function e2e::image_load() {
local images=(
$TIDB_OPERATOR_IMAGE
@@ -226,9 +183,9 @@ function e2e::image_load() {
elif [ "$PROVIDER" == "gke" ]; then
unset DOCKER_CONFIG # We don't need this and it may be read-only and cause the command to fail
gcloud auth configure-docker
GCP_TIDB_OPERATOR_IMAGE=gcr.io/$GCP_PROJECT/tidb-operator:$CLUSTER-$IMAGE_TAG
GCP_TIDB_BACKUP_MANAGER_IMAGE=gcr.io/$GCP_PROJECT/tidb-backup-image:$CLUSTER-$IMAGE_TAG
GCP_E2E_IMAGE=gcr.io/$GCP_PROJECT/tidb-operator-e2e:$CLUSTER-$IMAGE_TAG
GCP_TIDB_OPERATOR_IMAGE=gcr.io/$GCP_PROJECT/tidb-operator:$CLUSTER-$(e2e::image_id_tag $TIDB_OPERATOR_IMAGE)
GCP_TIDB_BACKUP_MANAGER_IMAGE=gcr.io/$GCP_PROJECT/tidb-backup-image:$CLUSTER-$(e2e::image_id_tag $TIDB_BACKUP_MANAGER_IMAGE)
GCP_E2E_IMAGE=gcr.io/$GCP_PROJECT/tidb-operator-e2e:$CLUSTER-$(e2e::image_id_tag $E2E_IMAGE)
docker tag $TIDB_OPERATOR_IMAGE $GCP_TIDB_OPERATOR_IMAGE
docker tag $E2E_IMAGE $GCP_E2E_IMAGE
docker tag $TIDB_BACKUP_MANAGER_IMAGE $GCP_TIDB_BACKUP_MANAGER_IMAGE
@@ -253,9 +210,9 @@ function e2e::image_load() {
local ecrURL=$(e2e::__ecr_url)
echo "info: logging in $ecrURL"
aws ecr get-login-password | docker login --username AWS --password-stdin $ecrURL
AWS_TIDB_OPERATOR_IMAGE=$ecrURL/e2e/tidb-operator:$CLUSTER-$IMAGE_TAG
AWS_TIDB_BACKUP_MANAGER_IMAGE=$ecrURL/e2e/tidb-backup-manager:$CLUSTER-$IMAGE_TAG
AWS_E2E_IMAGE=$ecrURL/e2e/tidb-operator-e2e:$CLUSTER-$IMAGE_TAG
AWS_TIDB_OPERATOR_IMAGE=$ecrURL/e2e/tidb-operator:$CLUSTER-$(e2e::image_id_tag $TIDB_OPERATOR_IMAGE)
AWS_TIDB_BACKUP_MANAGER_IMAGE=$ecrURL/e2e/tidb-backup-manager:$CLUSTER-$(e2e::image_id_tag $TIDB_BACKUP_MANAGER_IMAGE)
AWS_E2E_IMAGE=$ecrURL/e2e/tidb-operator-e2e:$CLUSTER-$(e2e::image_id_tag $E2E_IMAGE)
docker tag $TIDB_OPERATOR_IMAGE $AWS_TIDB_OPERATOR_IMAGE
docker tag $TIDB_BACKUP_MANAGER_IMAGE $AWS_TIDB_BACKUP_MANAGER_IMAGE
docker tag $E2E_IMAGE $AWS_E2E_IMAGE
@@ -363,16 +320,19 @@ docker_args=(
-v $KUBECONFIG:/etc/kubernetes/admin.conf:ro
--env KUBECONFIG=/etc/kubernetes/admin.conf
--env KUBECONTEXT=$KUBECONTEXT
--env KUBE_SSH_USER=$KUBE_SSH_USER
)

if [ "$PROVIDER" == "eks" ]; then
e2e_args+=(
--provider=aws
--gce-zone="${AWS_REGION}"
--gce-zone="${AWS_ZONE}" # reuse gce-zone to configure aws zone
)
# aws credential is required to get token for EKS
docker_args+=(
# aws credential is required to get token for EKS
-v $HOME/.aws:/root/.aws
# ~/.ssh/kube_aws_rsa must be mounted into e2e container to run ssh
-v $HOME/.ssh/kube_aws_rsa:/root/.ssh/kube_aws_rsa
)
elif [ "$PROVIDER" == "gke" ]; then
e2e_args+=(
@@ -393,6 +353,8 @@ elif [ "$PROVIDER" == "gke" ]; then
fi
docker_args+=(
-v ${GCP_SDK}:/google-cloud-sdk
# ~/.ssh/google_compute_engine must be mounted into e2e container to run ssh
-v $HOME/.ssh/google_compute_engine:/root/.ssh/google_compute_engine
)
else
e2e_args+=(
1 change: 1 addition & 0 deletions manifests/gke/local-ssd-provision/local-ssd-provision.yaml
@@ -12,6 +12,7 @@ metadata:
name: local-provisioner-config
namespace: kube-system
data:
setPVOwnerRef: "true"
nodeLabelsForPV: |
- kubernetes.io/hostname
storageClassMap: |
1 change: 1 addition & 0 deletions manifests/local-dind/local-volume-provisioner.yaml
@@ -12,6 +12,7 @@ metadata:
name: local-provisioner-config
namespace: kube-system
data:
setPVOwnerRef: "true"
nodeLabelsForPV: |
- kubernetes.io/hostname
storageClassMap: |
36 changes: 36 additions & 0 deletions pkg/pdapi/pdapi.go
@@ -155,6 +155,8 @@ type PDClient interface {
SetStoreLabels(storeID uint64, labels map[string]string) (bool, error)
// DeleteStore deletes a TiKV store from cluster
DeleteStore(storeID uint64) error
// SetStoreState sets store to specified state.
SetStoreState(storeID uint64, state string) error
// DeleteMember deletes a PD member from cluster
DeleteMember(name string) error
// DeleteMemberByID deletes a PD member from cluster
@@ -403,6 +405,30 @@ func (pc *pdClient) DeleteStore(storeID uint64) error {
return fmt.Errorf("failed to delete store %d: %v", storeID, string(body))
}

// SetStoreState sets store to specified state.
func (pc *pdClient) SetStoreState(storeID uint64, state string) error {
apiURL := fmt.Sprintf("%s/%s/%d/state?state=%s", pc.url, storePrefix, storeID, state)
req, err := http.NewRequest("POST", apiURL, nil)
if err != nil {
return err
}
res, err := pc.httpClient.Do(req)
if err != nil {
return err
}
defer httputil.DeferClose(res.Body)

if res.StatusCode == http.StatusOK || res.StatusCode == http.StatusNotFound {
return nil
}
body, err := ioutil.ReadAll(res.Body)
if err != nil {
return err
}

return fmt.Errorf("failed to set store %d state to %s: %v", storeID, state, string(body))
}

func (pc *pdClient) DeleteMemberByID(memberID uint64) error {
var exist bool
members, err := pc.GetMembers()
@@ -666,6 +692,7 @@ const (
GetTombStoneStoresActionType ActionType = "GetTombStoneStores"
GetStoreActionType ActionType = "GetStore"
DeleteStoreActionType ActionType = "DeleteStore"
SetStoreStateActionType ActionType = "SetStoreState"
DeleteMemberByIDActionType ActionType = "DeleteMemberByID"
DeleteMemberActionType ActionType = "DeleteMember "
SetStoreLabelsActionType ActionType = "SetStoreLabels"
@@ -790,6 +817,15 @@ func (pc *FakePDClient) DeleteStore(id uint64) error {
return nil
}

func (pc *FakePDClient) SetStoreState(id uint64, state string) error {
if reaction, ok := pc.reactions[SetStoreStateActionType]; ok {
action := &Action{ID: id}
_, err := reaction(action)
return err
}
return nil
}

func (pc *FakePDClient) DeleteMemberByID(id uint64) error {
if reaction, ok := pc.reactions[DeleteMemberByIDActionType]; ok {
action := &Action{ID: id}
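As a rough illustration of how the new API could be exercised by the node-deletion recovery test, here is a minimal Go sketch. The helper name, the "Up" target state, and the way the pdapi.PDClient is obtained are assumptions for illustration only, not code from this PR.

package example

import (
	"fmt"

	"github.com/pingcap/tidb-operator/pkg/pdapi"
)

// recoverStore is a hypothetical helper: after the node hosting a TiKV
// store has been deleted, ask PD to move the store back to the "Up"
// state rather than letting it stay Offline or drift to Tombstone.
func recoverStore(pdClient pdapi.PDClient, storeID uint64) error {
	if err := pdClient.SetStoreState(storeID, "Up"); err != nil {
		return fmt.Errorf("failed to set store %d state to Up: %v", storeID, err)
	}
	return nil
}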
5 changes: 4 additions & 1 deletion tests/actions.go
@@ -294,6 +294,7 @@ type OperatorConfig struct {
ValidatingEnabled bool
Cabundle string
BackupImage string
AutoFailover *bool
}

type TidbClusterConfig struct {
@@ -408,7 +409,6 @@ func (oi *OperatorConfig) OperatorHelmSetString(m map[string]string) string {
set := map[string]string{
"operatorImage": oi.Image,
"tidbBackupManagerImage": oi.BackupImage,
"controllerManager.autoFailover": "true",
"scheduler.logLevel": "4",
"testMode": strconv.FormatBool(oi.TestMode),
"admissionWebhook.cabundle": oi.Cabundle,
@@ -442,6 +442,9 @@ func (oi *OperatorConfig) OperatorHelmSetString(m map[string]string) string {
if oi.Enabled(features.AdvancedStatefulSet) {
set["advancedStatefulset.create"] = "true"
}
if oi.AutoFailover != nil {
set["controllerManager.autoFailover"] = strconv.FormatBool(*oi.AutoFailover)
}

arr := make([]string, 0, len(set))
for k, v := range set {
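To show what the new nil-able AutoFailover knob buys, a small hypothetical Go snippet follows: leaving AutoFailover unset keeps the chart default, while setting it explicitly lets a recovery test disable auto-failover and drive the failover/recovery steps itself. The helper names are illustrative, not part of this change.

package example

import (
	"github.com/pingcap/tidb-operator/tests"
)

// withAutoFailoverDisabled returns a copy of the operator config with
// controllerManager.autoFailover forced to "false"; when AutoFailover is
// left nil, OperatorHelmSetString omits the value and the chart default
// applies.
func withAutoFailoverDisabled(base tests.OperatorConfig) tests.OperatorConfig {
	disabled := false
	base.AutoFailover = &disabled
	return base
}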
9 changes: 9 additions & 0 deletions tests/e2e/e2e.go
@@ -34,6 +34,7 @@ import (
"github.com/pingcap/tidb-operator/tests"
e2econfig "github.com/pingcap/tidb-operator/tests/e2e/config"
utilimage "github.com/pingcap/tidb-operator/tests/e2e/util/image"
utilnode "github.com/pingcap/tidb-operator/tests/e2e/util/node"
v1 "k8s.io/api/core/v1"
storagev1 "k8s.io/api/storage/v1"
apiextensionsclientset "k8s.io/apiextensions-apiserver/pkg/client/clientset/clientset"
@@ -122,6 +123,14 @@ func setupSuite() {
e2elog.Logf("WARNING: Waiting for all daemonsets to be ready failed: %v", err)
}

ginkgo.By("Initializing all nodes")
nodeList, err := c.CoreV1().Nodes().List(metav1.ListOptions{})
framework.ExpectNoError(err)
for _, node := range nodeList.Items {
framework.Logf("Initializing node %q", node.Name)
framework.ExpectNoError(utilnode.InitNode(&node))
}

// If the default storage class in GKE/EKS (aws) is used, network-attached
// storage volumes will be created and we must clean them up later.
// We set the local-storage class as default for simplicity.