Commit 2725bfb

add a recovery test on node deletion for aws/eks
cofyc committed Apr 3, 2020
1 parent 9c36ee8 commit 2725bfb
Showing 10 changed files with 460 additions and 45 deletions.
11 changes: 11 additions & 0 deletions hack/e2e.sh
@@ -73,6 +73,7 @@ Environments:
AWS_ACCESS_KEY_ID (eks only) the aws access key id
AWS_SECRET_ACCESS_KEY (eks only) the aws secret access key
AWS_REGION (eks only) the aws region
AWS_ZONE (eks only) the aws zone
GINKGO_NODES ginkgo nodes to run specs, defaults: 1
GINKGO_PARALLEL if set to `y`, will run specs in parallel, the number of nodes will be the number of cpus
GINKGO_NO_COLOR if set to `y`, suppress color output in default reporter
@@ -197,6 +198,7 @@ GCP_MACHINE_TYPE=${GCP_MACHINE_TYPE:-n1-standard-4}
AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID:-}
AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY:-}
AWS_REGION=${AWS_REGION:-}
AWS_ZONE=${AWS_ZONE:-}
KUBE_VERSION=${KUBE_VERSION:-v1.12.10}
KUBE_WORKERS=${KUBE_WORKERS:-3}
DOCKER_IO_MIRROR=${DOCKER_IO_MIRROR:-}
@@ -223,6 +225,7 @@ echo "GCP_ZONE: $GCP_ZONE"
# echo "AWS_ACCESS_KEY_ID: $AWS_ACCESS_KEY_ID"
# echo "AWS_SECRET_ACCESS_KEY: $AWS_SECRET_ACCESS_KEY"
echo "AWS_REGION: $AWS_REGION"
echo "AWS_ZONE: $AWS_ZONE"
echo "KUBE_VERSION: $KUBE_VERSION"
echo "KUBE_WORKERS: $KUBE_WORKERS"
echo "DOCKER_IO_MIRROR: $DOCKER_IO_MIRROR"
@@ -465,10 +468,14 @@ EOF
)
fi
elif [ "$PROVIDER" == "eks" ]; then
export KUBE_SSH_USER=ec2-user
hack::ensure_aws_k8s_tester
if [ -n "$AWS_REGION" ]; then
aws configure set default.region "$AWS_REGION"
fi
if [ -z "$AWS_ZONE" ]; then
AWS_ZONE=${AWS_REGION}a
fi
if [ -n "$AWS_ACCESS_KEY_ID" ]; then
aws configure set aws_access_key_id "$AWS_ACCESS_KEY_ID"
fi
@@ -505,6 +512,10 @@ export GCP_PROJECT
export GCP_REGION
export GCP_ZONE
export GCP_CREDENTIALS
export AWS_ACCESS_KEY_ID
export AWS_SECRET_ACCESS_KEY
export AWS_REGION
export AWS_ZONE
export IMAGE_TAG
export SKIP_GINKGO
export SKIP_IMAGE_LOAD
62 changes: 21 additions & 41 deletions hack/run-e2e.sh
@@ -29,6 +29,7 @@ GCP_REGION=${GCP_REGION:-}
GCP_ZONE=${GCP_ZONE:-}
GCP_CREDENTIALS=${GCP_CREDENTIALS:-}
GCP_SDK=${GCP_SDK:-/google-cloud-sdk}
KUBE_SSH_USER=${KUBE_SSH_USER:-}
IMAGE_TAG=${IMAGE_TAG:-}
SKIP_IMAGE_LOAD=${SKIP_IMAGE_LOAD:-}
TIDB_OPERATOR_IMAGE=${TIDB_OPERATOR_IMAGE:-localhost:5000/pingcap/tidb-operator:latest}
@@ -51,6 +52,7 @@ if [ -z "$KUBECONFIG" ]; then
exit 1
fi

echo "KUBE_SSH_USER: $KUBE_SSH_USER"
echo "TIDB_OPERATOR_IMAGE: $TIDB_OPERATOR_IMAGE"
echo "TIDB_BACKUP_MANAGER_IMAGE: $TIDB_BACKUP_MANAGER_IMAGE"
echo "E2E_IMAGE: $E2E_IMAGE"
@@ -145,45 +147,13 @@ for ((i = 1; i <= 32; i++)) {
'"'"
done
elif [ "$PROVIDER" == "eks" ]; then
while IFS=$'\n' read -r line; do
read -r id dns <<< $line
echo "info: prepare disks on $dns"
ssh -T -o "StrictHostKeyChecking no" -i ~/.ssh/kube_aws_rsa ec2-user@$dns <<'EOF'
sudo bash -c '
test -d /mnt/disks || mkdir -p /mnt/disks
df -h /mnt/disks
if mountpoint /mnt/disks &>/dev/null; then
echo "info: /mnt/disks is a mountpoint"
else
echo "info: /mnt/disks is not a mountpoint, creating local volumes on the rootfs"
fi
cd /mnt/disks
for ((i = 1; i <= 32; i++)) {
if [ ! -d vol$i ]; then
mkdir vol$i
fi
if ! mountpoint vol$i &>/dev/null; then
mount --bind vol$i vol$i
fi
}
echo "info: increase max open files for containers"
if ! grep -qF "OPTIONS" /etc/sysconfig/docker; then
echo 'OPTIONS="--default-ulimit nofile=1024000:1024000"' >> /etc/sysconfig/docker
fi
systemctl restart docker
'
EOF
done <<< "$(e2e::__eks_instances)"
echo "info: provider is $PROVIDER, skipped"
fi
echo "info: installing local-volume-provisioner"
$KUBECTL_BIN --context $KUBECONTEXT apply -f ${ROOT}/manifests/local-dind/local-volume-provisioner.yaml
e2e::__wait_for_ds kube-system local-volume-provisioner
}

function e2e::__eks_instances() {
aws ec2 describe-instances --filter Name=tag:eks:cluster-name,Values=$CLUSTER --query 'Reservations[*].Instances[*].{InstanceId:InstanceId,PublicDnsName:PublicDnsName}' --output text
}

function e2e::__ecr_url() {
local account_id=$(aws sts get-caller-identity --output text | awk '{print $1}')
local region=$(aws configure get region)
@@ -211,6 +181,13 @@ function e2e::setup_helm_server() {
$HELM_BIN version
}

# Used by non-kind providers to tag the image with its id. This forces our e2e
# process to pull the correct image even if IfNotPresent is used in an existing
# environment, e.g. when testing in the same cluster.
function e2e::image_id_tag() {
docker image inspect -f '{{.Id}}' "$1" | cut -d ':' -f 2 | head -c 10
}

function e2e::image_load() {
local images=(
$TIDB_OPERATOR_IMAGE
@@ -226,9 +203,9 @@ function e2e::image_load() {
elif [ "$PROVIDER" == "gke" ]; then
unset DOCKER_CONFIG # We don't need this and it may be read-only, which would cause the command to fail
gcloud auth configure-docker
GCP_TIDB_OPERATOR_IMAGE=gcr.io/$GCP_PROJECT/tidb-operator:$CLUSTER-$IMAGE_TAG
GCP_TIDB_BACKUP_MANAGER_IMAGE=gcr.io/$GCP_PROJECT/tidb-backup-image:$CLUSTER-$IMAGE_TAG
GCP_E2E_IMAGE=gcr.io/$GCP_PROJECT/tidb-operator-e2e:$CLUSTER-$IMAGE_TAG
GCP_TIDB_OPERATOR_IMAGE=gcr.io/$GCP_PROJECT/tidb-operator:$CLUSTER-$(e2e::image_id_tag $TIDB_OPERATOR_IMAGE)
GCP_TIDB_BACKUP_MANAGER_IMAGE=gcr.io/$GCP_PROJECT/tidb-backup-image:$CLUSTER-$(e2e::image_id_tag $TIDB_BACKUP_MANAGER_IMAGE)
GCP_E2E_IMAGE=gcr.io/$GCP_PROJECT/tidb-operator-e2e:$CLUSTER-$(e2e::image_id_tag $E2E_IMAGE)
docker tag $TIDB_OPERATOR_IMAGE $GCP_TIDB_OPERATOR_IMAGE
docker tag $E2E_IMAGE $GCP_E2E_IMAGE
docker tag $TIDB_BACKUP_MANAGER_IMAGE $GCP_TIDB_BACKUP_MANAGER_IMAGE
@@ -253,9 +230,9 @@ function e2e::image_load() {
local ecrURL=$(e2e::__ecr_url)
echo "info: logging in $ecrURL"
aws ecr get-login-password | docker login --username AWS --password-stdin $ecrURL
AWS_TIDB_OPERATOR_IMAGE=$ecrURL/e2e/tidb-operator:$CLUSTER-$IMAGE_TAG
AWS_TIDB_BACKUP_MANAGER_IMAGE=$ecrURL/e2e/tidb-backup-manager:$CLUSTER-$IMAGE_TAG
AWS_E2E_IMAGE=$ecrURL/e2e/tidb-operator-e2e:$CLUSTER-$IMAGE_TAG
AWS_TIDB_OPERATOR_IMAGE=$ecrURL/e2e/tidb-operator:$CLUSTER-$(e2e::image_id_tag $TIDB_OPERATOR_IMAGE)
AWS_TIDB_BACKUP_MANAGER_IMAGE=$ecrURL/e2e/tidb-backup-manager:$CLUSTER-$(e2e::image_id_tag $TIDB_BACKUP_MANAGER_IMAGE)
AWS_E2E_IMAGE=$ecrURL/e2e/tidb-operator-e2e:$CLUSTER-$(e2e::image_id_tag $E2E_IMAGE)
docker tag $TIDB_OPERATOR_IMAGE $AWS_TIDB_OPERATOR_IMAGE
docker tag $TIDB_BACKUP_MANAGER_IMAGE $AWS_TIDB_BACKUP_MANAGER_IMAGE
docker tag $E2E_IMAGE $AWS_E2E_IMAGE
@@ -363,16 +340,19 @@ docker_args=(
-v $KUBECONFIG:/etc/kubernetes/admin.conf:ro
--env KUBECONFIG=/etc/kubernetes/admin.conf
--env KUBECONTEXT=$KUBECONTEXT
--env KUBE_SSH_USER=$KUBE_SSH_USER
)

if [ "$PROVIDER" == "eks" ]; then
e2e_args+=(
--provider=aws
--gce-zone="${AWS_REGION}"
--gce-zone="${AWS_ZONE}" # reuse gce-zone to configure aws zone
)
# aws credential is required to get token for EKS
docker_args+=(
# aws credential is required to get token for EKS
-v $HOME/.aws:/root/.aws
# ~/.ssh/kube_aws_rsa must be mounted into e2e container to run ssh
-v $HOME/.ssh:/root/.ssh
)
elif [ "$PROVIDER" == "gke" ]; then
e2e_args+=(
1 change: 1 addition & 0 deletions manifests/gke/local-ssd-provision/local-ssd-provision.yaml
@@ -12,6 +12,7 @@ metadata:
name: local-provisioner-config
namespace: kube-system
data:
setPVOwnerRef: "true"
nodeLabelsForPV: |
- kubernetes.io/hostname
storageClassMap: |
1 change: 1 addition & 0 deletions manifests/local-dind/local-volume-provisioner.yaml
@@ -12,6 +12,7 @@ metadata:
name: local-provisioner-config
namespace: kube-system
data:
setPVOwnerRef: "true"
nodeLabelsForPV: |
- kubernetes.io/hostname
storageClassMap: |
40 changes: 40 additions & 0 deletions pkg/pdapi/pdapi.go
@@ -40,6 +40,10 @@ import (

const (
DefaultTimeout = 5 * time.Second

StoreStateUp = "Up"
StoreStateOffline = "Offline"
StoreStateTombstone = "Tombstone"
)

// Namespace is a newtype of a string
@@ -155,6 +159,8 @@ type PDClient interface {
SetStoreLabels(storeID uint64, labels map[string]string) (bool, error)
// DeleteStore deletes a TiKV store from cluster
DeleteStore(storeID uint64) error
// SetStoreState sets the store to the specified state.
SetStoreState(storeID uint64, state string) error
// DeleteMember deletes a PD member from cluster
DeleteMember(name string) error
// DeleteMemberByID deletes a PD member from cluster
@@ -403,6 +409,30 @@ func (pc *pdClient) DeleteStore(storeID uint64) error {
return fmt.Errorf("failed to delete store %d: %v", storeID, string(body))
}

// SetStoreState sets the store to the specified state.
func (pc *pdClient) SetStoreState(storeID uint64, state string) error {
apiURL := fmt.Sprintf("%s/%s/%d/state?state=%s", pc.url, storePrefix, storeID, state)
req, err := http.NewRequest("POST", apiURL, nil)
if err != nil {
return err
}
res, err := pc.httpClient.Do(req)
if err != nil {
return err
}
defer httputil.DeferClose(res.Body)

if res.StatusCode == http.StatusOK || res.StatusCode == http.StatusNotFound {
return nil
}
body, err := ioutil.ReadAll(res.Body)
if err != nil {
return err
}

return fmt.Errorf("failed to delete store %d: %v", storeID, string(body))
}

func (pc *pdClient) DeleteMemberByID(memberID uint64) error {
var exist bool
members, err := pc.GetMembers()
@@ -666,6 +696,7 @@ const (
GetTombStoneStoresActionType ActionType = "GetTombStoneStores"
GetStoreActionType ActionType = "GetStore"
DeleteStoreActionType ActionType = "DeleteStore"
SetStoreStateActionType ActionType = "SetStoreState"
DeleteMemberByIDActionType ActionType = "DeleteMemberByID"
DeleteMemberActionType ActionType = "DeleteMember "
SetStoreLabelsActionType ActionType = "SetStoreLabels"
@@ -790,6 +821,15 @@ func (pc *FakePDClient) DeleteStore(id uint64) error {
return nil
}

func (pc *FakePDClient) SetStoreState(id uint64, state string) error {
if reaction, ok := pc.reactions[SetStoreStateActionType]; ok {
action := &Action{ID: id}
_, err := reaction(action)
return err
}
return nil
}

func (pc *FakePDClient) DeleteMemberByID(id uint64) error {
if reaction, ok := pc.reactions[DeleteMemberByIDActionType]; ok {
action := &Action{ID: id}
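
The new SetStoreState method and the store-state constants give tests a way to push a TiKV store through PD's state machine directly. Below is a minimal, hypothetical sketch (not part of this commit) of how a caller that already holds a pdapi.PDClient might mark a store Offline after its backing node has been deleted; the function name and error wrapping are illustrative only.

```go
package example

import (
	"fmt"

	"github.com/pingcap/tidb-operator/pkg/pdapi"
)

// markStoreOffline assumes the caller already obtained a pdapi.PDClient for
// the target cluster and knows the numeric TiKV store ID.
func markStoreOffline(pdClient pdapi.PDClient, storeID uint64) error {
	// Moving the store out of "Up" makes PD schedule its regions away,
	// mimicking what a recovery test needs after the node behind the
	// store has been removed.
	if err := pdClient.SetStoreState(storeID, pdapi.StoreStateOffline); err != nil {
		return fmt.Errorf("failed to set store %d to %s: %v", storeID, pdapi.StoreStateOffline, err)
	}
	return nil
}
```
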
3 changes: 2 additions & 1 deletion tests/actions.go
@@ -294,6 +294,7 @@ type OperatorConfig struct {
ValidatingEnabled bool
Cabundle string
BackupImage string
AutoFailover bool
}

type TidbClusterConfig struct {
@@ -408,7 +409,7 @@ func (oi *OperatorConfig) OperatorHelmSetString(m map[string]string) string {
set := map[string]string{
"operatorImage": oi.Image,
"tidbBackupManagerImage": oi.BackupImage,
"controllerManager.autoFailover": "true",
"controllerManager.autoFailover": strconv.FormatBool(oi.AutoFailover),
"scheduler.logLevel": "4",
"testMode": strconv.FormatBool(oi.TestMode),
"admissionWebhook.cabundle": oi.Cabundle,
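
Since controllerManager.autoFailover now follows the new AutoFailover field instead of being hard-coded to "true", a test can deploy the operator with auto-failover disabled and drive recovery itself. A hypothetical sketch (image names are placeholders, not from this commit):

```go
package example

import "github.com/pingcap/tidb-operator/tests"

// operatorConfigWithoutAutoFailover builds an OperatorConfig whose helm set
// string will contain controllerManager.autoFailover=false.
func operatorConfigWithoutAutoFailover() *tests.OperatorConfig {
	return &tests.OperatorConfig{
		Image:        "pingcap/tidb-operator:latest",       // placeholder image
		BackupImage:  "pingcap/tidb-backup-manager:latest", // placeholder image
		AutoFailover: false,                                 // let the test control failover explicitly
	}
}
```
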
9 changes: 9 additions & 0 deletions tests/e2e/e2e.go
@@ -34,6 +34,7 @@ import (
"github.com/pingcap/tidb-operator/tests"
e2econfig "github.com/pingcap/tidb-operator/tests/e2e/config"
utilimage "github.com/pingcap/tidb-operator/tests/e2e/util/image"
utilnode "github.com/pingcap/tidb-operator/tests/e2e/util/node"
v1 "k8s.io/api/core/v1"
storagev1 "k8s.io/api/storage/v1"
apiextensionsclientset "k8s.io/apiextensions-apiserver/pkg/client/clientset/clientset"
@@ -122,6 +123,14 @@ func setupSuite() {
e2elog.Logf("WARNING: Waiting for all daemonsets to be ready failed: %v", err)
}

ginkgo.By("Initializing all nodes")
nodeList, err := c.CoreV1().Nodes().List(metav1.ListOptions{})
framework.ExpectNoError(err)
for _, node := range nodeList.Items {
framework.Logf("Initializing node %q", node.Name)
framework.ExpectNoError(utilnode.InitNode(&node))
}

// By using the default storage class in GKE/EKS (aws), network attached storage
// will be used and we must clean it up later.
// We set local-storage class as default for simplicity.
(The remaining changed files in this commit are not shown in this view.)
