
Add e2e test for upgrading from 1.0.x #2145

Merged · 11 commits · Apr 10, 2020
Changes from 5 commits
2 changes: 1 addition & 1 deletion tests/actions.go
@@ -505,7 +505,7 @@ func (oa *operatorActions) InstallCRDOrDie(info *OperatorConfig) {
func (oa *operatorActions) DeployOperator(info *OperatorConfig) error {
klog.Infof("deploying tidb-operator %s", info.ReleaseName)

if info.Tag != "e2e" {
if info.Tag != "e2e" && !strings.HasPrefix(info.Tag, "v1.0.") {
if err := oa.cloneOperatorRepo(); err != nil {
return err
}
118 changes: 118 additions & 0 deletions tests/e2e/tidbcluster/serial.go
@@ -18,6 +18,7 @@ import (
"encoding/json"
"fmt"
_ "net/http/pprof"
"os"
"strconv"
"time"

@@ -1008,4 +1009,121 @@ var _ = ginkgo.Describe("[tidb-operator][Serial]", func() {
klog.Info("success to check auto scale-in tidb to 2 replicas")
})
})

ginkgo.Context("[Verify: Upgrading Operator from 1.0.6", func() {
var oa tests.OperatorActions
var ocfg *tests.OperatorConfig
var version string

ginkgo.BeforeEach(func() {
version = os.Getenv("RELEASED_VERSION")
if version == "" {
// fall back to the default released version when RELEASED_VERSION is unset
version = "v1.0.6"
}
ocfg = &tests.OperatorConfig{
Namespace: ns,
ReleaseName: "operator",
Tag: version,
Image: fmt.Sprintf("pingcap/tidb-operator:%s", version),
}
oa = tests.NewOperatorActions(cli, c, asCli, aggrCli, apiExtCli, tests.DefaultPollInterval, ocfg, e2econfig.TestConfig, nil, fw, f)
ginkgo.By("Installing CRDs")
oa.CleanCRDOrDie()
tests.DeployReleasedCRDOrDie(version)
ginkgo.By("Installing tidb-operator")
oa.CleanOperatorOrDie(ocfg)
oa.DeployOperatorOrDie(ocfg)
})

ginkgo.AfterEach(func() {
ginkgo.By("Uninstall tidb-operator")
oa.CleanOperatorOrDie(ocfg)
ginkgo.By("Uninstalling CRDs")
tests.CleanReleasedCRDOrDie(version)
})

ginkgo.It("Deploy TidbCluster and Upgrade Operator", func() {
tcName := "tidbcluster"
cluster := newTidbClusterConfig(e2econfig.TestConfig, ns, tcName, "", "")
cluster.Resources["pd.replicas"] = "3"
cluster.Resources["tikv.replicas"] = "3"
cluster.Resources["tidb.replicas"] = "2"
cluster.Monitor = false
cluster.OperatorTag = version
oa.DeployTidbClusterOrDie(&cluster)
oa.CheckTidbClusterStatusOrDie(&cluster)

getPods := func(ls string) ([]v1.Pod, error) {
listOptions := metav1.ListOptions{
LabelSelector: ls,
}
podList, err := c.CoreV1().Pods(ns).List(listOptions)
if err != nil {
return nil, err
}
return podList.Items, nil
}

tc, err := cli.PingcapV1alpha1().TidbClusters(ns).Get(tcName, metav1.GetOptions{})
framework.ExpectNoError(err, "failed to get tc")

pdPods, err := getPods(labels.SelectorFromSet(label.New().Instance(tcName).PD().Labels()).String())
framework.ExpectNoError(err, "failed to get pd pods")

tikvPods, err := getPods(labels.SelectorFromSet(label.New().Instance(tcName).TiKV().Labels()).String())
framework.ExpectNoError(err, "failed to get tikv pods")

tidbPods, err := getPods(labels.SelectorFromSet(label.New().Instance(tcName).TiDB().Labels()).String())
framework.ExpectNoError(err, "failed to get tidb pods")

// Upgrade CRD / Operator to current version
ocfg.Tag = cfg.OperatorTag
ocfg.Image = cfg.OperatorImage
oa.InstallCRDOrDie(ocfg)
oa.UpgradeOperatorOrDie(ocfg)
err = wait.Poll(5*time.Second, 10*time.Minute, func() (done bool, err error) {

newTc, err := cli.PingcapV1alpha1().TidbClusters(ns).Get(tcName, metav1.GetOptions{})
if err != nil {
return false, nil
}
// wait for tidb to be updated
if tc.Status.TiDB.StatefulSet.CurrentRevision == newTc.Status.TiDB.StatefulSet.CurrentRevision {
return false, nil
}
// wait for tidb to finish updating
if newTc.Status.TiDB.StatefulSet.CurrentRevision != newTc.Status.TiDB.StatefulSet.UpdateRevision {
return false, nil
}

// confirm the tidb pods have been changed
changed, err := utilpod.PodsAreChanged(c, tidbPods)()
if err != nil {
return false, nil
}
if !changed {
return false, fmt.Errorf("tidb should be updated after operator upgrading")
}

// confirm the pd pods haven't been changed
changed, err = utilpod.PodsAreChanged(c, pdPods)()
if err != nil {
return false, nil
}
if changed {
return false, fmt.Errorf("pd replicas has changed after upgrading operator")
}

// confirm the tikv pods haven't been changed
changed, err = utilpod.PodsAreChanged(c, tikvPods)()
if err != nil {
return false, nil
}
if changed {
return false, fmt.Errorf("tikv pods have been changed after upgrading operator")
}

return true, nil
})
framework.ExpectNoError(err, "Failed to check TidbCluster Status After Upgrading Operator")
Contributor:
we should wait and verify that the pd/tikv pods are not changed for a certain amount of time; the expected err should be wait.ErrWaitTimeout, for example:

err = wait.PollImmediate(time.Second*30, time.Minute*5, func() (bool, error) {
var ok bool
var err error
framework.Logf("check whether pods of cluster %q are changed", clusterName)
ok, err = utilpod.PodsAreChanged(c, podList.Items)()
if ok || err != nil {
// pod changed or some error happened
return true, err
}
framework.Logf("check whether pods of cluster %q are running", clusterName)
newPodList, err := c.CoreV1().Pods(ns).List(listOptions)
if err != nil {
return false, err
}
for _, pod := range newPodList.Items {
if pod.Status.Phase != v1.PodRunning {
return false, fmt.Errorf("pod %s/%s is not running", pod.Namespace, pod.Name)
}
}
framework.Logf("check whehter tidb cluster %q is connectable", clusterName)
ok, err = utiltidb.TiDBIsConnectable(fw, ns, clusterName, "root", "")()
if !ok || err != nil {
// not connectable or some error happened
return true, err
}
return false, nil
})
framework.ExpectEqual(err, wait.ErrWaitTimeout, "TiDB cluster is not affected")

10*time.Minute can be changed to 5*time.Minute; 5 minutes is enough to make sure our logic is correct if the pd/tikv pods are not affected within that time.

before checking that the pd/tikv pods are not affected, you can wait for the tidb pods to be affected. In that check, what you expect is a nil err (return true, nil when the tidb pods are different).

Contributor Author (@Yisaer), Apr 9, 2020:
I think the current logic is like what you said. After upgrading the Operator, we first expect the tidb pods to be updated, then we expect that the tikv and pd pods haven't been changed. The whole process may need 5 or 10 minutes.

Contributor:

you didn't check that the pd/tikv pods are not affected for a certain amount of time. After the tidb pods are changed, the wait function checks the pd/tikv pods once and returns true, nil if they are not changed. What we expect is that they stay unchanged for a certain time (e.g. 5 minutes).

Contributor:

you can have two wait.Poll functions (a sketch follows below):

  1. the first waits for the tidb pods to be changed
  2. the second waits for the pd/tikv pods to stay unchanged for 5 minutes
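
A minimal sketch of that two-step structure, reusing helpers already present in this test (utilpod.PodsAreChanged and the tidbPods/pdPods/tikvPods slices captured above); the intervals and failure messages are illustrative assumptions, not the code that was finally merged:

// First, wait for the tidb pods to be rolled by the upgraded operator; a nil error is expected.
err = wait.Poll(5*time.Second, 10*time.Minute, func() (bool, error) {
	changed, err := utilpod.PodsAreChanged(c, tidbPods)()
	if err != nil {
		// transient error, keep polling
		return false, nil
	}
	return changed, nil
})
framework.ExpectNoError(err, "tidb pods should be recreated after upgrading tidb-operator")

// Then make sure the pd/tikv pods stay untouched for the whole window; wait.ErrWaitTimeout is expected.
err = wait.PollImmediate(30*time.Second, 5*time.Minute, func() (bool, error) {
	if changed, err := utilpod.PodsAreChanged(c, pdPods)(); changed || err != nil {
		// pd pods changed or a real error happened, stop waiting
		return true, err
	}
	if changed, err := utilpod.PodsAreChanged(c, tikvPods)(); changed || err != nil {
		// tikv pods changed or a real error happened, stop waiting
		return true, err
	}
	// still unchanged, keep polling until the timeout fires
	return false, nil
})
framework.ExpectEqual(err, wait.ErrWaitTimeout, "pd/tikv pods should not be affected by the operator upgrade")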

Contributor Author (@Yisaer):

good idea, updated.

})
})
})
17 changes: 17 additions & 0 deletions tests/images/e2e/Dockerfile
@@ -2,6 +2,7 @@ FROM debian:buster-slim

ENV KUBECTL_VERSION=v1.12.2
ENV HELM_VERSION=v2.9.1
ENV RELEASED_VERSION=v1.0.6

RUN apt-get update && \
apt-get install -y ca-certificates curl git openssl default-mysql-client unzip
@@ -29,5 +30,21 @@ ADD bin/webhook /usr/local/bin/
ADD bin/blockwriter /usr/local/bin/
ADD bin/apiserver /usr/local/bin/

RUN mkdir /charts/${RELEASED_VERSION}

RUN curl -L https://github.com/pingcap/tidb-operator/releases/download/${RELEASED_VERSION}/tidb-operator-chart-${RELEASED_VERSION}.tgz \
-o tidb-operator-chart-${RELEASED_VERSION}.tgz && \
tar -zxvf tidb-operator-chart-${RELEASED_VERSION}.tgz && \
mv tidb-operator /charts/${RELEASED_VERSION}/tidb-operator && \
rm -rf tidb-operator && \
rm tidb-operator-chart-${RELEASED_VERSION}.tgz

RUN curl -L https://github.com/pingcap/tidb-operator/releases/download/${RELEASED_VERSION}/tidb-cluster-chart-${RELEASED_VERSION}.tgz \
-o tidb-cluster-chart-${RELEASED_VERSION}.tgz && \
tar -zxvf tidb-cluster-chart-${RELEASED_VERSION}.tgz && \
mv tidb-cluster /charts/${RELEASED_VERSION}/tidb-cluster && \
rm -rf tidb-cluster && \
rm tidb-cluster-chart-${RELEASED_VERSION}.tgz
Contributor:
download these charts in the "Upgrading Operator from 1.0.6" test instead of in the image

Contributor Author (@Yisaer):
updated, using git clone now.
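
For illustration, a minimal sketch of what a git-clone based helper could look like; the function name, destination path, and in-repo chart layout are assumptions of this sketch, not the PR's actual implementation:

// cloneReleasedChartsOrDie fetches the charts of a released tidb-operator version by
// cloning the repository at that tag, instead of baking chart tarballs into the e2e image.
func cloneReleasedChartsOrDie(version string) string {
	dest := fmt.Sprintf("/charts/%s/tidb-operator-repo", version)
	cmd := fmt.Sprintf("git clone --depth 1 --branch %s https://github.com/pingcap/tidb-operator.git %s", version, dest)
	klog.Info(cmd)
	if res, err := exec.Command("/bin/sh", "-c", cmd).CombinedOutput(); err != nil {
		klog.Fatalf("failed to clone released charts: %v, %s", err, string(res))
	}
	// the released charts are assumed to live under <repo>/charts/ in that tag
	return dest
}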


ADD entrypoint.sh /usr/local/bin
ENTRYPOINT ["/usr/local/bin/entrypoint.sh"]
21 changes: 20 additions & 1 deletion tests/util.go
@@ -17,17 +17,18 @@ import (
"bytes"
"fmt"
"math/rand"
"os/exec"
"text/template"
"time"

"github.com/pingcap/tidb-operator/tests/slack"

corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/kubernetes"
"k8s.io/klog"
)

// Keep will keep the function running during the period, otherwise the function returns an error
@@ -299,3 +300,21 @@ func waitForComponentStatus(c kubernetes.Interface, component string, statusType
func IntPtr(i int) *int {
return &i
}

func DeployReleasedCRDOrDie(version string) {
cmd := fmt.Sprintf(`kubectl apply -f https://raw.githubusercontent.com/pingcap/tidb-operator/%s/manifests/crd.yaml`, version)
klog.Info(cmd)
res, err := exec.Command("/bin/sh", "-c", cmd).CombinedOutput()
if err != nil {
klog.Fatalf(fmt.Sprintf("failed to deploy crd: %v, %s", err, string(res)))
}
}

func CleanReleasedCRDOrDie(version string) {
cmd := fmt.Sprintf(`kubectl delete -f https://raw.githubusercontent.com/pingcap/tidb-operator/%s/manifests/crd.yaml`, version)
klog.Info(cmd)
res, err := exec.Command("/bin/sh", "-c", cmd).CombinedOutput()
if err != nil {
klog.Fatalf(fmt.Sprintf("failed to clean crd: %v, %s", err, string(res)))
}
}