From 554fd6c742e40e6c3ea8648a926356cac014ecf3 Mon Sep 17 00:00:00 2001
From: Song Gao <2695690803@qq.com>
Date: Wed, 27 May 2020 13:07:54 +0800
Subject: [PATCH 1/6] fix pd failover

---
 .../tidbcluster/tidb_cluster_controller.go |  2 +-
 pkg/manager/member/pd_failover.go          | 17 +++++++++++++++++
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/pkg/controller/tidbcluster/tidb_cluster_controller.go b/pkg/controller/tidbcluster/tidb_cluster_controller.go
index 3b5d0ec24a..e05817cf89 100644
--- a/pkg/controller/tidbcluster/tidb_cluster_controller.go
+++ b/pkg/controller/tidbcluster/tidb_cluster_controller.go
@@ -107,7 +107,7 @@ func NewController(
 	pdScaler := mm.NewPDScaler(pdControl, pvcInformer.Lister(), pvcControl)
 	tikvScaler := mm.NewTiKVScaler(pdControl, pvcInformer.Lister(), pvcControl, podInformer.Lister())
 	tiflashScaler := mm.NewTiFlashScaler(pdControl, pvcInformer.Lister(), pvcControl, podInformer.Lister())
-	pdFailover := mm.NewPDFailover(cli, pdControl, pdFailoverPeriod, podInformer.Lister(), podControl, pvcInformer.Lister(), pvcControl, pvInformer.Lister(), recorder)
+	pdFailover := mm.NewPDFailover(cli, pdControl, pdFailoverPeriod, podInformer.Lister(), podControl, pvcInformer.Lister(), pvcControl, pvInformer.Lister(), setInformer.Lister(), recorder)
 	tikvFailover := mm.NewTiKVFailover(tikvFailoverPeriod, recorder)
 	tidbFailover := mm.NewTiDBFailover(tidbFailoverPeriod, recorder, podInformer.Lister())
 	tiflashFailover := mm.NewTiFlashFailover(tiflashFailoverPeriod, recorder)
diff --git a/pkg/manager/member/pd_failover.go b/pkg/manager/member/pd_failover.go
index df06f44e87..44953c1921 100644
--- a/pkg/manager/member/pd_failover.go
+++ b/pkg/manager/member/pd_failover.go
@@ -18,6 +18,7 @@ import (
 	"strconv"
 	"time"
 
+	"github.com/pingcap/advanced-statefulset/client/apis/apps/v1/helper"
 	"github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1"
 	"github.com/pingcap/tidb-operator/pkg/client/clientset/versioned"
 	"github.com/pingcap/tidb-operator/pkg/controller"
@@ -26,6 +27,8 @@ import (
 	apiv1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/sets"
+	apps "k8s.io/client-go/listers/apps/v1"
 	corelisters "k8s.io/client-go/listers/core/v1"
 	"k8s.io/client-go/tools/record"
 	"k8s.io/klog"
@@ -40,6 +43,7 @@ type pdFailover struct {
 	pvcLister  corelisters.PersistentVolumeClaimLister
 	pvcControl controller.PVCControlInterface
 	pvLister   corelisters.PersistentVolumeLister
+	setLister  apps.StatefulSetLister
 	recorder   record.EventRecorder
 }
 
@@ -52,6 +56,7 @@ func NewPDFailover(cli versioned.Interface,
 	pvcLister corelisters.PersistentVolumeClaimLister,
 	pvcControl controller.PVCControlInterface,
 	pvLister corelisters.PersistentVolumeLister,
+	setLister apps.StatefulSetLister,
 	recorder record.EventRecorder) Failover {
 	return &pdFailover{
 		cli,
@@ -62,6 +67,7 @@ func NewPDFailover(cli versioned.Interface,
 		pvcLister,
 		pvcControl,
 		pvLister,
+		setLister,
 		recorder}
 }
 
@@ -77,7 +83,18 @@ func (pf *pdFailover) Failover(tc *v1alpha1.TidbCluster) error {
 	}
 
 	healthCount := 0
+	pdsts, err := pf.setLister.StatefulSets(ns).Get(controller.PDMemberName(tcName))
+	if err != nil {
+		return err
+	}
+	podNames := sets.String{}
+	for _, ordinal := range helper.GetPodOrdinals(*pdsts.Spec.Replicas, pdsts).List() {
+		podNames.Insert(util.GetPodName(tc, v1alpha1.PDMemberType, ordinal))
+	}
 	for podName, pdMember := range tc.Status.PD.Members {
+		if !podNames.Has(podName) {
+			continue
+		}
 		if pdMember.Health {
 			healthCount++
 		} else {
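Note on PATCH 1/6: with the advanced StatefulSet, a PD pod removed by scale-in (including via delete-slots) can leave a stale entry in tc.Status.PD.Members, and Failover counted that entry as unhealthy, repeatedly re-triggering failover. The fix restricts the member loop to pods the StatefulSet actually desires, i.e. the ordinals reported by helper.GetPodOrdinals. Below is a minimal, self-contained sketch of that filtering; the "<cluster>-pd-<ordinal>" name format mirrors util.GetPodName, but the delete-slots handling here is an assumption of this sketch, not code from the patch.

    // Illustration only: deriving the set of desired PD pod names.
    package main

    import "fmt"

    // desiredPDPodNames mimics helper.GetPodOrdinals + util.GetPodName: the
    // first `replicas` ordinals not present in the delete-slots set are desired.
    func desiredPDPodNames(tcName string, replicas int32, deleteSlots map[int32]bool) map[string]bool {
        names := map[string]bool{}
        for ordinal, kept := int32(0), int32(0); kept < replicas; ordinal++ {
            if deleteSlots[ordinal] {
                continue // slot was scaled in; no pod is desired at this ordinal
            }
            names[fmt.Sprintf("%s-pd-%d", tcName, ordinal)] = true
            kept++
        }
        return names
    }

    func main() {
        // A stale member record such as "basic-pd-1" is now skipped by Failover.
        fmt.Println(desiredPDPodNames("basic", 3, map[int32]bool{1: true}))
        // map[basic-pd-0:true basic-pd-2:true basic-pd-3:true]
    }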
From 354d97ca4d381255579eb24925039fd3aaeaeaff Mon Sep 17 00:00:00 2001
From: Song Gao <2695690803@qq.com>
Date: Wed, 27 May 2020 13:25:20 +0800
Subject: [PATCH 2/6] fix unit test

---
 pkg/manager/member/pd_failover_test.go | 29 +++++++++++++++++++++++---
 1 file changed, 26 insertions(+), 3 deletions(-)

diff --git a/pkg/manager/member/pd_failover_test.go b/pkg/manager/member/pd_failover_test.go
index 953589830a..0b64b1d1b0 100644
--- a/pkg/manager/member/pd_failover_test.go
+++ b/pkg/manager/member/pd_failover_test.go
@@ -25,6 +25,7 @@ import (
 	"github.com/pingcap/tidb-operator/pkg/client/clientset/versioned/fake"
 	"github.com/pingcap/tidb-operator/pkg/controller"
 	"github.com/pingcap/tidb-operator/pkg/pdapi"
+	apps "k8s.io/api/apps/v1"
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -471,7 +472,7 @@ func TestPDFailoverFailover(t *testing.T) {
 			tc.Spec.PD.MaxFailoverCount = pointer.Int32Ptr(test.maxFailoverCount)
 			test.update(tc)
 
-			pdFailover, pvcIndexer, podIndexer, fakePDControl, fakePodControl, fakePVCControl := newFakePDFailover()
+			pdFailover, pvcIndexer, podIndexer, stsIndexer, fakePDControl, fakePodControl, fakePVCControl := newFakePDFailover()
 			pdClient := controller.NewFakePDClient(fakePDControl, tc)
 			pdFailover.recorder = recorder
 
@@ -482,6 +483,8 @@ func TestPDFailoverFailover(t *testing.T) {
 				return nil, nil
 			})
 
+			stsIndexer.Add(newStsForTc(tc, v1alpha1.PDMemberType))
+
 			if test.hasPVC {
 				pvc := newPVCForPDFailover(tc, v1alpha1.PDMemberType, 1)
 				if test.pvcWithDeletionTimestamp {
@@ -525,7 +528,7 @@ func TestPDFailoverRecovery(t *testing.T) {
 			tc := newTidbClusterForPD()
 			test.update(tc)
 
-			pdFailover, _, _, _, _, _ := newFakePDFailover()
+			pdFailover, _, _, _, _, _, _ := newFakePDFailover()
 			pdFailover.Recover(tc)
 			test.expectFn(tc)
 		}
@@ -614,7 +617,7 @@ func TestPDFailoverRecovery(t *testing.T) {
 	}
 }
 
-func newFakePDFailover() (*pdFailover, cache.Indexer, cache.Indexer, *pdapi.FakePDControl, *controller.FakePodControl, *controller.FakePVCControl) {
+func newFakePDFailover() (*pdFailover, cache.Indexer, cache.Indexer, cache.Indexer, *pdapi.FakePDControl, *controller.FakePodControl, *controller.FakePVCControl) {
 	cli := fake.NewSimpleClientset()
 	kubeCli := kubefake.NewSimpleClientset()
 	pdControl := pdapi.NewFakePDControl(kubeCli)
@@ -622,6 +625,7 @@ func newFakePDFailover() (*pdFailover, cache.Indexer, cache.Indexer, *pdapi.Fake
 	podInformer := kubeInformerFactory.Core().V1().Pods()
 	pvcInformer := kubeInformerFactory.Core().V1().PersistentVolumeClaims()
 	pvInformer := kubeInformerFactory.Core().V1().PersistentVolumes()
+	stsInformer := kubeInformerFactory.Apps().V1().StatefulSets()
 	podControl := controller.NewFakePodControl(podInformer)
 	pvcControl := controller.NewFakePVCControl(pvcInformer)
 
@@ -634,9 +638,11 @@ func newFakePDFailover() (*pdFailover, cache.Indexer, cache.Indexer, *pdapi.Fake
 		pvcInformer.Lister(),
 		pvcControl,
 		pvInformer.Lister(),
+		stsInformer.Lister(),
 		nil},
 		pvcInformer.Informer().GetIndexer(),
 		podInformer.Informer().GetIndexer(),
+		stsInformer.Informer().GetIndexer(),
 		pdControl, podControl, pvcControl
 }
 
@@ -756,3 +762,20 @@ func collectEvents(source <-chan string) []string {
 	}
 	return events
 }
+
+func newStsForTc(tc *v1alpha1.TidbCluster, memberType v1alpha1.MemberType) *apps.StatefulSet {
+	sts := apps.StatefulSet{}
+	sts.Name = fmt.Sprintf("%s-%s", tc.Name, memberType.String())
+	sts.Namespace = tc.Namespace
+	switch memberType {
+	case v1alpha1.TiDBMemberType:
+		sts.Spec.Replicas = &tc.Spec.TiDB.Replicas
+	case v1alpha1.TiKVMemberType:
+		sts.Spec.Replicas = &tc.Spec.TiKV.Replicas
+	case v1alpha1.PDMemberType:
+		sts.Spec.Replicas = &tc.Spec.PD.Replicas
+	case v1alpha1.TiFlashMemberType:
+		sts.Spec.Replicas = &tc.Spec.TiFlash.Replicas
+	}
+	return &sts
+}
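Note on PATCH 2/6: newFakePDFailover now also returns a StatefulSet indexer, and TestPDFailoverFailover seeds a PD StatefulSet through it before exercising Failover (newStsForTc relies on fmt already being imported in the test file). The self-contained sketch below shows the underlying pattern the test depends on — adding an object straight to an informer's indexer so a lister can resolve it without ever starting the informer. All names in it are illustrative, not taken from the repository.

    package main

    import (
        "fmt"

        apps "k8s.io/api/apps/v1"
        metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
        "k8s.io/client-go/informers"
        kubefake "k8s.io/client-go/kubernetes/fake"
        "k8s.io/utils/pointer"
    )

    func main() {
        kubeCli := kubefake.NewSimpleClientset()
        factory := informers.NewSharedInformerFactory(kubeCli, 0)
        stsInformer := factory.Apps().V1().StatefulSets()

        // Adding directly to the indexer makes the object visible to the
        // lister without running the informer — the same trick the test uses.
        sts := &apps.StatefulSet{ObjectMeta: metav1.ObjectMeta{Name: "basic-pd", Namespace: "default"}}
        sts.Spec.Replicas = pointer.Int32Ptr(3)
        stsInformer.Informer().GetIndexer().Add(sts)

        got, err := stsInformer.Lister().StatefulSets("default").Get("basic-pd")
        if err != nil {
            panic(err)
        }
        fmt.Println(got.Name, *got.Spec.Replicas) // basic-pd 3
    }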
From bbed0469b3ffffbb5b9815ebf01c9d4ce4121321 Mon Sep 17 00:00:00 2001
From: Song Gao <2695690803@qq.com>
Date: Wed, 27 May 2020 14:01:48 +0800
Subject: [PATCH 3/6] move into tryToMarkAPeerAsFailure

---
 pkg/manager/member/pd_failover.go | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/pkg/manager/member/pd_failover.go b/pkg/manager/member/pd_failover.go
index 44953c1921..d6f902b2ba 100644
--- a/pkg/manager/member/pd_failover.go
+++ b/pkg/manager/member/pd_failover.go
@@ -83,18 +83,7 @@ func (pf *pdFailover) Failover(tc *v1alpha1.TidbCluster) error {
 	}
 
 	healthCount := 0
-	pdsts, err := pf.setLister.StatefulSets(ns).Get(controller.PDMemberName(tcName))
-	if err != nil {
-		return err
-	}
-	podNames := sets.String{}
-	for _, ordinal := range helper.GetPodOrdinals(*pdsts.Spec.Replicas, pdsts).List() {
-		podNames.Insert(util.GetPodName(tc, v1alpha1.PDMemberType, ordinal))
-	}
 	for podName, pdMember := range tc.Status.PD.Members {
-		if !podNames.Has(podName) {
-			continue
-		}
 		if pdMember.Health {
 			healthCount++
 		} else {
@@ -138,7 +127,20 @@ func (pf *pdFailover) tryToMarkAPeerAsFailure(tc *v1alpha1.TidbCluster) error {
 	ns := tc.GetNamespace()
 	tcName := tc.GetName()
 
+	pdsts, err := pf.setLister.StatefulSets(ns).Get(controller.PDMemberName(tcName))
+	if err != nil {
+		return err
+	}
+	podNames := sets.String{}
+	for _, ordinal := range helper.GetPodOrdinals(*pdsts.Spec.Replicas, pdsts).List() {
+		podNames.Insert(util.GetPodName(tc, v1alpha1.PDMemberType, ordinal))
+	}
+
 	for podName, pdMember := range tc.Status.PD.Members {
+		if !podNames.Has(podName) {
+			continue
+		}
+
 		if pdMember.LastTransitionTime.IsZero() {
 			continue
 		}
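Note on PATCH 3/6: the desired-pod filter moves out of the health-count loop in Failover and into tryToMarkAPeerAsFailure. That keeps the quorum arithmetic honest — it still sees every member PD reports — while only the failure-marking pass ignores members whose pods were scaled away. A runnable toy example of that split, with invented data:

    package main

    import "fmt"

    type member struct {
        health  bool
        desired bool // does the StatefulSet still want this member's pod?
    }

    func main() {
        members := map[string]member{
            "basic-pd-0": {health: true, desired: true},
            "basic-pd-1": {health: true, desired: true},
            "basic-pd-3": {health: false, desired: false}, // stale record of a scaled-in pod
        }

        healthCount := 0
        var failureCandidates []string
        for name, m := range members {
            if m.health {
                healthCount++ // quorum math counts every reported member
            } else if m.desired {
                failureCandidates = append(failureCandidates, name)
            }
        }
        // basic-pd-3 is unhealthy but no longer desired, so it is not marked.
        fmt.Println(healthCount, failureCandidates) // 2 []
    }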
From 0ac5bc83b17be0ac4cfad144c5b7cf292a674976 Mon Sep 17 00:00:00 2001
From: Song Gao <2695690803@qq.com>
Date: Wed, 27 May 2020 14:19:15 +0800
Subject: [PATCH 4/6] use desired Pod function

---
 pkg/manager/member/pd_failover.go      | 31 +++++++++++++-------------
 pkg/manager/member/pd_failover_test.go | 29 +++---------------------
 2 files changed, 18 insertions(+), 42 deletions(-)

diff --git a/pkg/manager/member/pd_failover.go b/pkg/manager/member/pd_failover.go
index d6f902b2ba..48385415d4 100644
--- a/pkg/manager/member/pd_failover.go
+++ b/pkg/manager/member/pd_failover.go
@@ -18,7 +18,6 @@ import (
 	"strconv"
 	"time"
 
-	"github.com/pingcap/advanced-statefulset/client/apis/apps/v1/helper"
 	"github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1"
 	"github.com/pingcap/tidb-operator/pkg/client/clientset/versioned"
 	"github.com/pingcap/tidb-operator/pkg/controller"
@@ -27,7 +26,6 @@ import (
 	apiv1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-	"k8s.io/apimachinery/pkg/util/sets"
 	apps "k8s.io/client-go/listers/apps/v1"
 	corelisters "k8s.io/client-go/listers/core/v1"
 	"k8s.io/client-go/tools/record"
 	"k8s.io/klog"
@@ -43,7 +41,6 @@ type pdFailover struct {
 	pvcLister  corelisters.PersistentVolumeClaimLister
 	pvcControl controller.PVCControlInterface
 	pvLister   corelisters.PersistentVolumeLister
-	setLister  apps.StatefulSetLister
 	recorder   record.EventRecorder
 }
 
@@ -67,7 +64,6 @@ func NewPDFailover(cli versioned.Interface,
 		pvcLister,
 		pvcControl,
 		pvLister,
-		setLister,
 		recorder}
 }
 
@@ -127,21 +123,14 @@ func (pf *pdFailover) tryToMarkAPeerAsFailure(tc *v1alpha1.TidbCluster) error {
 	ns := tc.GetNamespace()
 	tcName := tc.GetName()
 
-	pdsts, err := pf.setLister.StatefulSets(ns).Get(controller.PDMemberName(tcName))
-	if err != nil {
-		return err
-	}
-	podNames := sets.String{}
-	for _, ordinal := range helper.GetPodOrdinals(*pdsts.Spec.Replicas, pdsts).List() {
-		podNames.Insert(util.GetPodName(tc, v1alpha1.PDMemberType, ordinal))
-	}
-
 	for podName, pdMember := range tc.Status.PD.Members {
-		if !podNames.Has(podName) {
+		if pdMember.LastTransitionTime.IsZero() {
 			continue
 		}
-
-		if pdMember.LastTransitionTime.IsZero() {
+		if !pf.isPodDesired(tc, podName) {
+			// we should ignore the store record of deleted pod, otherwise the
+			// record of deleted pod may be added back to failure stores
+			// (before it enters into Offline/Tombstone state)
 			continue
 		}
 
@@ -276,3 +265,13 @@ func (fpf *fakePDFailover) Failover(_ *v1alpha1.TidbCluster) error {
 func (fpf *fakePDFailover) Recover(_ *v1alpha1.TidbCluster) {
 	return
 }
+
+func (pf *pdFailover) isPodDesired(tc *v1alpha1.TidbCluster, podName string) bool {
+	ordinals := tc.PDStsDesiredOrdinals(true)
+	ordinal, err := util.GetOrdinalFromPodName(podName)
+	if err != nil {
+		klog.Errorf("unexpected pod name %q: %v", podName, err)
+		return false
+	}
+	return ordinals.Has(ordinal)
+}
diff --git a/pkg/manager/member/pd_failover_test.go b/pkg/manager/member/pd_failover_test.go
index 0b64b1d1b0..953589830a 100644
--- a/pkg/manager/member/pd_failover_test.go
+++ b/pkg/manager/member/pd_failover_test.go
@@ -25,7 +25,6 @@ import (
 	"github.com/pingcap/tidb-operator/pkg/client/clientset/versioned/fake"
 	"github.com/pingcap/tidb-operator/pkg/controller"
 	"github.com/pingcap/tidb-operator/pkg/pdapi"
-	apps "k8s.io/api/apps/v1"
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -472,7 +471,7 @@ func TestPDFailoverFailover(t *testing.T) {
 			tc.Spec.PD.MaxFailoverCount = pointer.Int32Ptr(test.maxFailoverCount)
 			test.update(tc)
 
-			pdFailover, pvcIndexer, podIndexer, stsIndexer, fakePDControl, fakePodControl, fakePVCControl := newFakePDFailover()
+			pdFailover, pvcIndexer, podIndexer, fakePDControl, fakePodControl, fakePVCControl := newFakePDFailover()
 			pdClient := controller.NewFakePDClient(fakePDControl, tc)
 			pdFailover.recorder = recorder
 
@@ -483,8 +482,6 @@ func TestPDFailoverFailover(t *testing.T) {
 				return nil, nil
 			})
 
-			stsIndexer.Add(newStsForTc(tc, v1alpha1.PDMemberType))
-
 			if test.hasPVC {
 				pvc := newPVCForPDFailover(tc, v1alpha1.PDMemberType, 1)
 				if test.pvcWithDeletionTimestamp {
@@ -528,7 +525,7 @@ func TestPDFailoverRecovery(t *testing.T) {
 			tc := newTidbClusterForPD()
 			test.update(tc)
 
-			pdFailover, _, _, _, _, _, _ := newFakePDFailover()
+			pdFailover, _, _, _, _, _ := newFakePDFailover()
 			pdFailover.Recover(tc)
 			test.expectFn(tc)
 		}
@@ -617,7 +614,7 @@ func TestPDFailoverRecovery(t *testing.T) {
 	}
 }
 
-func newFakePDFailover() (*pdFailover, cache.Indexer, cache.Indexer, cache.Indexer, *pdapi.FakePDControl, *controller.FakePodControl, *controller.FakePVCControl) {
+func newFakePDFailover() (*pdFailover, cache.Indexer, cache.Indexer, *pdapi.FakePDControl, *controller.FakePodControl, *controller.FakePVCControl) {
 	cli := fake.NewSimpleClientset()
 	kubeCli := kubefake.NewSimpleClientset()
 	pdControl := pdapi.NewFakePDControl(kubeCli)
@@ -625,7 +622,6 @@ func newFakePDFailover() (*pdFailover, cache.Indexer, cache.Indexer, *pdapi.Fake
 	podInformer := kubeInformerFactory.Core().V1().Pods()
 	pvcInformer := kubeInformerFactory.Core().V1().PersistentVolumeClaims()
 	pvInformer := kubeInformerFactory.Core().V1().PersistentVolumes()
-	stsInformer := kubeInformerFactory.Apps().V1().StatefulSets()
 	podControl := controller.NewFakePodControl(podInformer)
 	pvcControl := controller.NewFakePVCControl(pvcInformer)
 
@@ -638,11 +634,9 @@ func newFakePDFailover() (*pdFailover, cache.Indexer, cache.Indexer, *pdapi.Fake
 		pvcInformer.Lister(),
 		pvcControl,
 		pvInformer.Lister(),
-		stsInformer.Lister(),
 		nil},
 		pvcInformer.Informer().GetIndexer(),
 		podInformer.Informer().GetIndexer(),
-		stsInformer.Informer().GetIndexer(),
 		pdControl, podControl, pvcControl
 }
 
@@ -762,20 +756,3 @@ func collectEvents(source <-chan string) []string {
 	}
 	return events
 }
-
-func newStsForTc(tc *v1alpha1.TidbCluster, memberType v1alpha1.MemberType) *apps.StatefulSet {
-	sts := apps.StatefulSet{}
-	sts.Name = fmt.Sprintf("%s-%s", tc.Name, memberType.String())
-	sts.Namespace = tc.Namespace
-	switch memberType {
-	case v1alpha1.TiDBMemberType:
-		sts.Spec.Replicas = &tc.Spec.TiDB.Replicas
-	case v1alpha1.TiKVMemberType:
-		sts.Spec.Replicas = &tc.Spec.TiKV.Replicas
-	case v1alpha1.PDMemberType:
-		sts.Spec.Replicas = &tc.Spec.PD.Replicas
-	case v1alpha1.TiFlashMemberType:
-		sts.Spec.Replicas = &tc.Spec.TiFlash.Replicas
-	}
-	return &sts
-}
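Note on PATCH 4/6: rather than fetching the StatefulSet through a lister, the failover now asks the TidbCluster for its desired ordinals (tc.PDStsDesiredOrdinals) and parses the ordinal out of the pod name, so the setLister plumbing and the test scaffolding can both be reverted. Below is a sketch of the behavior assumed of util.GetOrdinalFromPodName — the real helper lives in the operator's util package; this reimplementation is for illustration only:

    package main

    import (
        "fmt"
        "strconv"
        "strings"
    )

    // getOrdinalFromPodName parses the trailing integer of a StatefulSet pod
    // name, e.g. "basic-pd-2" -> 2.
    func getOrdinalFromPodName(podName string) (int32, error) {
        i := strings.LastIndex(podName, "-")
        if i < 0 {
            return 0, fmt.Errorf("pod name %q has no ordinal suffix", podName)
        }
        ord, err := strconv.ParseInt(podName[i+1:], 10, 32)
        if err != nil {
            return 0, fmt.Errorf("pod name %q has no ordinal suffix: %v", podName, err)
        }
        return int32(ord), nil
    }

    func main() {
        fmt.Println(getOrdinalFromPodName("basic-pd-2")) // 2 <nil>
    }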
From efa8599b4f38d3c778c762b844b27030d72c89e0 Mon Sep 17 00:00:00 2001
From: Song Gao <2695690803@qq.com>
Date: Wed, 27 May 2020 14:47:21 +0800
Subject: [PATCH 5/6] fix new func

---
 pkg/controller/tidbcluster/tidb_cluster_controller.go | 2 +-
 pkg/manager/member/pd_failover.go                     | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/pkg/controller/tidbcluster/tidb_cluster_controller.go b/pkg/controller/tidbcluster/tidb_cluster_controller.go
index e05817cf89..3b5d0ec24a 100644
--- a/pkg/controller/tidbcluster/tidb_cluster_controller.go
+++ b/pkg/controller/tidbcluster/tidb_cluster_controller.go
@@ -107,7 +107,7 @@ func NewController(
 	pdScaler := mm.NewPDScaler(pdControl, pvcInformer.Lister(), pvcControl)
 	tikvScaler := mm.NewTiKVScaler(pdControl, pvcInformer.Lister(), pvcControl, podInformer.Lister())
 	tiflashScaler := mm.NewTiFlashScaler(pdControl, pvcInformer.Lister(), pvcControl, podInformer.Lister())
-	pdFailover := mm.NewPDFailover(cli, pdControl, pdFailoverPeriod, podInformer.Lister(), podControl, pvcInformer.Lister(), pvcControl, pvInformer.Lister(), setInformer.Lister(), recorder)
+	pdFailover := mm.NewPDFailover(cli, pdControl, pdFailoverPeriod, podInformer.Lister(), podControl, pvcInformer.Lister(), pvcControl, pvInformer.Lister(), recorder)
 	tikvFailover := mm.NewTiKVFailover(tikvFailoverPeriod, recorder)
 	tidbFailover := mm.NewTiDBFailover(tidbFailoverPeriod, recorder, podInformer.Lister())
 	tiflashFailover := mm.NewTiFlashFailover(tiflashFailoverPeriod, recorder)
diff --git a/pkg/manager/member/pd_failover.go b/pkg/manager/member/pd_failover.go
index 48385415d4..e33501a3fe 100644
--- a/pkg/manager/member/pd_failover.go
+++ b/pkg/manager/member/pd_failover.go
@@ -26,7 +26,6 @@ import (
 	apiv1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-	apps "k8s.io/client-go/listers/apps/v1"
 	corelisters "k8s.io/client-go/listers/core/v1"
 	"k8s.io/client-go/tools/record"
 	"k8s.io/klog"
@@ -53,7 +52,6 @@ func NewPDFailover(cli versioned.Interface,
 	pvcLister corelisters.PersistentVolumeClaimLister,
 	pvcControl controller.PVCControlInterface,
 	pvLister corelisters.PersistentVolumeLister,
-	setLister apps.StatefulSetLister,
 	recorder record.EventRecorder) Failover {
 	return &pdFailover{
 		cli,
From 9fe80c4a8b8d7e8ec2593f9a0c65e957ce700596 Mon Sep 17 00:00:00 2001
From: Song Gao <2695690803@qq.com>
Date: Wed, 27 May 2020 16:18:48 +0800
Subject: [PATCH 6/6] Update pkg/manager/member/pd_failover.go

Co-authored-by: Yecheng Fu
---
 pkg/manager/member/pd_failover.go | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/pkg/manager/member/pd_failover.go b/pkg/manager/member/pd_failover.go
index e33501a3fe..6c7d7349c8 100644
--- a/pkg/manager/member/pd_failover.go
+++ b/pkg/manager/member/pd_failover.go
@@ -126,9 +126,6 @@ func (pf *pdFailover) tryToMarkAPeerAsFailure(tc *v1alpha1.TidbCluster) error {
 			continue
 		}
 		if !pf.isPodDesired(tc, podName) {
-			// we should ignore the store record of deleted pod, otherwise the
-			// record of deleted pod may be added back to failure stores
-			// (before it enters into Offline/Tombstone state)
 			continue
 		}
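Net effect of the series, reconstructed from the hunks above (abbreviated; the failover-deadline check and failure bookkeeping that follow are unchanged):

    // Final shape of the guard in tryToMarkAPeerAsFailure.
    for podName, pdMember := range tc.Status.PD.Members {
        if pdMember.LastTransitionTime.IsZero() {
            continue
        }
        if !pf.isPodDesired(tc, podName) {
            continue // stale record of a pod the StatefulSet no longer desires
        }
        // ... deadline check and failure-member bookkeeping ...
    }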