From 4bb39ade809008855f9baaea9f211ce8d81e32c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=AE=87=E6=85=95?= Date: Thu, 16 May 2019 12:23:45 +0800 Subject: [PATCH] etcdserver: add learner metrics --- docs/metrics/latest | 15 +++++++++++++++ etcdserver/metrics.go | 23 +++++++++++++++++++++++ etcdserver/server.go | 14 ++++++++++++++ 3 files changed, 52 insertions(+) diff --git a/docs/metrics/latest b/docs/metrics/latest index db3017c51059..a82e6afee909 100644 --- a/docs/metrics/latest +++ b/docs/metrics/latest @@ -533,6 +533,21 @@ etcd_server_is_leader # type: "counter" etcd_server_leader_changes_seen_total +# name: "etcd_server_is_learner" +# description: "Whether or not this member is a learner. 1 if is, 0 otherwise." +# type: "gauge" +etcd_server_is_learner + +# name: "etcd_server_learner_promote_failures" +# description: "The total number of learner promote failures (likely learner not ready) while this member is leader." +# type: "counter" +etcd_server_learner_promote_failures + +# name: "etcd_server_learner_promote_successes" +# description: "The total number of successful learner promotions while this member is leader." +# type: "counter" +etcd_server_learner_promote_successes + # name: "etcd_server_proposals_applied_total" # description: "The total number of consensus proposals applied." # type: "gauge" diff --git a/etcdserver/metrics.go b/etcdserver/metrics.go index 748e7edb5da7..93b3b53ce910 100644 --- a/etcdserver/metrics.go +++ b/etcdserver/metrics.go @@ -44,6 +44,26 @@ var ( Name: "leader_changes_seen_total", Help: "The number of leader changes seen.", }) + isLearner = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: "etcd", + Subsystem: "server", + Name: "is_learner", + Help: "Whether or not this member is a learner. 
1 if is, 0 otherwise.", + }) + learnerPromoteFailed = prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: "etcd", + Subsystem: "server", + Name: "learner_promote_failures", + Help: "The total number of learner promote failures (likely learner not ready) while this member is leader.", + }, + []string{"Reason"}, + ) + learnerPromoteSucceed = prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "etcd", + Subsystem: "server", + Name: "learner_promote_successes", + Help: "The total number of successful learner promotions while this member is leader.", + }) heartbeatSendFailures = prometheus.NewCounter(prometheus.CounterOpts{ Namespace: "etcd", Subsystem: "server", @@ -144,6 +164,9 @@ func init() { prometheus.MustRegister(currentVersion) prometheus.MustRegister(currentGoVersion) prometheus.MustRegister(serverID) + prometheus.MustRegister(isLearner) + prometheus.MustRegister(learnerPromoteSucceed) + prometheus.MustRegister(learnerPromoteFailed) currentVersion.With(prometheus.Labels{ "server_version": version.Version, diff --git a/etcdserver/server.go b/etcdserver/server.go index 722922eeff2d..1572f779ea53 100644 --- a/etcdserver/server.go +++ b/etcdserver/server.go @@ -1646,7 +1646,12 @@ func (s *EtcdServer) PromoteMember(ctx context.Context, id uint64) ([]*membershi // fails with ErrNotLeader, forward the request to leader node via HTTP. If promoteMember call fails with error // other than ErrNotLeader, return the error. 
resp, err := s.promoteMember(ctx, id) + if err == nil { + learnerPromoteSucceed.Inc() + return resp, nil + } if err != ErrNotLeader { + learnerPromoteFailed.WithLabelValues(err.Error()).Inc() return resp, err } @@ -2259,6 +2264,15 @@ func (s *EtcdServer) applyConfChange(cc raftpb.ConfChange, confState *raftpb.Con } } + // update the isLearner metric when this server id is equal to the id in raft member confChange + if confChangeContext.Member.ID == s.id { + if cc.Type == raftpb.ConfChangeAddLearnerNode { + isLearner.Set(1) + } else { + isLearner.Set(0) + } + } + case raftpb.ConfChangeRemoveNode: id := types.ID(cc.NodeID) s.cluster.RemoveMember(id)