Skip to content

Commit

Permalink
etcdhttp/metrics.go: exclude alarms from health check conditionally w…
Browse files Browse the repository at this point in the history
…ith ?exclude=NOSPACE
  • Loading branch information
chaochn47 committed Apr 20, 2021
1 parent f3c5180 commit 140ea4f
Show file tree
Hide file tree
Showing 4 changed files with 182 additions and 15 deletions.
52 changes: 40 additions & 12 deletions server/etcdserver/api/etcdhttp/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,14 @@ const (
// HandleMetricsHealth registers metrics and health handlers.
func HandleMetricsHealth(lg *zap.Logger, mux *http.ServeMux, srv etcdserver.ServerV2) {
mux.Handle(PathMetrics, promhttp.Handler())
mux.Handle(PathHealth, NewHealthHandler(lg, func() Health { return checkV2Health(lg, srv) }))
mux.Handle(PathHealth, NewHealthHandler(lg, func(excludedAlarms AlarmSet) Health { return checkV2Health(lg, srv, excludedAlarms) }))
}

// HandleMetricsHealthForV3 registers metrics and health handlers. it checks health by using v3 range request
// and its corresponding timeout.
func HandleMetricsHealthForV3(lg *zap.Logger, mux *http.ServeMux, srv *etcdserver.EtcdServer) {
mux.Handle(PathMetrics, promhttp.Handler())
mux.Handle(PathHealth, NewHealthHandler(lg, func() Health { return checkV3Health(lg, srv) }))
mux.Handle(PathHealth, NewHealthHandler(lg, func(excludedAlarms AlarmSet) Health { return checkV3Health(lg, srv, excludedAlarms) }))
}

// HandlePrometheus registers prometheus handler on '/metrics'.
Expand All @@ -55,15 +55,16 @@ func HandlePrometheus(mux *http.ServeMux) {
}

// NewHealthHandler handles '/health' requests.
func NewHealthHandler(lg *zap.Logger, hfunc func() Health) http.HandlerFunc {
func NewHealthHandler(lg *zap.Logger, hfunc func(excludedAlarms AlarmSet) Health) http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) {
if r.Method != http.MethodGet {
w.Header().Set("Allow", http.MethodGet)
http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed)
lg.Warn("/health error", zap.Int("status-code", http.StatusMethodNotAllowed))
return
}
h := hfunc()
excludedAlarms := getExcludedAlarms(r)
h := hfunc(excludedAlarms)
defer func() {
if h.Health == "true" {
healthSuccess.Inc()
Expand Down Expand Up @@ -110,15 +111,38 @@ type Health struct {
Reason string `json:"reason"`
}

// TODO: server NOSPACE, etcdserver.ErrNoLeader in health API
type AlarmSet map[string]struct{}

func checkHealth(lg *zap.Logger, srv etcdserver.ServerV2) Health {
func getExcludedAlarms(r *http.Request) (alarms AlarmSet) {
alarms = make(map[string]struct{}, 2)
alms, found := r.URL.Query()["exclude"]
if found {
for _, alm := range alms {
if len(alms) == 0 {
continue
}
alarms[alm] = struct{}{}
}
}
return alarms
}

// TODO: etcdserver.ErrNoLeader in health API

func checkHealth(lg *zap.Logger, srv etcdserver.ServerV2, excludedAlarms AlarmSet) Health {
h := Health{}
h.Health = "true"
as := srv.Alarms()
if len(as) > 0 {
h.Health = "false"
for _, v := range as {
alarmName := v.Alarm.String()
if _, found := excludedAlarms[alarmName]; found {
lg.Debug("/health excluded alarm", zap.String("alarm", alarmName))
delete(excludedAlarms, alarmName)
continue
}

h.Health = "false"
switch v.Alarm {
case etcdserverpb.AlarmType_NOSPACE:
h.Reason = "ALARM NOSPACE"
Expand All @@ -128,8 +152,12 @@ func checkHealth(lg *zap.Logger, srv etcdserver.ServerV2) Health {
h.Reason = "ALARM UNKNOWN"
}
lg.Warn("serving /health false due to an alarm", zap.String("alarm", v.String()))
return h
}
return h
}

if len(excludedAlarms) > 0 {
lg.Warn("fail exclude alarms from health check", zap.String("exclude alarms", fmt.Sprintf("%+v", excludedAlarms)))
}

if uint64(srv.Leader()) == raft.None {
Expand All @@ -141,8 +169,8 @@ func checkHealth(lg *zap.Logger, srv etcdserver.ServerV2) Health {
return h
}

func checkV2Health(lg *zap.Logger, srv etcdserver.ServerV2) (h Health) {
if h = checkHealth(lg, srv); h.Health != "true" {
func checkV2Health(lg *zap.Logger, srv etcdserver.ServerV2, excludedAlarms AlarmSet) (h Health) {
if h = checkHealth(lg, srv, excludedAlarms); h.Health != "true" {
return
}
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
Expand All @@ -158,8 +186,8 @@ func checkV2Health(lg *zap.Logger, srv etcdserver.ServerV2) (h Health) {
return
}

func checkV3Health(lg *zap.Logger, srv *etcdserver.EtcdServer) (h Health) {
if h = checkHealth(lg, srv); h.Health != "true" {
func checkV3Health(lg *zap.Logger, srv *etcdserver.EtcdServer, excludedAlarms AlarmSet) (h Health) {
if h = checkHealth(lg, srv, excludedAlarms); h.Health != "true" {
return
}
ctx, cancel := context.WithTimeout(context.Background(), srv.Cfg.ReqTimeout())
Expand Down
138 changes: 138 additions & 0 deletions server/etcdserver/api/etcdhttp/metrics_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
package etcdhttp

import (
"context"
"encoding/json"
"fmt"
"io"
"io/ioutil"
"net/http"
"net/http/httptest"
"testing"

pb "go.etcd.io/etcd/api/v3/etcdserverpb"
"go.etcd.io/etcd/client/pkg/v3/testutil"
"go.etcd.io/etcd/client/pkg/v3/types"
"go.etcd.io/etcd/raft/v3"
"go.etcd.io/etcd/server/v3/etcdserver"
stats "go.etcd.io/etcd/server/v3/etcdserver/api/v2stats"
"go.uber.org/zap"
)

type fakeStats struct{}

func (s *fakeStats) SelfStats() []byte { return nil }
func (s *fakeStats) LeaderStats() []byte { return nil }
func (s *fakeStats) StoreStats() []byte { return nil }

type fakeServerV2 struct {
fakeServer
stats.Stats
health string
}

func (s *fakeServerV2) Leader() types.ID {
if s.health == "true" {
return 1
}
return types.ID(raft.None)
}
func (s *fakeServerV2) Do(ctx context.Context, r pb.Request) (etcdserver.Response, error) {
if s.health == "true" {
return etcdserver.Response{}, nil
}
return etcdserver.Response{}, fmt.Errorf("fail health check")
}
func (s *fakeServerV2) ClientCertAuthEnabled() bool { return false }

func TestHealthHandler(t *testing.T) {
// define the input and expected output
// input: alarms, and healthCheckURL
tests := []struct {
alarms []*pb.AlarmMember
healthCheckURL string
statusCode int
health string
}{
{
[]*pb.AlarmMember{},
"/health",
http.StatusOK,
"true",
},
{
[]*pb.AlarmMember{{MemberID: uint64(0), Alarm: pb.AlarmType_NOSPACE}},
"/health",
http.StatusServiceUnavailable,
"false",
},
{
[]*pb.AlarmMember{{MemberID: uint64(0), Alarm: pb.AlarmType_NOSPACE}},
"/health?exclude=NOSPACE",
http.StatusOK,
"true",
},
{
[]*pb.AlarmMember{},
"/health?exclude=NOSPACE",
http.StatusOK,
"true",
},
{
[]*pb.AlarmMember{{MemberID: uint64(0), Alarm: pb.AlarmType_NOSPACE}, {MemberID: uint64(1), Alarm: pb.AlarmType_CORRUPT}},
"/health?exclude=NOSPACE",
http.StatusServiceUnavailable,
"false",
},
{
[]*pb.AlarmMember{{MemberID: uint64(0), Alarm: pb.AlarmType_NOSPACE}, {MemberID: uint64(1), Alarm: pb.AlarmType_CORRUPT}},
"/health?exclude=NOSPACE&exclude=CORRUPT",
http.StatusOK,
"true",
},
}

for i, tt := range tests {
func() {
mux := http.NewServeMux()
HandleMetricsHealth(zap.NewExample(), mux, &fakeServerV2{
fakeServer: fakeServer{alarms: tt.alarms},
Stats: &fakeStats{},
health: tt.health,
})
ts := httptest.NewServer(mux)
defer ts.Close()

res, err := ts.Client().Do(&http.Request{Method: http.MethodGet, URL: testutil.MustNewURL(t, ts.URL+tt.healthCheckURL)})
if err != nil {
t.Errorf("fail serve http request %s %v in test case #%d", tt.healthCheckURL, err, i+1)
}
if res == nil {
t.Errorf("got nil http response with http request %s in test case #%d", tt.healthCheckURL, i+1)
return
}
if res.StatusCode != tt.statusCode {
t.Errorf("want statusCode %d but got %d in test case #%d", tt.statusCode, res.StatusCode, i+1)
}
health, err := parseHealthOutput(res.Body)
if err != nil {
t.Errorf("fail parse health check output %v", err)
}
if health.Health != tt.health {
t.Errorf("want health %s but got %s", tt.health, health.Health)
}
}()
}
}

func parseHealthOutput(body io.Reader) (Health, error) {
obj := Health{}
d, derr := ioutil.ReadAll(body)
if derr != nil {
return obj, derr
}
if err := json.Unmarshal(d, &obj); err != nil {
return obj, err
}
return obj, nil
}
3 changes: 2 additions & 1 deletion server/etcdserver/api/etcdhttp/peer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ func (c *fakeCluster) Version() *semver.Version { return nil }

type fakeServer struct {
cluster api.Cluster
alarms []*pb.AlarmMember
}

func (s *fakeServer) AddMember(ctx context.Context, memb membership.Member) ([]*membership.Member, error) {
Expand All @@ -74,7 +75,7 @@ func (s *fakeServer) PromoteMember(ctx context.Context, id uint64) ([]*membershi
}
func (s *fakeServer) ClusterVersion() *semver.Version { return nil }
func (s *fakeServer) Cluster() api.Cluster { return s.cluster }
func (s *fakeServer) Alarms() []*pb.AlarmMember { return nil }
func (s *fakeServer) Alarms() []*pb.AlarmMember { return s.alarms }
func (s *fakeServer) LeaderChangedNotify() <-chan struct{} { return nil }

var fakeRaftHandler = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
Expand Down
4 changes: 2 additions & 2 deletions server/proxy/grpcproxy/health.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,15 @@ func HandleHealth(lg *zap.Logger, mux *http.ServeMux, c *clientv3.Client) {
if lg == nil {
lg = zap.NewNop()
}
mux.Handle(etcdhttp.PathHealth, etcdhttp.NewHealthHandler(lg, func() etcdhttp.Health { return checkHealth(c) }))
mux.Handle(etcdhttp.PathHealth, etcdhttp.NewHealthHandler(lg, func(excludedAlarms etcdhttp.AlarmSet) etcdhttp.Health { return checkHealth(c) }))
}

// HandleProxyHealth registers health handler on '/proxy/health'.
func HandleProxyHealth(lg *zap.Logger, mux *http.ServeMux, c *clientv3.Client) {
if lg == nil {
lg = zap.NewNop()
}
mux.Handle(etcdhttp.PathProxyHealth, etcdhttp.NewHealthHandler(lg, func() etcdhttp.Health { return checkProxyHealth(c) }))
mux.Handle(etcdhttp.PathProxyHealth, etcdhttp.NewHealthHandler(lg, func(excludedAlarms etcdhttp.AlarmSet) etcdhttp.Health { return checkProxyHealth(c) }))
}

func checkHealth(c *clientv3.Client) etcdhttp.Health {
Expand Down

0 comments on commit 140ea4f

Please sign in to comment.