From 641d6f86627d84e013ae5a59c5b0f0a0dc5a54db Mon Sep 17 00:00:00 2001 From: Mengxin Liu Date: Mon, 30 Dec 2019 12:15:24 +0800 Subject: [PATCH] pinger: add port binds check between local ovs and ovn-sb When ovn-controller is busy or some data is lost in ovn-nb, the port bindings between local ovs and ovn-sb can become mismatched. We are not yet sure how this happens, but by providing metrics we can notice the problem promptly and investigate it further. (cherry picked from commit 3838a46d146148bd8b9d7b7c0cc914f984a8ab84) --- pkg/pinger/metrics.go | 9 +++++ pkg/pinger/ping.go | 94 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 102 insertions(+), 1 deletion(-) diff --git a/pkg/pinger/metrics.go b/pkg/pinger/metrics.go index 0580b1adf47..c9e29a6163f 100644 --- a/pkg/pinger/metrics.go +++ b/pkg/pinger/metrics.go @@ -35,6 +35,14 @@ var ( []string{ "nodeName", }) + inconsistentPortBindingGauge = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "pinger_inconsistent_port_binding", + Help: "The number of mismatch port bindings between ovs and ovn-sb", + }, + []string{ + "nodeName", + }) apiserverHealthyGauge = prometheus.NewGaugeVec( prometheus.GaugeOpts{ Name: "pinger_apiserver_healthy", @@ -164,6 +172,7 @@ func init() { prometheus.MustRegister(ovsDownGauge) prometheus.MustRegister(ovnControllerUpGauge) prometheus.MustRegister(ovnControllerDownGauge) + prometheus.MustRegister(inconsistentPortBindingGauge) prometheus.MustRegister(apiserverHealthyGauge) prometheus.MustRegister(apiserverUnhealthyGauge) prometheus.MustRegister(apiserverRequestLatencyHistogram) diff --git a/pkg/pinger/ping.go b/pkg/pinger/ping.go index 2f704729cb3..9538231e53e 100644 --- a/pkg/pinger/ping.go +++ b/pkg/pinger/ping.go @@ -1,6 +1,9 @@ package pinger import ( + "context" + "fmt" + "github.com/alauda/kube-ovn/pkg/util" goping "github.com/sparrc/go-ping" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -8,7 +11,9 @@ import ( "k8s.io/klog" "math" "net" + "os" 
"os/exec" + "strings" "time" ) @@ -16,6 +21,7 @@ func StartPinger(config *Configuration) { for { checkOvs(config) checkOvnController(config) + checkPortBindings(config) checkApiServer(config) ping(config) if config.Mode != "server" { @@ -145,7 +151,10 @@ func pingExternal(config *Configuration) { func nslookup(config *Configuration) { klog.Infof("start to check dns connectivity") t1 := time.Now() - addrs, err := net.LookupHost(config.DNS) + ctx, cancel := context.WithTimeout(context.TODO(), 10 * time.Second) + defer cancel() + var r net.Resolver + addrs, err := r.LookupHost(ctx, config.DNS) elpased := time.Since(t1) if err != nil { klog.Errorf("failed to resolve dns %s, %v", config.DNS, err) @@ -193,3 +202,86 @@ func checkApiServer(config *Configuration) { SetApiserverHealthyMetrics(config.NodeName, float64(elpased)/float64(time.Millisecond)) return } + +func checkPortBindings(config *Configuration) error { + klog.Infof("start to check por binding") + ovsBindings, err := checkOvsBindings() + if err != nil { + return err + } + + sbBindings, err := checkSBBindings(config) + if err != nil { + return err + } + klog.Infof("port in sb is %v", sbBindings) + misMatch := []string{} + for _, port := range ovsBindings { + if !util.IsStringIn(port, sbBindings) { + misMatch = append(misMatch, port) + } + } + if len(misMatch) > 0 { + klog.Errorf("%d port %v not exist in sb-bindings", len(misMatch), misMatch) + inconsistentPortBindingGauge.WithLabelValues(config.NodeName).Set(float64(len(misMatch))) + } else { + klog.Infof("ovs and ovn-sb binding check passed") + inconsistentPortBindingGauge.WithLabelValues(config.NodeName).Set(0) + } + return nil +} + +func checkOvsBindings() ([]string, error) { + output, err := exec.Command("ovs-vsctl", "--no-heading", "--data=bare", "--format=csv", "--columns=external_ids", "find", "interface", "external_ids:iface-id!=\"\"").CombinedOutput() + if err != nil { + klog.Errorf("failed to get ovs interface %v", err) + return nil, err + } + result 
:= make([]string, 0, len(strings.Split(string(output), "\n"))) + for _, line := range strings.Split(string(output), "\n") { + result = append(result, strings.TrimPrefix(line, "iface-id=")) + } + return result, nil +} + +func checkSBBindings(config *Configuration) ([]string, error) { + sbHost := os.Getenv("OVN_SB_SERVICE_HOST") + sbPort := os.Getenv("OVN_SB_SERVICE_PORT") + output, err := exec.Command( + "ovn-sbctl", + fmt.Sprintf("--db=tcp:%s:%s", sbHost, sbPort), + "--format=csv", + "--no-heading", + "--data=bare", + "--columns=_uuid", + "find", + "chassis", + fmt.Sprintf("hostname=%s", config.NodeName)).CombinedOutput() + if err != nil { + klog.Errorf("failed to find chassis %v", err) + return nil, err + } + if len(output) == 0 { + klog.Errorf("chassis for node %s not exist", config.NodeName) + return nil, fmt.Errorf("chassis for node %s not exist", config.NodeName) + } + + chassis := strings.TrimSpace(string(output)) + klog.Infof("chassis id is %s", chassis) + output, err = exec.Command( + "ovn-sbctl", + fmt.Sprintf("--db=tcp:%s:%s", sbHost, sbPort), + "--format=csv", + "--no-heading", + "--data=bare", + "--columns=logical_port", + "find", + "port_binding", + fmt.Sprintf("chassis=%s", chassis)).CombinedOutput() + if err != nil { + klog.Errorf("failed to list port_binding in ovn-sb %v", err) + return nil, err + } + + return strings.Split(string(output), "\n"), nil +}