From cef5f2effeb1de8ee41602c74a20d8d57e5f450d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Nussbaumer?= Date: Tue, 12 Dec 2023 13:51:08 +0100 Subject: [PATCH 1/6] fix(shutdown): implement 5 seconds shutdown period MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit will already help prevent false positives from other kubenurse pods when trying to reach me_ingress through the ingress controller during teardown. without this 5sec wait, in-flight requests from e.g. the ingress controller will reach a pod that is already terminated. Might not be sufficient for similar for "path" errors, as there is no filter for terminating pods. Signed-off-by: Clément Nussbaumer --- internal/kubenurse/server.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/internal/kubenurse/server.go b/internal/kubenurse/server.go index 5bb586fb..91238e85 100644 --- a/internal/kubenurse/server.go +++ b/internal/kubenurse/server.go @@ -198,6 +198,12 @@ func (s *Server) Shutdown(ctx context.Context) error { s.ready = false s.mu.Unlock() + // wait 5 second before actually shutting down the http/s server, as the updated + // endpoints for the kubenurse service might not have propagated everywhere + // (other kubenurse/ingress controller) yet, which will lead to + // me_ingress or path errors in other pods + time.Sleep(5 * time.Second) + // stop the scheduled checker s.checker.StopScheduled() From e96ed6f7d3b25216121b5f89af9960576e6e47f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Nussbaumer?= Date: Tue, 12 Dec 2023 14:18:40 +0100 Subject: [PATCH 2/6] build(ci): rollout restart the daemonset to "erase" bootstrap errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit during the first start of kubenurse, if the ingress isn't ready yet or if kubenurse makes a check before a kubenurse pod is actually ready, this will result in errors in the logs and this will prevent the pipeline from working correctly. Signed-off-by: Clément Nussbaumer --- .github/workflows/ci-helm-deploy-nginx.yml | 2 ++ .github/workflows/ci-helm-deploy-traefik.yml | 2 ++ .github/workflows/ci-kustomize-deploy.yml | 2 ++ 3 files changed, 6 insertions(+) diff --git a/.github/workflows/ci-helm-deploy-nginx.yml b/.github/workflows/ci-helm-deploy-nginx.yml index ef11b24a..0a17845d 100644 --- a/.github/workflows/ci-helm-deploy-nginx.yml +++ b/.github/workflows/ci-helm-deploy-nginx.yml @@ -56,6 +56,8 @@ jobs: sleep 15 # wait for the scheduler to create pods kubectl -n kube-system wait pods -l app.kubernetes.io/name=kubenurse --for=condition=Ready kubectl -n kube-system get pods -l app.kubernetes.io/name=kubenurse + kubectl rollout restart daemonset kubenurse + kubectl rollout status daemonset kubenurse --timeout=1m sleep 60 # Wait to generate some checks etc. - name: Check deployment uses: ./.github/actions/check-deployment diff --git a/.github/workflows/ci-helm-deploy-traefik.yml b/.github/workflows/ci-helm-deploy-traefik.yml index 0e5d73e4..a27d8824 100644 --- a/.github/workflows/ci-helm-deploy-traefik.yml +++ b/.github/workflows/ci-helm-deploy-traefik.yml @@ -60,6 +60,8 @@ jobs: sleep 15 # wait for the scheduler to create pods kubectl -n kube-system wait pods -l app=kubenurse --for=condition=Ready kubectl -n kube-system get pods -l app=kubenurse + kubectl rollout restart daemonset kubenurse + kubectl rollout status daemonset kubenurse --timeout=1m sleep 60 # Wait to generate some checks etc. - name: Check deployment uses: ./.github/actions/check-deployment diff --git a/.github/workflows/ci-kustomize-deploy.yml b/.github/workflows/ci-kustomize-deploy.yml index 47cef8ff..e90a126f 100644 --- a/.github/workflows/ci-kustomize-deploy.yml +++ b/.github/workflows/ci-kustomize-deploy.yml @@ -52,6 +52,8 @@ jobs: sleep 15 # wait for the scheduler to create pods kubectl wait pods -l app.kubernetes.io/name=kubenurse --for=condition=Ready kubectl get pods -l app.kubernetes.io/name=kubenurse + kubectl rollout restart daemonset kubenurse + kubectl rollout status daemonset kubenurse --timeout=1m sleep 60 # Wait to generate some checks etc. - name: Check deployment uses: ./.github/actions/check-deployment From 65ee3ece3f306eac32cc34a456dc1464f1f3d44b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Nussbaumer?= Date: Tue, 12 Dec 2023 15:57:32 +0100 Subject: [PATCH 3/6] chore: fix linting errors and update golangci-lint config MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Clément Nussbaumer --- .github/workflows/lint.yml | 2 +- .golangci.yml | 8 ++------ internal/servicecheck/servicecheck.go | 2 +- internal/servicecheck/transport.go | 2 +- 4 files changed, 5 insertions(+), 9 deletions(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index b51a8cb7..543083a3 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -10,7 +10,7 @@ jobs: - uses: actions/checkout@v4 - uses: golangci/golangci-lint-action@v3 with: - version: v1.52 + version: v1.55 args: --timeout 5m lint-helm: runs-on: ubuntu-latest diff --git a/.golangci.yml b/.golangci.yml index ada0007f..55314a8a 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -42,13 +42,10 @@ linters-settings: - gocognit - funlen - gocyclo - linters: disable-all: true enable: - bodyclose - - deadcode - - depguard - dogsled - dupl - errcheck @@ -71,19 +68,18 @@ linters: - misspell - nakedret - prealloc + - protogetter - rowserrcheck - exportloopref - staticcheck - - structcheck - stylecheck + - sqlclosecheck - typecheck - unconvert - unparam - unused - - varcheck - whitespace - wsl -issues: exclude: # Very commonly not checked. - 'Error return value of .(l.Sync|.*Close|.*.Write|.*Flush|os\.Remove(All)?|os\.(Un)?Setenv). is not checked' diff --git a/internal/servicecheck/servicecheck.go b/internal/servicecheck/servicecheck.go index b285553c..5293a2ec 100644 --- a/internal/servicecheck/servicecheck.go +++ b/internal/servicecheck/servicecheck.go @@ -170,7 +170,7 @@ func (c *Checker) MeIngress() (string, error) { return skippedStr, nil } - return c.doRequest(c.KubenurseIngressURL + "/alwayshappy") + return c.doRequest(c.KubenurseIngressURL + "/alwayshappy") //nolint:goconst // readability } // MeService checks if the kubenurse is reachable at the /alwayshappy endpoint through the kubernetes service diff --git a/internal/servicecheck/transport.go b/internal/servicecheck/transport.go index cc665ee5..9bf8ef19 100644 --- a/internal/servicecheck/transport.go +++ b/internal/servicecheck/transport.go @@ -66,7 +66,7 @@ func generateRoundTripper(extraCA string, insecure bool) (http.RoundTripper, err // Append extra CA, if set if extraCA != "" { - caCert, err := os.ReadFile(extraCA) //nolint:gosec // Intentionally included by the user. + caCert, err := os.ReadFile(extraCA) // Intentionally included by the user. if err != nil { return nil, fmt.Errorf("could not load certificate %s: %w", extraCA, err) } From 3d6050c652bd6115a8be7f7879fa316c4bbad8d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Nussbaumer?= Date: Tue, 12 Dec 2023 16:29:22 +0100 Subject: [PATCH 4/6] fix(shutdown): stop querying pending/terminating neighbors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit prevents false positive path_error when checks are made to pending or terminating pods Signed-off-by: Clément Nussbaumer --- internal/kubediscovery/kubediscovery.go | 7 +++++-- internal/servicecheck/servicecheck.go | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/internal/kubediscovery/kubediscovery.go b/internal/kubediscovery/kubediscovery.go index fe1483da..c36af220 100644 --- a/internal/kubediscovery/kubediscovery.go +++ b/internal/kubediscovery/kubediscovery.go @@ -5,6 +5,7 @@ import ( "context" "fmt" + v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" @@ -35,7 +36,8 @@ type Neighbour struct { HostIP string NodeName string NodeSchedulable NodeSchedulability - Phase string // Pod Phase + Phase v1.PodPhase + Terminating bool } // New creates a new kubediscovery client. The context is used to stop the k8s watchers/informers. @@ -92,8 +94,9 @@ func (c *Client) GetNeighbours(ctx context.Context, namespace, labelSelector str PodName: pod.Name, PodIP: pod.Status.PodIP, HostIP: pod.Status.HostIP, - Phase: string(pod.Status.Phase), + Phase: pod.Status.Phase, NodeName: pod.Spec.NodeName, + Terminating: pod.DeletionTimestamp != nil, NodeSchedulable: sched, } neighbours[idx] = n diff --git a/internal/servicecheck/servicecheck.go b/internal/servicecheck/servicecheck.go index 5293a2ec..d1c8d7ef 100644 --- a/internal/servicecheck/servicecheck.go +++ b/internal/servicecheck/servicecheck.go @@ -11,6 +11,7 @@ import ( "github.com/postfinance/kubenurse/internal/kubediscovery" "github.com/prometheus/client_golang/prometheus" + v1 "k8s.io/api/core/v1" ) const ( @@ -186,8 +187,10 @@ func (c *Checker) MeService() (string, error) { // which are not schedulable are excluded from this check to avoid possible false errors. func (c *Checker) checkNeighbours(nh []kubediscovery.Neighbour) { for _, neighbour := range nh { - neighbour := neighbour // pin - if c.allowUnschedulable || neighbour.NodeSchedulable == kubediscovery.NodeSchedulable { + neighbour := neighbour // pin + if neighbour.Phase == v1.PodRunning && // only query running pods (excludes pending ones) + !neighbour.Terminating && // exclude terminating pods + (c.allowUnschedulable || neighbour.NodeSchedulable == kubediscovery.NodeSchedulable) { check := func() (string, error) { if c.UseTLS { return c.doRequest("https://" + neighbour.PodIP + ":8443/alwayshappy") From a9d101a4fc3607ebd914b462776b43462d354e0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Nussbaumer?= Date: Thu, 14 Dec 2023 10:17:05 +0100 Subject: [PATCH 5/6] fix(shutdown): make shutdown duration configurable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Clément Nussbaumer --- internal/kubenurse/server.go | 17 +++++++++++++++-- internal/servicecheck/types.go | 3 +++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/internal/kubenurse/server.go b/internal/kubenurse/server.go index 91238e85..a77417b5 100644 --- a/internal/kubenurse/server.go +++ b/internal/kubenurse/server.go @@ -46,6 +46,7 @@ type Server struct { // * KUBERNETES_SERVICE_PORT // * KUBENURSE_NAMESPACE // * KUBENURSE_NEIGHBOUR_FILTER +// * KUBENURSE_SHUTDOWN_DURATION // * KUBENURSE_CHECK_API_SERVER_DIRECT // * KUBENURSE_CHECK_API_SERVER_DNS // * KUBENURSE_CHECK_ME_INGRESS @@ -107,12 +108,24 @@ func New(ctx context.Context, k8s kubernetes.Interface) (*Server, error) { return nil, err } + shutdownDuration := 5 * time.Second + + if v, ok := os.LookupEnv("KUBENURSE_SHUTDOWN_DURATION"); ok { + var err error + shutdownDuration, err = time.ParseDuration(v) + + if err != nil { + return nil, err + } + } + chk.KubenurseIngressURL = os.Getenv("KUBENURSE_INGRESS_URL") chk.KubenurseServiceURL = os.Getenv("KUBENURSE_SERVICE_URL") chk.KubernetesServiceHost = os.Getenv("KUBERNETES_SERVICE_HOST") chk.KubernetesServicePort = os.Getenv("KUBERNETES_SERVICE_PORT") chk.KubenurseNamespace = os.Getenv("KUBENURSE_NAMESPACE") chk.NeighbourFilter = os.Getenv("KUBENURSE_NEIGHBOUR_FILTER") + chk.ShutdownDuration = shutdownDuration //nolint:goconst // No need to make "false" a constant in my opinion, readability is better like this. chk.SkipCheckAPIServerDirect = os.Getenv("KUBENURSE_CHECK_API_SERVER_DIRECT") == "false" @@ -198,11 +211,11 @@ func (s *Server) Shutdown(ctx context.Context) error { s.ready = false s.mu.Unlock() - // wait 5 second before actually shutting down the http/s server, as the updated + // wait before actually shutting down the http/s server, as the updated // endpoints for the kubenurse service might not have propagated everywhere // (other kubenurse/ingress controller) yet, which will lead to // me_ingress or path errors in other pods - time.Sleep(5 * time.Second) + time.Sleep(s.checker.ShutdownDuration) // stop the scheduled checker s.checker.StopScheduled() diff --git a/internal/servicecheck/types.go b/internal/servicecheck/types.go index 45bfcf24..d08b1e14 100644 --- a/internal/servicecheck/types.go +++ b/internal/servicecheck/types.go @@ -16,6 +16,9 @@ type Checker struct { SkipCheckMeIngress bool SkipCheckMeService bool + // shutdownDuration defines the time during which kubenurse will wait before stopping + ShutdownDuration time.Duration + // Kubernetes API KubernetesServiceHost string KubernetesServicePort string From 461bda530444775647846390338005adc1c96d43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Nussbaumer?= Date: Thu, 14 Dec 2023 10:17:48 +0100 Subject: [PATCH 6/6] build(dockerfile): update misconfigured maintainer label MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Clément Nussbaumer --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index a2d8b9eb..fa2905e2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ FROM alpine:latest -MAINTAINER OpenSource PF +LABEL OpenSource="PF " RUN apk --no-cache add ca-certificates curl COPY kubenurse /bin/kubenurse