fix: detect and report cluster connection errors #559

Open · wants to merge 7 commits into master
184 changes: 170 additions & 14 deletions pkg/cache/cluster.go
@@ -57,6 +57,9 @@ const (
// Limit is required to avoid memory spikes during cache initialization.
// The default limit of 50 is chosen based on experiments.
defaultListSemaphoreWeight = 50

// The default interval for monitoring the cluster connection status.
defaultClusterConnectionInterval = 10 * time.Second
)

const (
@@ -89,6 +92,8 @@ type ClusterInfo struct {
SyncError error
// APIResources holds list of API resources supported by the cluster
APIResources []kube.APIResourceInfo
// ConnectionStatus indicates the status of the connection with the cluster.
ConnectionStatus ConnectionStatus
}

// OnEventHandler is a function that handles Kubernetes event
@@ -132,6 +137,8 @@ type ClusterCache interface {
OnResourceUpdated(handler OnResourceUpdatedHandler) Unsubscribe
// OnEvent register event handler that is executed every time when new K8S event received
OnEvent(handler OnEventHandler) Unsubscribe
// StartClusterConnectionStatusMonitoring starts a goroutine that periodically checks for watch errors and updates the cluster connection status.
StartClusterConnectionStatusMonitoring(ctx context.Context)
}

type WeightedSemaphore interface {
@@ -140,7 +147,7 @@ type WeightedSemaphore interface {
Release(n int64)
}

-type ListRetryFunc func(err error) bool
+type RetryFunc func(err error) bool

// NewClusterCache creates new instance of cluster cache
func NewClusterCache(config *rest.Config, opts ...UpdateSettingsFunc) *clusterCache {
@@ -162,24 +169,48 @@ func NewClusterCache(config *rest.Config, opts ...UpdateSettingsFunc) *clusterCa
resyncTimeout: defaultClusterResyncTimeout,
syncTime: nil,
},
-watchResyncTimeout: defaultWatchResyncTimeout,
-clusterSyncRetryTimeout: ClusterRetryTimeout,
-resourceUpdatedHandlers: map[uint64]OnResourceUpdatedHandler{},
-eventHandlers: map[uint64]OnEventHandler{},
-log: log,
-listRetryLimit: 1,
-listRetryUseBackoff: false,
-listRetryFunc: ListRetryFuncNever,
+watchResyncTimeout: defaultWatchResyncTimeout,
+clusterSyncRetryTimeout: ClusterRetryTimeout,
+resourceUpdatedHandlers: map[uint64]OnResourceUpdatedHandler{},
+eventHandlers: map[uint64]OnEventHandler{},
+log: log,
+listRetryLimit: 1,
+listRetryUseBackoff: false,
+listRetryFunc: RetryFuncNever,
+connectionStatus: ConnectionStatusUnknown,
+watchFails: newWatchFailures(),
+clusterStatusRetryFunc: RetryFuncNever,
+clusterConnectionInterval: defaultClusterConnectionInterval,
}
for i := range opts {
opts[i](cache)
}
return cache
}

// ConnectionStatus indicates the status of the connection with the cluster.
type ConnectionStatus string

const (
ConnectionStatusSuccessful ConnectionStatus = "Successful"
ConnectionStatusFailed ConnectionStatus = "Failed"
ConnectionStatusUnknown ConnectionStatus = "Unknown"
)

type clusterCache struct {
syncStatus clusterCacheSync

// connectionStatus indicates the status of the connection with the cluster.
connectionStatus ConnectionStatus

// clusterConnectionInterval is the interval used to monitor the cluster connection status.
clusterConnectionInterval time.Duration

// watchFails is used to keep track of the failures while watching resources.
watchFails *watchFailures

clusterStatusRetryFunc RetryFunc

apisMeta map[schema.GroupKind]*apiMeta
serverVersion string
apiResources []kube.APIResourceInfo
@@ -200,7 +231,7 @@ type clusterCache struct {
// retry options for list operations
listRetryLimit int32
listRetryUseBackoff bool
-listRetryFunc ListRetryFunc
+listRetryFunc RetryFunc

// lock is a rw lock which protects the fields of clusterInfo
lock sync.RWMutex
@@ -236,13 +267,13 @@ type clusterCacheSync struct {
resyncTimeout time.Duration
}

-// ListRetryFuncNever never retries on errors
-func ListRetryFuncNever(err error) bool {
+// RetryFuncNever never retries on errors
+func RetryFuncNever(err error) bool {
return false
}

-// ListRetryFuncAlways always retries on errors
-func ListRetryFuncAlways(err error) bool {
+// RetryFuncAlways always retries on errors
+func RetryFuncAlways(err error) bool {
return true
}
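For illustration only (not part of this diff): since the rename generalizes the retry predicate beyond list operations, a caller could plug in its own RetryFunc that retries only transient API errors. A minimal sketch, where retryOnTransientError is a hypothetical name and the helpers come from k8s.io/apimachinery/pkg/api/errors (imported here as apierrors):

// Hypothetical example, not in this PR: retry only on transient API errors.
func retryOnTransientError(err error) bool {
	// IsServerTimeout, IsTimeout and IsTooManyRequests are existing helpers
	// in k8s.io/apimachinery/pkg/api/errors.
	return apierrors.IsServerTimeout(err) ||
		apierrors.IsTimeout(err) ||
		apierrors.IsTooManyRequests(err)
}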

@@ -595,6 +626,7 @@ func (c *clusterCache) loadInitialState(ctx context.Context, api kube.APIResourc
}

func (c *clusterCache) watchEvents(ctx context.Context, api kube.APIResourceInfo, resClient dynamic.ResourceInterface, ns string, resourceVersion string) {
watchKey := api.GroupKind.String()
kube.RetryUntilSucceed(ctx, watchResourcesRetryTimeout, fmt.Sprintf("watch %s on %s", api.GroupKind, c.config.Host), c.log, func() (err error) {
defer func() {
if r := recover(); r != nil {
@@ -615,7 +647,16 @@ func (c *clusterCache) watchEvents(ctx context.Context, api kube.APIResourceInfo
res, err := resClient.Watch(ctx, options)
if errors.IsNotFound(err) {
c.stopWatching(api.GroupKind, ns)
c.watchFails.remove(watchKey)
return res, err
}

if err != nil {
c.watchFails.add(watchKey)
} else {
c.watchFails.remove(watchKey)
}

return res, err
},
})
@@ -810,8 +851,14 @@ func (c *clusterCache) sync() error {
version, err := c.kubectl.GetServerVersion(config)

if err != nil {
if c.connectionStatus != ConnectionStatusFailed {
c.log.Info("unable to access cluster", "cluster", c.config.Host, "reason", err.Error())
c.connectionStatus = ConnectionStatusFailed
}
return err
}

c.connectionStatus = ConnectionStatusSuccessful
c.serverVersion = version
apiResources, err := c.kubectl.GetAPIResources(config, false, NewNoopSettings())
if err != nil {
@@ -1186,6 +1233,7 @@ func (c *clusterCache) GetClusterInfo() ClusterInfo {
LastCacheSyncTime: c.syncStatus.syncTime,
SyncError: c.syncStatus.syncError,
APIResources: c.apiResources,
ConnectionStatus: c.connectionStatus,
}
}

@@ -1194,3 +1242,111 @@ func (c *clusterCache) GetClusterInfo() ClusterInfo {
func skipAppRequeuing(key kube.ResourceKey) bool {
return ignoredRefreshResources[key.Group+"/"+key.Kind]
}

// StartClusterConnectionStatusMonitoring starts a goroutine that periodically checks for watch failures.
// If any watch errors are present, or if the cluster has just recovered from them, it pings the remote
// cluster and updates the cluster connection status.
func (c *clusterCache) StartClusterConnectionStatusMonitoring(ctx context.Context) {
go c.clusterConnectionService(ctx)
}

func (c *clusterCache) clusterConnectionService(ctx context.Context) {
if c.clusterConnectionInterval <= 0 {
return
}

ticker := time.NewTicker(c.clusterConnectionInterval)
defer ticker.Stop()

for {
select {
case <-ticker.C:
watchErrors := c.watchFails.len()
// Ping the cluster to verify the connection if there are watch failures or
// if the cluster has just recovered from watch failures.
watchesRecovered := false
if watchErrors == 0 {
// If there are no watch failures check if the status needs to be updated.
c.lock.RLock()
if c.connectionStatus == ConnectionStatusFailed {
watchesRecovered = true
}
c.lock.RUnlock()
}

if watchErrors > 0 || watchesRecovered {
c.log.V(1).Info("verifying cluster connection", "server", c.config.Host)
// Retry fetching the server version to avoid invalidating the cache due to transient errors.
err := retry.OnError(retry.DefaultBackoff, c.clusterStatusRetryFunc, func() error {
_, err := c.kubectl.GetServerVersion(c.config)
if err != nil && c.clusterStatusRetryFunc(err) {
c.log.V(1).Info("Error while fetching server version", "error", err.Error())
}
return err
})
if err != nil {
c.updateConnectionStatus(ConnectionStatusFailed)
} else {
c.updateConnectionStatus(ConnectionStatusSuccessful)
}
}
case <-ctx.Done():
c.log.V(1).Info("Stopping cluster connection status monitoring", "server", c.config.Host)
return
}
}
}

func (c *clusterCache) updateConnectionStatus(status ConnectionStatus) {
invalidateCache := false
c.lock.Lock()
if c.connectionStatus != status {
c.connectionStatus = status
invalidateCache = true
}
c.lock.Unlock()

if !invalidateCache {
return
}

c.log.V(1).Info("updated cluster connection status", "server", c.config.Host, "status", status)

c.Invalidate()
if err := c.EnsureSynced(); err != nil {
c.log.Error(err, "failed to sync cache state after updating cluster connection status", "server", c.config.Host)
}
}

// watchFailures tracks the failures encountered while watching resources. It is updated
// whenever a watch returns an error or recovers from a previous failure.
type watchFailures struct {
watches map[string]bool
mu sync.RWMutex
}

func newWatchFailures() *watchFailures {
return &watchFailures{
watches: make(map[string]bool),
}
}

func (w *watchFailures) add(key string) {
w.mu.Lock()
defer w.mu.Unlock()
w.watches[key] = true
}

func (w *watchFailures) remove(key string) {
w.mu.Lock()
defer w.mu.Unlock()
delete(w.watches, key)
}

func (w *watchFailures) len() int {
w.mu.RLock()
defer w.mu.RUnlock()
return len(w.watches)
}
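Usage sketch (hypothetical consumer code, not part of this diff): monitoring is opt-in, so a consumer would start it after the initial sync and read the resulting status from GetClusterInfo. This assumes the package is imported as cache and that ctx, config, and log are in scope; only NewClusterCache, EnsureSynced, StartClusterConnectionStatusMonitoring, GetClusterInfo, and ConnectionStatusFailed are taken from this PR's API as shown above.

// Hypothetical wiring of the new API; names outside this PR are assumptions.
clusterCache := cache.NewClusterCache(config)
if err := clusterCache.EnsureSynced(); err != nil {
	log.Error(err, "initial cluster cache sync failed")
	return
}
// Starts the periodic connection check; the goroutine exits when ctx is cancelled.
clusterCache.StartClusterConnectionStatusMonitoring(ctx)

info := clusterCache.GetClusterInfo()
if info.ConnectionStatus == cache.ConnectionStatusFailed {
	// Surface the lost connection, e.g. in a cluster status view.
}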