fix: detect and report cluster connection errors #559

Open · wants to merge 7 commits into master
184 changes: 170 additions & 14 deletions pkg/cache/cluster.go
@@ -57,6 +57,9 @@ const (
// Limit is required to avoid memory spikes during cache initialization.
// The default limit of 50 is chosen based on experiments.
defaultListSemaphoreWeight = 50

// The default interval for monitoring the cluster connection status.
defaultClusterConnectionInterval = 10 * time.Second
)

const (
@@ -89,6 +92,8 @@ type ClusterInfo struct {
SyncError error
// APIResources holds list of API resources supported by the cluster
APIResources []kube.APIResourceInfo
// ConnectionStatus indicates the status of the connection with the cluster.
ConnectionStatus ConnectionStatus
}

// OnEventHandler is a function that handles Kubernetes event
@@ -132,6 +137,8 @@ type ClusterCache interface {
OnResourceUpdated(handler OnResourceUpdatedHandler) Unsubscribe
// OnEvent register event handler that is executed every time when new K8S event received
OnEvent(handler OnEventHandler) Unsubscribe
// StartClusterConnectionStatusMonitoring starts a goroutine that periodically checks for watch errors and updates the cluster connection status.
StartClusterConnectionStatusMonitoring(ctx context.Context)
}

type WeightedSemaphore interface {
@@ -140,7 +147,7 @@ type WeightedSemaphore interface {
Release(n int64)
}

-type ListRetryFunc func(err error) bool
+type RetryFunc func(err error) bool

// NewClusterCache creates new instance of cluster cache
func NewClusterCache(config *rest.Config, opts ...UpdateSettingsFunc) *clusterCache {
@@ -162,24 +169,48 @@ func NewClusterCache(config *rest.Config, opts ...UpdateSettingsFunc) *clusterCa
resyncTimeout: defaultClusterResyncTimeout,
syncTime: nil,
},
-watchResyncTimeout: defaultWatchResyncTimeout,
-clusterSyncRetryTimeout: ClusterRetryTimeout,
-resourceUpdatedHandlers: map[uint64]OnResourceUpdatedHandler{},
-eventHandlers: map[uint64]OnEventHandler{},
-log: log,
-listRetryLimit: 1,
-listRetryUseBackoff: false,
-listRetryFunc: ListRetryFuncNever,
+watchResyncTimeout: defaultWatchResyncTimeout,
+clusterSyncRetryTimeout: ClusterRetryTimeout,
+resourceUpdatedHandlers: map[uint64]OnResourceUpdatedHandler{},
+eventHandlers: map[uint64]OnEventHandler{},
+log: log,
+listRetryLimit: 1,
+listRetryUseBackoff: false,
+listRetryFunc: RetryFuncNever,
+connectionStatus: ConnectionStatusUnknown,
+watchFails: newWatchFailures(),
+clusterStatusRetryFunc: RetryFuncNever,
+clusterConnectionInterval: defaultClusterConnectionInterval,
}
for i := range opts {
opts[i](cache)
}
return cache
}

// ConnectionStatus indicates the status of the connection with the cluster.
type ConnectionStatus string

const (
ConnectionStatusSuccessful ConnectionStatus = "Successful"
ConnectionStatusFailed ConnectionStatus = "Failed"
ConnectionStatusUnknown ConnectionStatus = "Unknown"
)

type clusterCache struct {
syncStatus clusterCacheSync

// connectionStatus indicates the status of the connection with the cluster.
connectionStatus ConnectionStatus

// clusterConnectionInterval is the interval used to monitor the cluster connection status.
clusterConnectionInterval time.Duration

// watchFails is used to keep track of the failures while watching resources.
watchFails *watchFailures

clusterStatusRetryFunc RetryFunc

apisMeta map[schema.GroupKind]*apiMeta
serverVersion string
apiResources []kube.APIResourceInfo
@@ -200,7 +231,7 @@ type clusterCache struct {
// retry options for list operations
listRetryLimit int32
listRetryUseBackoff bool
-listRetryFunc ListRetryFunc
+listRetryFunc RetryFunc

// lock is a rw lock which protects the fields of clusterInfo
lock sync.RWMutex
@@ -236,13 +267,13 @@ type clusterCacheSync struct {
resyncTimeout time.Duration
}

-// ListRetryFuncNever never retries on errors
-func ListRetryFuncNever(err error) bool {
+// RetryFuncNever never retries on errors
+func RetryFuncNever(err error) bool {
return false
}

-// ListRetryFuncAlways always retries on errors
-func ListRetryFuncAlways(err error) bool {
+// RetryFuncAlways always retries on errors
+func RetryFuncAlways(err error) bool {
return true
}
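For illustration only (not part of this diff): since the rename generalizes the retry predicate beyond list operations, a caller could plug in its own RetryFunc that retries only transient API errors. A minimal sketch, where retryOnTransientError is a hypothetical name and the helpers come from k8s.io/apimachinery/pkg/api/errors (imported here as apierrors):

// Hypothetical example, not in this PR: retry only on transient API errors.
func retryOnTransientError(err error) bool {
	// IsServerTimeout, IsTimeout and IsTooManyRequests are existing helpers
	// in k8s.io/apimachinery/pkg/api/errors.
	return apierrors.IsServerTimeout(err) ||
		apierrors.IsTimeout(err) ||
		apierrors.IsTooManyRequests(err)
}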

@@ -595,6 +626,7 @@ func (c *clusterCache) loadInitialState(ctx context.Context, api kube.APIResourc
}

func (c *clusterCache) watchEvents(ctx context.Context, api kube.APIResourceInfo, resClient dynamic.ResourceInterface, ns string, resourceVersion string) {
watchKey := api.GroupKind.String()
kube.RetryUntilSucceed(ctx, watchResourcesRetryTimeout, fmt.Sprintf("watch %s on %s", api.GroupKind, c.config.Host), c.log, func() (err error) {
defer func() {
if r := recover(); r != nil {
@@ -615,7 +647,16 @@ func (c *clusterCache) watchEvents(ctx context.Context, api kube.APIResourceInfo
res, err := resClient.Watch(ctx, options)
if errors.IsNotFound(err) {
c.stopWatching(api.GroupKind, ns)
c.watchFails.remove(watchKey)
return res, err
}

if err != nil {
c.watchFails.add(watchKey)
} else {
c.watchFails.remove(watchKey)
}

return res, err
},
})
@@ -810,8 +851,14 @@ func (c *clusterCache) sync() error {
version, err := c.kubectl.GetServerVersion(config)

if err != nil {
if c.connectionStatus != ConnectionStatusFailed {
c.log.Info("unable to access cluster", "cluster", c.config.Host, "reason", err.Error())
c.connectionStatus = ConnectionStatusFailed
}
return err
}

c.connectionStatus = ConnectionStatusSuccessful
c.serverVersion = version
apiResources, err := c.kubectl.GetAPIResources(config, false, NewNoopSettings())
if err != nil {
@@ -1186,6 +1233,7 @@ func (c *clusterCache) GetClusterInfo() ClusterInfo {
LastCacheSyncTime: c.syncStatus.syncTime,
SyncError: c.syncStatus.syncError,
APIResources: c.apiResources,
ConnectionStatus: c.connectionStatus,
}
}

@@ -1194,3 +1242,111 @@ func (c *clusterCache) GetClusterInfo() ClusterInfo {
func skipAppRequeuing(key kube.ResourceKey) bool {
return ignoredRefreshResources[key.Group+"/"+key.Kind]
}

// StartClusterConnectionStatusMonitoring starts a goroutine that periodically checks for watch failures.
// If any watch errors are present, or if the cluster has just recovered from them, it pings the remote
// cluster and updates the cluster connection status.
func (c *clusterCache) StartClusterConnectionStatusMonitoring(ctx context.Context) {
go c.clusterConnectionService(ctx)
}

func (c *clusterCache) clusterConnectionService(ctx context.Context) {
if c.clusterConnectionInterval <= 0 {
return
}

ticker := time.NewTicker(c.clusterConnectionInterval)
defer ticker.Stop()

for {
select {
case <-ticker.C:
watchErrors := c.watchFails.len()
// Ping the cluster to verify the connection if there are watch failures or
// if the cluster has just recovered from watch failures.
watchesRecovered := false
if watchErrors == 0 {
// If there are no watch failures check if the status needs to be updated.
c.lock.RLock()
if c.connectionStatus == ConnectionStatusFailed {
watchesRecovered = true
}
c.lock.RUnlock()
}

if watchErrors > 0 || watchesRecovered {
c.log.V(1).Info("verifying cluster connection", "server", c.config.Host)
// Retry fetching the server version to avoid invalidating the cache due to transient errors.
err := retry.OnError(retry.DefaultBackoff, c.clusterStatusRetryFunc, func() error {
_, err := c.kubectl.GetServerVersion(c.config)
if err != nil && c.clusterStatusRetryFunc(err) {
c.log.V(1).Info("Error while fetching server version", "error", err.Error())
}
return err
})
if err != nil {
c.updateConnectionStatus(ConnectionStatusFailed)
} else {
c.updateConnectionStatus(ConnectionStatusSuccessful)
}
}
case <-ctx.Done():
c.log.V(1).Info("Stopping cluster connection status monitoring", "server", c.config.Host)
return
}
}
}

func (c *clusterCache) updateConnectionStatus(status ConnectionStatus) {
invalidateCache := false
c.lock.Lock()
if c.connectionStatus != status {
c.connectionStatus = status
invalidateCache = true
}
c.lock.Unlock()

if !invalidateCache {
return
}

c.log.V(1).Info("updated cluster connection status", "server", c.config.Host, "status", status)

c.Invalidate()
if err := c.EnsureSynced(); err != nil {
c.log.Error(err, "failed to sync cache state after updating cluster connection status", "server", c.config.Host)
}
}

// watchFailures tracks the failures encountered while watching resources. It is updated
// whenever a watch returns an error or recovers from a previous failure.
type watchFailures struct {
watches map[string]bool
mu sync.RWMutex
}

func newWatchFailures() *watchFailures {
return &watchFailures{
watches: make(map[string]bool),
}
}

func (w *watchFailures) add(key string) {
w.mu.Lock()
defer w.mu.Unlock()
w.watches[key] = true
}

func (w *watchFailures) remove(key string) {
w.mu.Lock()
defer w.mu.Unlock()
delete(w.watches, key)
}

func (w *watchFailures) len() int {
w.mu.RLock()
defer w.mu.RUnlock()
return len(w.watches)
}
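Usage sketch (hypothetical consumer code, not part of this diff): monitoring is opt-in, so a consumer would start it after the initial sync and read the resulting status from GetClusterInfo. This assumes the package is imported as cache and that ctx, config, and log are in scope; only NewClusterCache, EnsureSynced, StartClusterConnectionStatusMonitoring, GetClusterInfo, and ConnectionStatusFailed are taken from this PR's API as shown above.

// Hypothetical wiring of the new API; names outside this PR are assumptions.
clusterCache := cache.NewClusterCache(config)
if err := clusterCache.EnsureSynced(); err != nil {
	log.Error(err, "initial cluster cache sync failed")
	return
}
// Starts the periodic connection check; the goroutine exits when ctx is cancelled.
clusterCache.StartClusterConnectionStatusMonitoring(ctx)

info := clusterCache.GetClusterInfo()
if info.ConnectionStatus == cache.ConnectionStatusFailed {
	// Surface the lost connection, e.g. in a cluster status view.
}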