Skip to content

Commit

Permalink
Report SLO Errors only after 20 minutes for DualStack ILB
Browse files Browse the repository at this point in the history
  • Loading branch information
panslava committed Feb 11, 2023
1 parent 45fc392 commit a073bdb
Show file tree
Hide file tree
Showing 5 changed files with 141 additions and 34 deletions.
26 changes: 26 additions & 0 deletions pkg/l4lb/l4controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -638,6 +638,32 @@ func TestProcessDualStackServiceOnUserError(t *testing.T) {
}
}

func TestDualStackILBStatusForErrorSync(t *testing.T) {
l4c := newServiceController(t, newFakeGCEWithUserNoIPv6SubnetError())
l4c.enableDualStack = true
(l4c.ctx.Cloud.Compute().(*cloud.MockGCE)).MockForwardingRules.InsertHook = mock.InsertForwardingRulesInternalErrHook

newSvc := test.NewL4ILBDualStackService(8080, api_v1.ProtocolTCP, []api_v1.IPFamily{api_v1.IPv4Protocol, api_v1.IPv6Protocol}, api_v1.ServiceExternalTrafficPolicyTypeCluster)
addILBService(l4c, newSvc)
addNEG(l4c, newSvc)
syncResult := l4c.processServiceCreateOrUpdate(newSvc)
if syncResult.Error == nil {
t.Fatalf("Failed to generate error when syncing service %s", newSvc.Name)
}
if syncResult.MetricsState.IsUserError {
t.Errorf("syncResult.MetricsState.IsUserError should be false, got true")
}
if syncResult.MetricsState.InSuccess {
t.Errorf("syncResult.MetricsState.InSuccess should be false, got true")
}
if syncResult.DualStackMetricsState.Status != metrics.StatusError {
t.Errorf("syncResult.DualStackMetricsState.Status should be %s, got %s", metrics.StatusError, syncResult.DualStackMetricsState.Status)
}
if syncResult.DualStackMetricsState.FirstSyncErrorTime == nil {
t.Fatalf("Metric status FirstSyncErrorTime for service %s/%s mismatch, expected: not nil, received: nil", newSvc.Namespace, newSvc.Name)
}
}

func TestProcessUpdateILBIPFamilies(t *testing.T) {
testCases := []struct {
desc string
Expand Down
7 changes: 6 additions & 1 deletion pkg/loadbalancers/l4.go
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,7 @@ func (l4 *L4) EnsureInternalLoadBalancer(nodeNames []string, svc *corev1.Service
}
if l4.enableDualStack {
result.DualStackMetricsState.Status = metrics.StatusSuccess
result.DualStackMetricsState.FirstSyncErrorTime = nil
}
return result
}
Expand Down Expand Up @@ -534,8 +535,12 @@ func (l4 *L4) hasAnnotation(annotationKey string) bool {

func (l4 *L4) getInitialDualStackMetricsState() metrics.L4ILBDualStackServiceState {
// Always init stats with error, and update with Success when service was provisioned
curTime := time.Now()
state := metrics.L4ILBDualStackServiceState{
Status: metrics.StatusError,
L4ILBDualStackServiceCount: metrics.L4ILBDualStackServiceCount{
Status: metrics.StatusError,
},
FirstSyncErrorTime: &curTime,
}

var ipFamiliesStrings []string
Expand Down
106 changes: 80 additions & 26 deletions pkg/metrics/l4metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -431,51 +431,95 @@ func checkMetricsComputation(newMetrics *ControllerMetrics, expErrorCount, expSv
return nil
}

func TestRetryPeriodForL4ILBDualStackServices(t *testing.T) {
t.Parallel()
currTime := time.Now()
before5min := currTime.Add(-5 * time.Minute)

svcName1 := "svc1"
metrics := FakeControllerMetrics()

errorState := newL4ILBDualStackServiceState("IPv4", "SingleStack", StatusError, &currTime)
metrics.SetL4ILBDualStackService(svcName1, errorState)

// change FirstSyncErrorTime and verify it will not change metrics state
errorState.FirstSyncErrorTime = &before5min
metrics.SetL4ILBDualStackService(svcName1, errorState)
state, ok := metrics.l4ILBDualStackServiceMap[svcName1]
if !ok {
t.Fatalf("state should be set")
}
if *state.FirstSyncErrorTime != currTime {
t.Errorf("FirstSyncErrorTime should not change, expected %v, got %v", currTime, *state.FirstSyncErrorTime)
}
if state.Status != StatusError {
t.Errorf("Expected status %s, got %s", StatusError, state.Status)
}
}

func TestComputeL4ILBDualStackMetrics(t *testing.T) {
t.Parallel()

currTime := time.Now()
before10min := currTime.Add(-10 * time.Minute)
before20min := currTime.Add(-20 * time.Minute)

for _, tc := range []struct {
desc string
serviceStates []L4ILBDualStackServiceState
expectL4ILBDualStackCount map[L4ILBDualStackServiceState]int
expectL4ILBDualStackCount map[L4ILBDualStackServiceCount]int
}{
{
desc: "empty input",
serviceStates: []L4ILBDualStackServiceState{},
expectL4ILBDualStackCount: map[L4ILBDualStackServiceState]int{},
expectL4ILBDualStackCount: map[L4ILBDualStackServiceCount]int{},
},
{
desc: "one l4 ilb dual-stack service",
serviceStates: []L4ILBDualStackServiceState{
newL4ILBDualStackServiceState("IPv4", "SingleStack", StatusSuccess),
newL4ILBDualStackServiceState("IPv4", "SingleStack", StatusSuccess, nil),
},
expectL4ILBDualStackCount: map[L4ILBDualStackServiceState]int{
L4ILBDualStackServiceState{
expectL4ILBDualStackCount: map[L4ILBDualStackServiceCount]int{
L4ILBDualStackServiceCount{
"IPv4",
"SingleStack",
StatusSuccess,
}: 1,
},
},
{
desc: "l4 ilb dual-stack service in error state",
desc: "l4 ilb dual-stack service in error state, for less than 20 minutes",
serviceStates: []L4ILBDualStackServiceState{
newL4ILBDualStackServiceState("IPv4", "SingleStack", StatusError),
newL4ILBDualStackServiceState("IPv4", "SingleStack", StatusError, &before10min),
},
expectL4ILBDualStackCount: map[L4ILBDualStackServiceState]int{
L4ILBDualStackServiceState{
expectL4ILBDualStackCount: map[L4ILBDualStackServiceCount]int{
L4ILBDualStackServiceCount{
"IPv4",
"SingleStack",
StatusError,
}: 1,
},
},
{
desc: "l4 ilb dual-stack service in error state, for 20 minutes",
serviceStates: []L4ILBDualStackServiceState{
newL4ILBDualStackServiceState("IPv4", "SingleStack", StatusError, &before20min),
},
expectL4ILBDualStackCount: map[L4ILBDualStackServiceCount]int{
L4ILBDualStackServiceCount{
"IPv4",
"SingleStack",
StatusSLOError,
}: 1,
},
},
{
desc: "L4 ILB dual-stack service with IPv4,IPv6 Families",
serviceStates: []L4ILBDualStackServiceState{
newL4ILBDualStackServiceState("IPv4,IPv6", "RequireDualStack", StatusSuccess),
newL4ILBDualStackServiceState("IPv4,IPv6", "RequireDualStack", StatusSuccess, nil),
},
expectL4ILBDualStackCount: map[L4ILBDualStackServiceState]int{
L4ILBDualStackServiceState{
expectL4ILBDualStackCount: map[L4ILBDualStackServiceCount]int{
L4ILBDualStackServiceCount{
"IPv4,IPv6",
"RequireDualStack",
StatusSuccess,
Expand All @@ -485,24 +529,31 @@ func TestComputeL4ILBDualStackMetrics(t *testing.T) {
{
desc: "many l4 ilb dual-stack services",
serviceStates: []L4ILBDualStackServiceState{
newL4ILBDualStackServiceState("IPv4,IPv6", "RequireDualStack", StatusSuccess),
newL4ILBDualStackServiceState("IPv4,IPv6", "RequireDualStack", StatusSuccess),
newL4ILBDualStackServiceState("IPv4", "SingleStack", StatusError),
newL4ILBDualStackServiceState("IPv6", "SingleStack", StatusSuccess),
newL4ILBDualStackServiceState("IPv6", "SingleStack", StatusSuccess),
},
expectL4ILBDualStackCount: map[L4ILBDualStackServiceState]int{
L4ILBDualStackServiceState{
newL4ILBDualStackServiceState("IPv4,IPv6", "RequireDualStack", StatusSuccess, nil),
newL4ILBDualStackServiceState("IPv4,IPv6", "RequireDualStack", StatusSuccess, nil),
newL4ILBDualStackServiceState("IPv4", "SingleStack", StatusError, nil),
newL4ILBDualStackServiceState("IPv4", "SingleStack", StatusError, &before10min),
newL4ILBDualStackServiceState("IPv4", "SingleStack", StatusError, &before20min),
newL4ILBDualStackServiceState("IPv6", "SingleStack", StatusSuccess, nil),
newL4ILBDualStackServiceState("IPv6", "SingleStack", StatusSuccess, nil),
},
expectL4ILBDualStackCount: map[L4ILBDualStackServiceCount]int{
L4ILBDualStackServiceCount{
"IPv4,IPv6",
"RequireDualStack",
StatusSuccess,
}: 2,
L4ILBDualStackServiceState{
L4ILBDualStackServiceCount{
"IPv4",
"SingleStack",
StatusError,
}: 2,
L4ILBDualStackServiceCount{
"IPv4",
"SingleStack",
StatusSLOError,
}: 1,
L4ILBDualStackServiceState{
L4ILBDualStackServiceCount{
"IPv6",
"SingleStack",
StatusSuccess,
Expand All @@ -525,10 +576,13 @@ func TestComputeL4ILBDualStackMetrics(t *testing.T) {
}
}

func newL4ILBDualStackServiceState(ipFamilies string, ipFamilyPolicy string, status L4ILBDualStackServiceStateStatus) L4ILBDualStackServiceState {
func newL4ILBDualStackServiceState(ipFamilies string, ipFamilyPolicy string, status L4ILBDualStackServiceStateStatus, firstSyncErrorTime *time.Time) L4ILBDualStackServiceState {
return L4ILBDualStackServiceState{
IPFamilies: ipFamilies,
IPFamilyPolicy: ipFamilyPolicy,
Status: status,
L4ILBDualStackServiceCount: L4ILBDualStackServiceCount{
ipFamilies,
ipFamilyPolicy,
status,
},
FirstSyncErrorTime: firstSyncErrorTime,
}
}
21 changes: 17 additions & 4 deletions pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@ const (
// Env variable for ingress version
versionVar = "INGRESS_VERSION"
// Dummy float so we can used bool based timeseries
versionValue = 1.0
versionValue = 1.0
dualstackILBSLOErrorTime = 20 * time.Minute
)

var (
Expand Down Expand Up @@ -293,6 +294,13 @@ func (im *ControllerMetrics) SetL4ILBDualStackService(svcKey string, state L4ILB
if im.l4ILBDualStackServiceMap == nil {
klog.Fatalf("L4 ILB DualStack Metrics failed to initialize correctly.")
}

if state.Status == StatusError {
if previousState, ok := im.l4ILBDualStackServiceMap[svcKey]; ok && previousState.FirstSyncErrorTime != nil {
// If service is in Error state and retry timestamp was set then do not update it.
state.FirstSyncErrorTime = previousState.FirstSyncErrorTime
}
}
im.l4ILBDualStackServiceMap[svcKey] = state
}

Expand Down Expand Up @@ -568,15 +576,20 @@ func (im *ControllerMetrics) computeL4ILBMetrics() map[feature]int {
}

// computeL4ILBDualStackMetrics aggregates L4 ILB DualStack metrics in the cache.
func (im *ControllerMetrics) computeL4ILBDualStackMetrics() map[L4ILBDualStackServiceState]int {
func (im *ControllerMetrics) computeL4ILBDualStackMetrics() map[L4ILBDualStackServiceCount]int {
im.Lock()
defer im.Unlock()
klog.V(4).Infof("Computing L4 DualStack ILB usage metrics from service state map: %#v", im.l4ILBDualStackServiceMap)
counts := map[L4ILBDualStackServiceState]int{}
counts := map[L4ILBDualStackServiceCount]int{}

for key, state := range im.l4ILBDualStackServiceMap {
klog.V(6).Infof("ILB Service %s has IPFamilies: %v, IPFamilyPolicy: %t, Status: %v", key, state.IPFamilies, state.IPFamilyPolicy, state.Status)
counts[state]++
if state.Status == StatusError &&
state.FirstSyncErrorTime != nil &&
time.Since(*state.FirstSyncErrorTime) > dualstackILBSLOErrorTime {
state.Status = StatusSLOError
}
counts[state.L4ILBDualStackServiceCount]++
}
klog.V(4).Info("L4 ILB usage metrics computed.")
return counts
Expand Down
15 changes: 12 additions & 3 deletions pkg/metrics/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,10 +78,11 @@ type L4ILBDualStackServiceStateStatus string
const StatusSuccess = L4ILBDualStackServiceStateStatus("Success")
const StatusUserError = L4ILBDualStackServiceStateStatus("UserError")
const StatusError = L4ILBDualStackServiceStateStatus("Error")
const StatusSLOError = L4ILBDualStackServiceStateStatus("SLOError")

// L4ILBDualStackServiceState defines ipFamilies, ipFamilyPolicy and status
// L4ILBDualStackServiceCount defines ipFamilies, ipFamilyPolicy and status
// of L4 ILB DualStack service
type L4ILBDualStackServiceState struct {
type L4ILBDualStackServiceCount struct {
// IPFamilies stores spec.ipFamilies of Service
IPFamilies string
// IPFamilyPolicy specifies spec.IPFamilyPolicy of Service
Expand All @@ -90,8 +91,16 @@ type L4ILBDualStackServiceState struct {
Status L4ILBDualStackServiceStateStatus
}

// L4ILBDualStackServiceState defines ipFamilies, ipFamilyPolicy, status and tracks
// FirstSyncErrorTime of L4 ILB DualStack service
type L4ILBDualStackServiceState struct {
L4ILBDualStackServiceCount
// FirstSyncErrorTime specifies the time timestamp when the service sync ended up with error for the first time.
FirstSyncErrorTime *time.Time
}

// L4NetLBServiceState defines if network tier is premium and
// if static ip address is managed bu controller
// if static ip address is managed by controller
// for an L4 NetLB service.
type L4NetLBServiceState struct {
// IsManagedIP specifies if Static IP is managed by controller.
Expand Down

0 comments on commit a073bdb

Please sign in to comment.