From f6f5fd21b45971368dfcb03f1eed3a43cfd36605 Mon Sep 17 00:00:00 2001 From: David Ashpole Date: Thu, 5 Mar 2020 11:53:40 -0800 Subject: [PATCH] handle the new 5.0 linux format for oom messages --- utils/oomparser/oomparser.go | 40 +++++++++++++++++---- utils/oomparser/oomparser_test.go | 59 +++++++++++++++++++++++++++---- 2 files changed, 86 insertions(+), 13 deletions(-) diff --git a/utils/oomparser/oomparser.go b/utils/oomparser/oomparser.go index ea50c46308..68ffd64d8c 100644 --- a/utils/oomparser/oomparser.go +++ b/utils/oomparser/oomparser.go @@ -26,7 +26,9 @@ import ( ) var ( - containerRegexp = regexp.MustCompile(`Task in (.*) killed as a result of limit of (.*)`) + legacyContainerRegexp = regexp.MustCompile(`Task in (.*) killed as a result of limit of (.*)`) + // Starting in 5.0 linux kernels, the OOM message changed + containerRegexp = regexp.MustCompile(`oom-kill:constraint=(.*),nodemask=(.*),cpuset=(.*),mems_allowed=(.*),oom_memcg=(.*) (.*),task_memcg=(.*),task=(.*),pid=(.*),uid=(.*)`) lastLineRegexp = regexp.MustCompile(`Killed process ([0-9]+) \((.+)\)`) firstLineRegexp = regexp.MustCompile(`invoked oom-killer:`) ) @@ -51,11 +53,14 @@ type OomInstance struct { // the absolute name of the container that was killed // due to the OOM. VictimContainerName string + // the constraint that triggered the OOM. One of CONSTRAINT_NONE, + // CONSTRAINT_CPUSET, CONSTRAINT_MEMORY_POLICY, CONSTRAINT_MEMCG + Constraint string } // gets the container name from a line and adds it to the oomInstance. -func getContainerName(line string, currentOomInstance *OomInstance) error { - parsedLine := containerRegexp.FindStringSubmatch(line) +func getLegacyContainerName(line string, currentOomInstance *OomInstance) error { + parsedLine := legacyContainerRegexp.FindStringSubmatch(line) if parsedLine == nil { return nil } @@ -64,6 +69,25 @@ func getContainerName(line string, currentOomInstance *OomInstance) error { return nil } +// gets the container name from a line and adds it to the oomInstance. +func getContainerName(line string, currentOomInstance *OomInstance) (bool, error) { + parsedLine := containerRegexp.FindStringSubmatch(line) + if parsedLine == nil { + // Fall back to the legacy format if it isn't found here. + return false, getLegacyContainerName(line, currentOomInstance) + } + currentOomInstance.ContainerName = parsedLine[7] + currentOomInstance.VictimContainerName = parsedLine[5] + currentOomInstance.Constraint = parsedLine[1] + pid, err := strconv.Atoi(parsedLine[9]) + if err != nil { + return false, err + } + currentOomInstance.Pid = pid + currentOomInstance.ProcessName = parsedLine[8] + return true, nil +} + // gets the pid, name, and date from a line and adds it to oomInstance func getProcessNamePid(line string, currentOomInstance *OomInstance) (bool, error) { reList := lastLineRegexp.FindStringSubmatch(line) @@ -106,13 +130,15 @@ func (self *OomParser) StreamOoms(outStream chan<- *OomInstance) { TimeOfDeath: msg.Timestamp, } for msg := range kmsgEntries { - err := getContainerName(msg.Message, oomCurrentInstance) + finished, err := getContainerName(msg.Message, oomCurrentInstance) if err != nil { klog.Errorf("%v", err) } - finished, err := getProcessNamePid(msg.Message, oomCurrentInstance) - if err != nil { - klog.Errorf("%v", err) + if !finished { + finished, err = getProcessNamePid(msg.Message, oomCurrentInstance) + if err != nil { + klog.Errorf("%v", err) + } } if finished { oomCurrentInstance.TimeOfDeath = msg.Timestamp diff --git a/utils/oomparser/oomparser_test.go b/utils/oomparser/oomparser_test.go index e750775a0b..8700007d77 100644 --- a/utils/oomparser/oomparser_test.go +++ b/utils/oomparser/oomparser_test.go @@ -23,23 +23,33 @@ import ( "github.com/stretchr/testify/assert" ) -const startLine = "ruby invoked oom-killer: gfp_mask=0x201da, order=0, oom_score_adj=0" -const endLine = "Killed process 19667 (evil-program2) total-vm:1460016kB, anon-rss:1414008kB, file-rss:4kB" -const containerLine = "Task in /mem2 killed as a result of limit of /mem3" +const ( + startLine = "ruby invoked oom-killer: gfp_mask=0x201da, order=0, oom_score_adj=0" + endLine = "Killed process 19667 (evil-program2) total-vm:1460016kB, anon-rss:1414008kB, file-rss:4kB" + legacyContainerLine = "Task in /mem2 killed as a result of limit of /mem3" + containerLine = "oom-kill:constraint=CONSTRAINT_MEMCG,nodemask=(null),cpuset=ef807430361e6e82b45db92e2e9b6fbec98f419b12c591e655c1a725565e73a8,mems_allowed=0,oom_memcg=/kubepods/burstable/podfbdfe8e3-1c87-4ff2-907 c-b2ec8e25d012,task_memcg=/kubepods/burstable/podfbdfe8e3-1c87-4ff2-907c-b2ec8e25d012/ef807430361e6e82b45db92e2e9b6fbec98f419b12c591e655c1a725565e73a8,task=manager,pid=966,uid=0" +) -func TestGetContainerName(t *testing.T) { +func TestGetLegacyContainerName(t *testing.T) { currentOomInstance := new(OomInstance) - err := getContainerName(startLine, currentOomInstance) + finished, err := getContainerName(startLine, currentOomInstance) if err != nil { t.Errorf("bad line fed to getContainerName should yield no error, but had error %v", err) } + if finished { + t.Errorf("bad line fed to getContainerName should not result in a finished oom log, but it did") + } if currentOomInstance.ContainerName != "" { t.Errorf("bad line fed to getContainerName yielded no container name but set it to %s", currentOomInstance.ContainerName) } - err = getContainerName(containerLine, currentOomInstance) + finished, err = getContainerName(legacyContainerLine, currentOomInstance) if err != nil { t.Errorf("container line fed to getContainerName should yield no error, but had error %v", err) } + if finished { + t.Errorf("getContainerName with the legacy log line should not result in a finished oom log, but it did") + + } if currentOomInstance.ContainerName != "/mem2" { t.Errorf("getContainerName should have set containerName to /mem2, not %s", currentOomInstance.ContainerName) } @@ -48,6 +58,43 @@ func TestGetContainerName(t *testing.T) { } } +func TestGetContainerName(t *testing.T) { + currentOomInstance := new(OomInstance) + finished, err := getContainerName(startLine, currentOomInstance) + if err != nil { + t.Errorf("bad line fed to getContainerName should yield no error, but had error %v", err) + } + if finished { + t.Errorf("bad line fed to getContainerName should not result in a finished oom log, but it did") + } + if currentOomInstance.ContainerName != "" { + t.Errorf("bad line fed to getContainerName yielded no container name but set it to %s", currentOomInstance.ContainerName) + } + finished, err = getContainerName(containerLine, currentOomInstance) + if err != nil { + t.Errorf("container line fed to getContainerName should yield no error, but had error %v", err) + } + if !finished { + t.Errorf("getContainerName with the complete log line should result in a finished oom log, but it did not") + + } + if currentOomInstance.ContainerName != "/kubepods/burstable/podfbdfe8e3-1c87-4ff2-907c-b2ec8e25d012/ef807430361e6e82b45db92e2e9b6fbec98f419b12c591e655c1a725565e73a8" { + t.Errorf("getContainerName should have set containerName to /kubepods/burstable/podfbdfe8e3-1c87-4ff2-907c-b2ec8e25d012/ef807430361e6e82b45db92e2e9b6fbec98f419b12c591e655c1a725565e73a8, not %s", currentOomInstance.ContainerName) + } + if currentOomInstance.VictimContainerName != "/kubepods/burstable/podfbdfe8e3-1c87-4ff2-907" { + t.Errorf("getContainerName should have set victimContainerName to /kubepods/burstable/podfbdfe8e3-1c87-4ff2-907, not %s", currentOomInstance.VictimContainerName) + } + if currentOomInstance.Pid != 966 { + t.Errorf("getContainerName should have set Pid to 966, not %d", currentOomInstance.Pid) + } + if currentOomInstance.ProcessName != "manager" { + t.Errorf("getContainerName should have set ProcessName to manager, not %s", currentOomInstance.ProcessName) + } + if currentOomInstance.Constraint != "CONSTRAINT_MEMCG" { + t.Errorf("getContainerName should have set ProcessName to CONSTRAINT_MEMCG, not %s", currentOomInstance.Constraint) + } +} + func TestGetProcessNamePid(t *testing.T) { currentOomInstance := new(OomInstance) couldParseLine, err := getProcessNamePid(startLine, currentOomInstance)