From 65c2d3c2b68ad6bd8a42ffb0b5a5a6a79b2f2ea1 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Thu, 18 Feb 2021 18:50:10 -0800 Subject: [PATCH 01/10] tests/int/update: add test case for PR #592 Add integration test coverage for code initially added by commit d8b8f76c4ff584 ("Fix problem when update memory and swap memory", 2016-04-05). This is in addition to existing unit test, TestMemorySetSwapSmallerThanMemory. Signed-off-by: Kir Kolyshkin --- tests/integration/update.bats | 36 +++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/tests/integration/update.bats b/tests/integration/update.bats index c03e7c6def0..5e6377d09be 100644 --- a/tests/integration/update.bats +++ b/tests/integration/update.bats @@ -238,6 +238,42 @@ EOF check_cgroup_value "pids.max" 20 check_systemd_value "TasksMax" 20 + + if [ "$HAVE_SWAP" = "yes" ]; then + # Test case for https://github.com/opencontainers/runc/pull/592, + # checking libcontainer/cgroups/fs/memory.go:setMemoryAndSwap. + + runc update test_update --memory 30M --memory-swap 50M + [ "$status" -eq 0 ] + + check_cgroup_value $MEM_LIMIT $((30 * 1024 * 1024)) + check_systemd_value $SD_MEM_LIMIT $((30 * 1024 * 1024)) + + if [ "$CGROUP_UNIFIED" = "yes" ]; then + # for cgroupv2, swap does not include mem + check_cgroup_value "$MEM_SWAP" $((20 * 1024 * 1024)) + check_systemd_value "$SD_MEM_SWAP" $((20 * 1024 * 1024)) + else + check_cgroup_value "$MEM_SWAP" $((50 * 1024 * 1024)) + check_systemd_value "$SD_MEM_SWAP" $((50 * 1024 * 1024)) + fi + + # Now, set new memory to more than old swap + runc update test_update --memory 60M --memory-swap 80M + [ "$status" -eq 0 ] + + check_cgroup_value $MEM_LIMIT $((60 * 1024 * 1024)) + check_systemd_value $SD_MEM_LIMIT $((60 * 1024 * 1024)) + + if [ "$CGROUP_UNIFIED" = "yes" ]; then + # for cgroupv2, swap does not include mem + check_cgroup_value "$MEM_SWAP" $((20 * 1024 * 1024)) + check_systemd_value "$SD_MEM_SWAP" $((20 * 1024 * 1024)) + else + check_cgroup_value "$MEM_SWAP" $((80 * 1024 * 1024)) + check_systemd_value "$SD_MEM_SWAP" $((80 * 1024 * 1024)) + fi + fi } @test "update cgroup cpu limits" { From 3cced523a5b51712d095403e07a97589ff384a8c Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Thu, 18 Feb 2021 17:52:09 -0800 Subject: [PATCH 02/10] libct/cg/fs/memory: optimize Set Currently, we read and parse 5 different files while we only need 1. Use GetCgroupParamUint() directly to get current limit. While at it, remove the workaround previously needed for the unit test, and make it a bit more verbose. Signed-off-by: Kir Kolyshkin --- libcontainer/cgroups/fs/memory.go | 4 ++-- libcontainer/cgroups/fs/memory_test.go | 9 ++------- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/libcontainer/cgroups/fs/memory.go b/libcontainer/cgroups/fs/memory.go index 61be6d7e9a6..a480ed94eb7 100644 --- a/libcontainer/cgroups/fs/memory.go +++ b/libcontainer/cgroups/fs/memory.go @@ -70,7 +70,7 @@ func setMemoryAndSwap(path string, cgroup *configs.Cgroup) error { // When memory and swap memory are both set, we need to handle the cases // for updating container. if cgroup.Resources.Memory != 0 && cgroup.Resources.MemorySwap != 0 { - memoryUsage, err := getMemoryData(path, "") + curLimit, err := fscommon.GetCgroupParamUint(path, cgroupMemoryLimit) if err != nil { return err } @@ -78,7 +78,7 @@ func setMemoryAndSwap(path string, cgroup *configs.Cgroup) error { // When update memory limit, we should adapt the write sequence // for memory and swap memory, so it won't fail because the new // value and the old value don't fit kernel's validation. - if cgroup.Resources.MemorySwap == -1 || memoryUsage.Limit < uint64(cgroup.Resources.MemorySwap) { + if cgroup.Resources.MemorySwap == -1 || curLimit < uint64(cgroup.Resources.MemorySwap) { if err := fscommon.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil { return err } diff --git a/libcontainer/cgroups/fs/memory_test.go b/libcontainer/cgroups/fs/memory_test.go index 00fb338b767..44f1c9a9175 100644 --- a/libcontainer/cgroups/fs/memory_test.go +++ b/libcontainer/cgroups/fs/memory_test.go @@ -165,11 +165,6 @@ func TestMemorySetSwapSmallerThanMemory(t *testing.T) { helper.writeFileContents(map[string]string{ "memory.limit_in_bytes": strconv.Itoa(memoryBefore), "memory.memsw.limit_in_bytes": strconv.Itoa(memoryswapBefore), - // Set will call getMemoryData when memory and swap memory are - // both set, fake these fields so we don't get error. - "memory.usage_in_bytes": "0", - "memory.max_usage_in_bytes": "0", - "memory.failcnt": "0", }) helper.CgroupData.config.Resources.Memory = memoryAfter @@ -184,14 +179,14 @@ func TestMemorySetSwapSmallerThanMemory(t *testing.T) { t.Fatalf("Failed to parse memory.limit_in_bytes - %s", err) } if value != memoryAfter { - t.Fatal("Got the wrong value, set memory.limit_in_bytes failed.") + t.Fatalf("Got the wrong value (%d != %d), set memory.limit_in_bytes failed", value, memoryAfter) } value, err = fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.memsw.limit_in_bytes") if err != nil { t.Fatalf("Failed to parse memory.memsw.limit_in_bytes - %s", err) } if value != memoryswapAfter { - t.Fatal("Got the wrong value, set memory.memsw.limit_in_bytes failed.") + t.Fatalf("Got the wrong value (%d != %d), set memory.memsw.limit_in_bytes failed", value, memoryswapAfter) } } From 27fd3fc3ce423948f7ce5610a03518f5df0ff55b Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Thu, 18 Feb 2021 18:00:30 -0800 Subject: [PATCH 03/10] libct/cg/fs: setMemoryAndSwap: refactor 1. Factor out setMemory and setSwap 2. Pass cgroup.Resources (rather than cgroup) to setMemoryAndSwap(). 3. Merge the duplicated "set memory, set swap" case. Signed-off-by: Kir Kolyshkin --- libcontainer/cgroups/fs/memory.go | 58 +++++++++++++++++-------------- 1 file changed, 32 insertions(+), 26 deletions(-) diff --git a/libcontainer/cgroups/fs/memory.go b/libcontainer/cgroups/fs/memory.go index a480ed94eb7..66e9c8324e7 100644 --- a/libcontainer/cgroups/fs/memory.go +++ b/libcontainer/cgroups/fs/memory.go @@ -57,19 +57,35 @@ func (s *MemoryGroup) Apply(path string, d *cgroupData) (err error) { return join(path, d.pid) } -func setMemoryAndSwap(path string, cgroup *configs.Cgroup) error { +func setMemory(path string, val int64) error { + if val == 0 { + return nil + } + + return fscommon.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(val, 10)) +} + +func setSwap(path string, val int64) error { + if val == 0 { + return nil + } + + return fscommon.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(val, 10)) +} + +func setMemoryAndSwap(path string, r *configs.Resources) error { // If the memory update is set to -1 and the swap is not explicitly // set, we should also set swap to -1, it means unlimited memory. - if cgroup.Resources.Memory == -1 && cgroup.Resources.MemorySwap == 0 { + if r.Memory == -1 && r.MemorySwap == 0 { // Only set swap if it's enabled in kernel if cgroups.PathExists(filepath.Join(path, cgroupMemorySwapLimit)) { - cgroup.Resources.MemorySwap = -1 + r.MemorySwap = -1 } } // When memory and swap memory are both set, we need to handle the cases // for updating container. - if cgroup.Resources.Memory != 0 && cgroup.Resources.MemorySwap != 0 { + if r.Memory != 0 && r.MemorySwap != 0 { curLimit, err := fscommon.GetCgroupParamUint(path, cgroupMemoryLimit) if err != nil { return err @@ -78,39 +94,29 @@ func setMemoryAndSwap(path string, cgroup *configs.Cgroup) error { // When update memory limit, we should adapt the write sequence // for memory and swap memory, so it won't fail because the new // value and the old value don't fit kernel's validation. - if cgroup.Resources.MemorySwap == -1 || curLimit < uint64(cgroup.Resources.MemorySwap) { - if err := fscommon.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil { - return err - } - if err := fscommon.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil { - return err - } - } else { - if err := fscommon.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil { + if r.MemorySwap == -1 || curLimit < uint64(r.MemorySwap) { + if err := setSwap(path, r.MemorySwap); err != nil { return err } - if err := fscommon.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil { - return err - } - } - } else { - if cgroup.Resources.Memory != 0 { - if err := fscommon.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil { - return err - } - } - if cgroup.Resources.MemorySwap != 0 { - if err := fscommon.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil { + if err := setMemory(path, r.Memory); err != nil { return err } + return nil } } + if err := setMemory(path, r.Memory); err != nil { + return err + } + if err := setSwap(path, r.MemorySwap); err != nil { + return err + } + return nil } func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error { - if err := setMemoryAndSwap(path, cgroup); err != nil { + if err := setMemoryAndSwap(path, cgroup.Resources); err != nil { return err } From 1880d2fc052d80dcf2b8e39fef3766ae25a9efc5 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Thu, 18 Feb 2021 19:41:45 -0800 Subject: [PATCH 04/10] libct/cg/fs/memory: handle EBUSY EBUSY when trying to set memory limit may mean the new limit is too low (lower than the current usage, and the kernel can't do anything). Provide a more specific error for such case. Signed-off-by: Kir Kolyshkin --- libcontainer/cgroups/fs/memory.go | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/libcontainer/cgroups/fs/memory.go b/libcontainer/cgroups/fs/memory.go index 66e9c8324e7..9b521fe2635 100644 --- a/libcontainer/cgroups/fs/memory.go +++ b/libcontainer/cgroups/fs/memory.go @@ -14,11 +14,15 @@ import ( "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" "github.com/opencontainers/runc/libcontainer/configs" + "github.com/pkg/errors" + "golang.org/x/sys/unix" ) const ( cgroupMemorySwapLimit = "memory.memsw.limit_in_bytes" cgroupMemoryLimit = "memory.limit_in_bytes" + cgroupMemoryUsage = "memory.usage_in_bytes" + cgroupMemoryMaxUsage = "memory.max_usage_in_bytes" ) type MemoryGroup struct { @@ -62,7 +66,23 @@ func setMemory(path string, val int64) error { return nil } - return fscommon.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(val, 10)) + err := fscommon.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(val, 10)) + if !errors.Is(err, unix.EBUSY) { + return err + } + + // EBUSY means the kernel can't set new limit as it's too low + // (lower than the current usage). Return more specific error. + usage, err := fscommon.GetCgroupParamUint(path, cgroupMemoryUsage) + if err != nil { + return err + } + max, err := fscommon.GetCgroupParamUint(path, cgroupMemoryMaxUsage) + if err != nil { + return err + } + + return errors.Errorf("unable to set memory limit to %d (current usage: %d, peak usage: %d)", val, usage, max) } func setSwap(path string, val int64) error { From 494f900e917abfea0906644285f37aa55316c686 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Fri, 19 Feb 2021 02:10:51 -0800 Subject: [PATCH 05/10] libct/cg/fscommon: rename/facelift GetCgroupParamKeyValue 1. This is the only function in the package with Get prefix that does not read a file (but parses a string). Rename accordingly, and convert the callers. GetCgroupParamKeyValue -> ParseKeyValue 2. Use strings.Split rather than strings.Fields. Split by a space is 2x faster, plus we can limit the splitting. The downside is we have to strip a newline in one of the callers. 3. Improve the doc and the code flow. 4. Fix a test case with invalid data (spaces at BOL). Signed-off-by: Kir Kolyshkin --- libcontainer/cgroups/fs/cpu.go | 2 +- libcontainer/cgroups/fs/cpu_test.go | 2 +- libcontainer/cgroups/fs/memory.go | 2 +- libcontainer/cgroups/fs2/cpu.go | 2 +- libcontainer/cgroups/fs2/hugetlb.go | 3 ++- libcontainer/cgroups/fs2/memory.go | 2 +- libcontainer/cgroups/fscommon/utils.go | 28 +++++++++++++------------- 7 files changed, 21 insertions(+), 20 deletions(-) diff --git a/libcontainer/cgroups/fs/cpu.go b/libcontainer/cgroups/fs/cpu.go index 1d5f455d6a4..194f3a542e3 100644 --- a/libcontainer/cgroups/fs/cpu.go +++ b/libcontainer/cgroups/fs/cpu.go @@ -97,7 +97,7 @@ func (s *CpuGroup) GetStats(path string, stats *cgroups.Stats) error { sc := bufio.NewScanner(f) for sc.Scan() { - t, v, err := fscommon.GetCgroupParamKeyValue(sc.Text()) + t, v, err := fscommon.ParseKeyValue(sc.Text()) if err != nil { return err } diff --git a/libcontainer/cgroups/fs/cpu_test.go b/libcontainer/cgroups/fs/cpu_test.go index 4a8ecf9950e..9adc9b31cab 100644 --- a/libcontainer/cgroups/fs/cpu_test.go +++ b/libcontainer/cgroups/fs/cpu_test.go @@ -112,7 +112,7 @@ func TestCpuStats(t *testing.T) { throttledTime = uint64(18446744073709551615) ) - cpuStatContent := fmt.Sprintf("nr_periods %d\n nr_throttled %d\n throttled_time %d\n", + cpuStatContent := fmt.Sprintf("nr_periods %d\nnr_throttled %d\nthrottled_time %d\n", nrPeriods, nrThrottled, throttledTime) helper.writeFileContents(map[string]string{ "cpu.stat": cpuStatContent, diff --git a/libcontainer/cgroups/fs/memory.go b/libcontainer/cgroups/fs/memory.go index 9b521fe2635..5d52d325793 100644 --- a/libcontainer/cgroups/fs/memory.go +++ b/libcontainer/cgroups/fs/memory.go @@ -188,7 +188,7 @@ func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error { sc := bufio.NewScanner(statsFile) for sc.Scan() { - t, v, err := fscommon.GetCgroupParamKeyValue(sc.Text()) + t, v, err := fscommon.ParseKeyValue(sc.Text()) if err != nil { return fmt.Errorf("failed to parse memory.stat (%q) - %v", sc.Text(), err) } diff --git a/libcontainer/cgroups/fs2/cpu.go b/libcontainer/cgroups/fs2/cpu.go index 0dffe6648e9..b7331338fd6 100644 --- a/libcontainer/cgroups/fs2/cpu.go +++ b/libcontainer/cgroups/fs2/cpu.go @@ -57,7 +57,7 @@ func statCpu(dirPath string, stats *cgroups.Stats) error { sc := bufio.NewScanner(f) for sc.Scan() { - t, v, err := fscommon.GetCgroupParamKeyValue(sc.Text()) + t, v, err := fscommon.ParseKeyValue(sc.Text()) if err != nil { return err } diff --git a/libcontainer/cgroups/fs2/hugetlb.go b/libcontainer/cgroups/fs2/hugetlb.go index 18cd411ce08..deeee34c5e1 100644 --- a/libcontainer/cgroups/fs2/hugetlb.go +++ b/libcontainer/cgroups/fs2/hugetlb.go @@ -4,6 +4,7 @@ package fs2 import ( "strconv" + "strings" "github.com/pkg/errors" @@ -48,7 +49,7 @@ func statHugeTlb(dirPath string, stats *cgroups.Stats) error { if err != nil { return errors.Wrap(err, "failed to read stats") } - _, value, err = fscommon.GetCgroupParamKeyValue(contents) + _, value, err = fscommon.ParseKeyValue(strings.TrimSuffix(contents, "\n")) if err != nil { return errors.Wrap(err, "failed to parse "+fileName) } diff --git a/libcontainer/cgroups/fs2/memory.go b/libcontainer/cgroups/fs2/memory.go index 81b28df75a5..cc8eaeb2eed 100644 --- a/libcontainer/cgroups/fs2/memory.go +++ b/libcontainer/cgroups/fs2/memory.go @@ -82,7 +82,7 @@ func statMemory(dirPath string, stats *cgroups.Stats) error { sc := bufio.NewScanner(statsFile) for sc.Scan() { - t, v, err := fscommon.GetCgroupParamKeyValue(sc.Text()) + t, v, err := fscommon.ParseKeyValue(sc.Text()) if err != nil { return errors.Wrapf(err, "failed to parse memory.stat (%q)", sc.Text()) } diff --git a/libcontainer/cgroups/fscommon/utils.go b/libcontainer/cgroups/fscommon/utils.go index 2e4e837f2b8..235c9885996 100644 --- a/libcontainer/cgroups/fscommon/utils.go +++ b/libcontainer/cgroups/fscommon/utils.go @@ -35,22 +35,22 @@ func ParseUint(s string, base, bitSize int) (uint64, error) { return value, nil } -// GetCgroupParamKeyValue parses a space-separated "name value" kind of cgroup -// parameter and returns its components. For example, "io_service_bytes 1234" -// will return as "io_service_bytes", 1234. -func GetCgroupParamKeyValue(t string) (string, uint64, error) { - parts := strings.Fields(t) - switch len(parts) { - case 2: - value, err := ParseUint(parts[1], 10, 64) - if err != nil { - return "", 0, fmt.Errorf("unable to convert to uint64: %v", err) - } +// ParseKeyValue parses a space-separated "name value" kind of cgroup +// parameter and returns its key as a string, and its value as uint64 +// (ParseUint is used to convert the value). For example, +// "io_service_bytes 1234" will be returned as "io_service_bytes", 1234. +func ParseKeyValue(t string) (string, uint64, error) { + parts := strings.SplitN(t, " ", 3) + if len(parts) != 2 { + return "", 0, fmt.Errorf("line %q is not in key value format", t) + } - return parts[0], value, nil - default: - return "", 0, ErrNotValidFormat + value, err := ParseUint(parts[1], 10, 64) + if err != nil { + return "", 0, fmt.Errorf("unable to convert to uint64: %v", err) } + + return parts[0], value, nil } // GetCgroupParamUint reads a single uint64 value from the specified cgroup file. From c54c3f852d11cef0f7a14f59c8bb355ba00d01e7 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Thu, 18 Feb 2021 20:45:46 -0800 Subject: [PATCH 06/10] libcontainer/notify_linux_v2: use fscommon.ReadFile This is to benefit from openat2() implementation, on kernels that support it. Theoretically this also improves security. Signed-off-by: Kir Kolyshkin --- libcontainer/notify_linux_v2.go | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/libcontainer/notify_linux_v2.go b/libcontainer/notify_linux_v2.go index cdab10ed609..1d81f38c2b7 100644 --- a/libcontainer/notify_linux_v2.go +++ b/libcontainer/notify_linux_v2.go @@ -3,19 +3,19 @@ package libcontainer import ( - "io/ioutil" "path/filepath" "strconv" "strings" "unsafe" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" "github.com/pkg/errors" "github.com/sirupsen/logrus" "golang.org/x/sys/unix" ) -func getValueFromCgroup(path, key string) (int, error) { - content, err := ioutil.ReadFile(path) +func getValueFromCgroup(path, file, key string) (int, error) { + content, err := fscommon.ReadFile(path, file) if err != nil { return 0, err } @@ -31,20 +31,18 @@ func getValueFromCgroup(path, key string) (int, error) { } func registerMemoryEventV2(cgDir, evName, cgEvName string) (<-chan struct{}, error) { - eventControlPath := filepath.Join(cgDir, evName) - cgEvPath := filepath.Join(cgDir, cgEvName) fd, err := unix.InotifyInit() if err != nil { return nil, errors.Wrap(err, "unable to init inotify") } // watching oom kill - evFd, err := unix.InotifyAddWatch(fd, eventControlPath, unix.IN_MODIFY) + evFd, err := unix.InotifyAddWatch(fd, filepath.Join(cgDir, evName), unix.IN_MODIFY) if err != nil { unix.Close(fd) return nil, errors.Wrap(err, "unable to add inotify watch") } // Because no `unix.IN_DELETE|unix.IN_DELETE_SELF` event for cgroup file system, so watching all process exited - cgFd, err := unix.InotifyAddWatch(fd, cgEvPath, unix.IN_MODIFY) + cgFd, err := unix.InotifyAddWatch(fd, filepath.Join(cgDir, cgEvName), unix.IN_MODIFY) if err != nil { unix.Close(fd) return nil, errors.Wrap(err, "unable to add inotify watch") @@ -79,12 +77,12 @@ func registerMemoryEventV2(cgDir, evName, cgEvName string) (<-chan struct{}, err } switch int(rawEvent.Wd) { case evFd: - oom, err := getValueFromCgroup(eventControlPath, "oom_kill") + oom, err := getValueFromCgroup(cgDir, evName, "oom_kill") if err != nil || oom > 0 { ch <- struct{}{} } case cgFd: - pids, err := getValueFromCgroup(cgEvPath, "populated") + pids, err := getValueFromCgroup(cgDir, cgEvName, "populated") if err != nil || pids == 0 { return } From 9fa65f6607de3c448fe7ff24cde6110718c6d61d Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Fri, 19 Feb 2021 02:27:25 -0800 Subject: [PATCH 07/10] libct/cg/fscommon: add GetValueByKey Generalize the libct/getValueFromCgroup() as fscommon.GetValueByKey(), and document it. No changes other than using fscommon.ParseUint to convert the value. Signed-off-by: Kir Kolyshkin --- libcontainer/cgroups/fscommon/utils.go | 20 ++++++++++++++++++++ libcontainer/notify_linux_v2.go | 22 ++-------------------- 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/libcontainer/cgroups/fscommon/utils.go b/libcontainer/cgroups/fscommon/utils.go index 235c9885996..db0caded151 100644 --- a/libcontainer/cgroups/fscommon/utils.go +++ b/libcontainer/cgroups/fscommon/utils.go @@ -53,6 +53,26 @@ func ParseKeyValue(t string) (string, uint64, error) { return parts[0], value, nil } +// GetValueByKey reads a key-value pairs from the specified cgroup file, +// and returns a value of the specified key. ParseUint is used for value +// conversion. +func GetValueByKey(path, file, key string) (uint64, error) { + content, err := ReadFile(path, file) + if err != nil { + return 0, err + } + + lines := strings.Split(string(content), "\n") + for _, line := range lines { + arr := strings.Split(line, " ") + if len(arr) == 2 && arr[0] == key { + return ParseUint(arr[1], 10, 64) + } + } + + return 0, nil +} + // GetCgroupParamUint reads a single uint64 value from the specified cgroup file. // If the value read is "max", the math.MaxUint64 is returned. func GetCgroupParamUint(path, file string) (uint64, error) { diff --git a/libcontainer/notify_linux_v2.go b/libcontainer/notify_linux_v2.go index 1d81f38c2b7..dd0ec290ecf 100644 --- a/libcontainer/notify_linux_v2.go +++ b/libcontainer/notify_linux_v2.go @@ -4,8 +4,6 @@ package libcontainer import ( "path/filepath" - "strconv" - "strings" "unsafe" "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" @@ -14,22 +12,6 @@ import ( "golang.org/x/sys/unix" ) -func getValueFromCgroup(path, file, key string) (int, error) { - content, err := fscommon.ReadFile(path, file) - if err != nil { - return 0, err - } - - lines := strings.Split(string(content), "\n") - for _, line := range lines { - arr := strings.Split(line, " ") - if len(arr) == 2 && arr[0] == key { - return strconv.Atoi(arr[1]) - } - } - return 0, nil -} - func registerMemoryEventV2(cgDir, evName, cgEvName string) (<-chan struct{}, error) { fd, err := unix.InotifyInit() if err != nil { @@ -77,12 +59,12 @@ func registerMemoryEventV2(cgDir, evName, cgEvName string) (<-chan struct{}, err } switch int(rawEvent.Wd) { case evFd: - oom, err := getValueFromCgroup(cgDir, evName, "oom_kill") + oom, err := fscommon.GetValueByKey(cgDir, evName, "oom_kill") if err != nil || oom > 0 { ch <- struct{}{} } case cgFd: - pids, err := getValueFromCgroup(cgDir, cgEvName, "populated") + pids, err := fscommon.GetValueByKey(cgDir, cgEvName, "populated") if err != nil || pids == 0 { return } From 7e137b90448533f25010e91f66a3d8681b936121 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Fri, 19 Feb 2021 11:48:44 -0800 Subject: [PATCH 08/10] libct/cg/fs2/hugetlb: use fscommon.GetValueByKey This makes the code simpler and more future-proof, in case any more values will appear in hugetlb.*.events. Signed-off-by: Kir Kolyshkin --- libcontainer/cgroups/fs2/hugetlb.go | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/libcontainer/cgroups/fs2/hugetlb.go b/libcontainer/cgroups/fs2/hugetlb.go index deeee34c5e1..feddc7aa043 100644 --- a/libcontainer/cgroups/fs2/hugetlb.go +++ b/libcontainer/cgroups/fs2/hugetlb.go @@ -4,7 +4,6 @@ package fs2 import ( "strconv" - "strings" "github.com/pkg/errors" @@ -45,14 +44,10 @@ func statHugeTlb(dirPath string, stats *cgroups.Stats) error { hugetlbStats.Usage = value fileName := "hugetlb." + pagesize + ".events" - contents, err := fscommon.ReadFile(dirPath, fileName) + value, err = fscommon.GetValueByKey(dirPath, fileName, "max") if err != nil { return errors.Wrap(err, "failed to read stats") } - _, value, err = fscommon.ParseKeyValue(strings.TrimSuffix(contents, "\n")) - if err != nil { - return errors.Wrap(err, "failed to parse "+fileName) - } hugetlbStats.Failcnt = value stats.HugetlbStats[pagesize] = hugetlbStats From 5d0ffbf9c81f3fbd21035c2690df4e9b7ce136f3 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Fri, 19 Feb 2021 02:53:27 -0800 Subject: [PATCH 09/10] runc start/run: report OOM In some cases, container init fails to start because it is killed by the kernel OOM killer. The errors returned by runc in such cases are semi-random and rather cryptic. Below are a few examples. On cgroup v1 + systemd cgroup driver: > process_linux.go:348: copying bootstrap data to pipe caused: write init-p: broken pipe > process_linux.go:352: getting the final child's pid from pipe caused: EOF On cgroup v2: > process_linux.go:495: container init caused: read init-p: connection reset by peer > process_linux.go:484: writing syncT 'resume' caused: write init-p: broken pipe This commits adds the OOM method to cgroup managers, which tells whether the container was OOM-killed. In case that has happened, the original error is discarded (unless --debug is set), and the new OOM error is reported instead: > ERRO[0000] container_linux.go:367: starting container process caused: container init was OOM-killed (memory limit too low?) Also, fix the rootless test cases that are failing because they expect an error in the first line, and we have an additional warning now: > unable to get oom kill count" error="no directory specified for memory.oom_control Signed-off-by: Kir Kolyshkin --- libcontainer/cgroups/cgroups.go | 3 +++ libcontainer/cgroups/fs/fs.go | 9 +++++++++ libcontainer/cgroups/fs2/fs2.go | 8 ++++++++ libcontainer/cgroups/systemd/v1.go | 4 ++++ libcontainer/cgroups/systemd/v2.go | 4 ++++ libcontainer/container_linux_test.go | 4 ++++ libcontainer/process_linux.go | 18 ++++++++++++++++++ tests/integration/cgroups.bats | 5 +++-- 8 files changed, 53 insertions(+), 2 deletions(-) diff --git a/libcontainer/cgroups/cgroups.go b/libcontainer/cgroups/cgroups.go index a16a68e9796..c418fbae9af 100644 --- a/libcontainer/cgroups/cgroups.go +++ b/libcontainer/cgroups/cgroups.go @@ -48,4 +48,7 @@ type Manager interface { // Whether the cgroup path exists or not Exists() bool + + // OOMKillCount reports OOM kill count for the cgroup. + OOMKillCount() (uint64, error) } diff --git a/libcontainer/cgroups/fs/fs.go b/libcontainer/cgroups/fs/fs.go index a42ce4535e9..bede91e2fdc 100644 --- a/libcontainer/cgroups/fs/fs.go +++ b/libcontainer/cgroups/fs/fs.go @@ -9,6 +9,7 @@ import ( "sync" "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" "github.com/opencontainers/runc/libcontainer/configs" libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils" "github.com/pkg/errors" @@ -421,3 +422,11 @@ func (m *manager) GetFreezerState() (configs.FreezerState, error) { func (m *manager) Exists() bool { return cgroups.PathExists(m.Path("devices")) } + +func OOMKillCount(path string) (uint64, error) { + return fscommon.GetValueByKey(path, "memory.oom_control", "oom_kill") +} + +func (m *manager) OOMKillCount() (uint64, error) { + return OOMKillCount(m.Path("memory")) +} diff --git a/libcontainer/cgroups/fs2/fs2.go b/libcontainer/cgroups/fs2/fs2.go index 3f0b9e0d7e1..733f9dcfdcf 100644 --- a/libcontainer/cgroups/fs2/fs2.go +++ b/libcontainer/cgroups/fs2/fs2.go @@ -257,3 +257,11 @@ func (m *manager) GetFreezerState() (configs.FreezerState, error) { func (m *manager) Exists() bool { return cgroups.PathExists(m.dirPath) } + +func OOMKillCount(path string) (uint64, error) { + return fscommon.GetValueByKey(path, "memory.events", "oom_kill") +} + +func (m *manager) OOMKillCount() (uint64, error) { + return OOMKillCount(m.dirPath) +} diff --git a/libcontainer/cgroups/systemd/v1.go b/libcontainer/cgroups/systemd/v1.go index 2ad1347e559..59a8b1e727e 100644 --- a/libcontainer/cgroups/systemd/v1.go +++ b/libcontainer/cgroups/systemd/v1.go @@ -450,3 +450,7 @@ func (m *legacyManager) GetFreezerState() (configs.FreezerState, error) { func (m *legacyManager) Exists() bool { return cgroups.PathExists(m.Path("devices")) } + +func (m *legacyManager) OOMKillCount() (uint64, error) { + return fs.OOMKillCount(m.Path("memory")) +} diff --git a/libcontainer/cgroups/systemd/v2.go b/libcontainer/cgroups/systemd/v2.go index 6f142265947..65abc0592ad 100644 --- a/libcontainer/cgroups/systemd/v2.go +++ b/libcontainer/cgroups/systemd/v2.go @@ -495,3 +495,7 @@ func (m *unifiedManager) GetFreezerState() (configs.FreezerState, error) { func (m *unifiedManager) Exists() bool { return cgroups.PathExists(m.path) } + +func (m *unifiedManager) OOMKillCount() (uint64, error) { + return fs2.OOMKillCount(m.path) +} diff --git a/libcontainer/container_linux_test.go b/libcontainer/container_linux_test.go index 73915997c19..2a9af6ad28e 100644 --- a/libcontainer/container_linux_test.go +++ b/libcontainer/container_linux_test.go @@ -55,6 +55,10 @@ func (m *mockCgroupManager) Exists() bool { return err == nil } +func (m *mockCgroupManager) OOMKillCount() (uint64, error) { + return 0, nil +} + func (m *mockCgroupManager) GetPaths() map[string]string { return m.paths } diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index 834268b9957..7e403c6af30 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -321,6 +321,24 @@ func (p *initProcess) start() (retErr error) { } defer func() { if retErr != nil { + // init might be killed by the kernel's OOM killer. + oom, err := p.manager.OOMKillCount() + if err != nil { + logrus.WithError(err).Warn("unable to get oom kill count") + } else if oom > 0 { + // Does not matter what the particular error was, + // its cause is most probably OOM, so report that. + const oomError = "container init was OOM-killed (memory limit too low?)" + + if logrus.GetLevel() >= logrus.DebugLevel { + // Only show the original error if debug is set, + // as it is not generally very useful. + retErr = newSystemErrorWithCause(retErr, oomError) + } else { + retErr = newSystemError(errors.New(oomError)) + } + } + // terminate the process to ensure we can remove cgroups if err := ignoreTerminateErrors(p.terminate()); err != nil { logrus.WithError(err).Warn("unable to terminate initProcess") diff --git a/tests/integration/cgroups.bats b/tests/integration/cgroups.bats index a319ed04218..38d0630471d 100644 --- a/tests/integration/cgroups.bats +++ b/tests/integration/cgroups.bats @@ -79,7 +79,7 @@ function setup() { runc run -d --console-socket "$CONSOLE_SOCKET" test_cgroups_permissions [ "$status" -eq 1 ] - [[ ${lines[0]} == *"permission denied"* ]] + [[ "$output" == *"applying cgroup configuration"*"permission denied"* ]] } @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error" { @@ -92,7 +92,8 @@ function setup() { runc run -d --console-socket "$CONSOLE_SOCKET" test_cgroups_permissions [ "$status" -eq 1 ] - [[ ${lines[0]} == *"rootless needs no limits + no cgrouppath when no permission is granted for cgroups"* ]] || [[ ${lines[0]} == *"cannot set pids limit: container could not join or create cgroup"* ]] + [[ "$output" == *"rootless needs no limits + no cgrouppath when no permission is granted for cgroups"* ]] || + [[ "$output" == *"cannot set pids limit: container could not join or create cgroup"* ]] } @test "runc create (limits + cgrouppath + permission on the cgroup dir) succeeds" { From 38b2dd391d2647766816a09db9c349588e7a3f51 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Mon, 22 Feb 2021 17:13:47 -0800 Subject: [PATCH 10/10] runc exec: report possible OOM kill An exec may fail due to memory shortage (cgroup memory limits being too tight), and an error message provided in this case is clueless: > $ sudo ../runc/runc exec xx56 top > ERRO[0000] exec failed: container_linux.go:367: starting container process caused: read init-p: connection reset by peer Same as the previous commit for run/start, check the OOM kill counter and report an OOM kill. The differences from run are 1. The container is already running and OOM kill counter might not be zero. This is why we have to read the counter before exec and after it failed. 2. An unrelated OOM kill event might occur in parallel with our exec (and I see no way to find out which process was killed, except to parse kernel logs which seems excessive and not very reliable). This is why we report _possible_ OOM kill. With this commit, the error message looks like: > ERRO[0000] exec failed: container_linux.go:367: starting container process caused: process_linux.go:105: possibly OOM-killed caused: read init-p: connection reset by peer Signed-off-by: Kir Kolyshkin --- libcontainer/container_linux.go | 1 + libcontainer/process_linux.go | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 3dca29e4c3f..65e2eace8c8 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -570,6 +570,7 @@ func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, messageSockP intelRdtPath: state.IntelRdtPath, messageSockPair: messageSockPair, logFilePair: logFilePair, + manager: c.cgroupManager, config: c.newInitConfig(p), process: p, bootstrapData: data, diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index 7e403c6af30..6f2828770ee 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -65,6 +65,7 @@ type setnsProcess struct { logFilePair filePair cgroupPaths map[string]string rootlessCgroups bool + manager cgroups.Manager intelRdtPath string config *initConfig fds []string @@ -88,6 +89,8 @@ func (p *setnsProcess) signal(sig os.Signal) error { func (p *setnsProcess) start() (retErr error) { defer p.messageSockPair.parent.Close() + // get the "before" value of oom kill count + oom, _ := p.manager.OOMKillCount() err := p.cmd.Start() // close the write-side of the pipes (controlled by child) p.messageSockPair.child.Close() @@ -97,6 +100,10 @@ func (p *setnsProcess) start() (retErr error) { } defer func() { if retErr != nil { + if newOom, err := p.manager.OOMKillCount(); err == nil && newOom != oom { + // Someone in this cgroup was killed, this _might_ be us. + retErr = newSystemErrorWithCause(retErr, "possibly OOM-killed") + } err := ignoreTerminateErrors(p.terminate()) if err != nil { logrus.WithError(err).Warn("unable to terminate setnsProcess")