From 1c9f152edafea505404c016bf7de4c21c2c78046 Mon Sep 17 00:00:00 2001 From: Paul Holzinger Date: Thu, 27 Jun 2024 14:48:49 +0200 Subject: [PATCH] criu swrk: return child error to caller In the podman CI we are seeing a weird flake during criu version detection[1]. The write to the socket just fails with broken pipe. The logical thing to assume here is that the child exited. However the current code never reports back the child error from wait. The cleanup error is now added to the returned error so the caller sees both. The output is not captured as this causes hangs when the fds are passed into child processes. As errors.Join is used from the std lib bump the minimum go version to 1.20. [1] https://github.com/containers/podman/issues/18856 Signed-off-by: Paul Holzinger --- main.go | 24 +++++++++++++++++------- phaul/client.go | 11 +++++++++-- test/main.go | 5 ++++- 3 files changed, 30 insertions(+), 10 deletions(-) diff --git a/main.go b/main.go index 6906c9811..8f29d2ee2 100644 --- a/main.go +++ b/main.go @@ -61,13 +61,19 @@ func (c *Criu) Prepare() error { } // Cleanup cleans up -func (c *Criu) Cleanup() { +func (c *Criu) Cleanup() error { + var errs []error if c.swrkCmd != nil { - c.swrkSk.Close() + if err := c.swrkSk.Close(); err != nil { + errs = append(errs, err) + } c.swrkSk = nil - _ = c.swrkCmd.Wait() + if err := c.swrkCmd.Wait(); err != nil { + errs = append(errs, fmt.Errorf("criu swrk failed: %w", err)) + } c.swrkCmd = nil } + return errors.Join(errs...) } func (c *Criu) sendAndRecv(reqB []byte) ([]byte, int, error) { @@ -99,9 +105,7 @@ func (c *Criu) doSwrk(reqType rpc.CriuReqType, opts *rpc.CriuOpts, nfy Notify) e return nil } -func (c *Criu) doSwrkWithResp(reqType rpc.CriuReqType, opts *rpc.CriuOpts, nfy Notify, features *rpc.CriuFeatures) (*rpc.CriuResp, error) { - var resp *rpc.CriuResp - +func (c *Criu) doSwrkWithResp(reqType rpc.CriuReqType, opts *rpc.CriuOpts, nfy Notify, features *rpc.CriuFeatures) (resp *rpc.CriuResp, retErr error) { req := rpc.CriuReq{ Type: &reqType, Opts: opts, @@ -121,7 +125,13 @@ func (c *Criu) doSwrkWithResp(reqType rpc.CriuReqType, opts *rpc.CriuOpts, nfy N return nil, err } - defer c.Cleanup() + defer func() { + // append any cleanup errors to the returned error + err := c.Cleanup() + if err != nil { + retErr = errors.Join(retErr, err) + } + }() } for { diff --git a/phaul/client.go b/phaul/client.go index 8925ad1c8..e84ef6f1b 100644 --- a/phaul/client.go +++ b/phaul/client.go @@ -1,6 +1,7 @@ package phaul import ( + "errors" "fmt" "github.com/checkpoint-restore/go-criu/v7" @@ -55,7 +56,7 @@ func isLastIter(iter int, stats *stats.DumpStatsEntry, prevStats *stats.DumpStat } // Migrate function -func (pc *Client) Migrate() error { +func (pc *Client) Migrate() (retErr error) { criu := criu.MakeCriu() psi := rpc.CriuPageServerInfo{ Fd: proto.Int32(int32(pc.cfg.Memfd)), @@ -72,7 +73,13 @@ func (pc *Client) Migrate() error { return err } - defer criu.Cleanup() + defer func() { + // append any cleanup errors to the returned error + err := criu.Cleanup() + if err != nil { + retErr = errors.Join(retErr, err) + } + }() imgs, err := preparePhaulImages(pc.cfg.Wdir) if err != nil { diff --git a/test/main.go b/test/main.go index 4ec864858..a3043da65 100644 --- a/test/main.go +++ b/test/main.go @@ -202,7 +202,10 @@ func main() { log.Fatalln("dump failed: ", err) } - c.Cleanup() + err = c.Cleanup() + if err != nil { + log.Fatalln("cleanup failed: ", err) + } case "restore": log.Println("Restoring") img, err := os.Open(os.Args[2])