diff --git a/contrib/rootless-cni-infra/Containerfile b/contrib/rootless-cni-infra/Containerfile deleted file mode 100644 index 4324f39d2b..0000000000 --- a/contrib/rootless-cni-infra/Containerfile +++ /dev/null @@ -1,36 +0,0 @@ -ARG GOLANG_VERSION=1.15 -ARG ALPINE_VERSION=3.12 -ARG CNI_VERSION=v0.8.0 -ARG CNI_PLUGINS_VERSION=v0.8.7 -ARG DNSNAME_VERSION=v1.1.1 - -FROM golang:${GOLANG_VERSION}-alpine${ALPINE_VERSION} AS golang-base -RUN apk add --no-cache git - -FROM golang-base AS cnitool -RUN git clone https://github.com/containernetworking/cni /go/src/github.com/containernetworking/cni -WORKDIR /go/src/github.com/containernetworking/cni -ARG CNI_VERSION -RUN git checkout ${CNI_VERSION} -RUN go build -o /cnitool ./cnitool - -FROM golang-base AS dnsname -RUN git clone https://github.com/containers/dnsname /go/src/github.com/containers/dnsname -WORKDIR /go/src/github.com/containers/dnsname -ARG DNSNAME_VERSION -RUN git checkout ${DNSNAME_VERSION} -RUN go build -o /dnsname ./plugins/meta/dnsname - -FROM alpine:${ALPINE_VERSION} -RUN apk add --no-cache curl dnsmasq iptables ip6tables iproute2 -ARG TARGETARCH -ARG CNI_PLUGINS_VERSION -RUN mkdir -p /opt/cni/bin && \ - curl -fsSL https://github.com/containernetworking/plugins/releases/download/${CNI_PLUGINS_VERSION}/cni-plugins-linux-${TARGETARCH}-${CNI_PLUGINS_VERSION}.tgz | tar xz -C /opt/cni/bin -COPY --from=cnitool /cnitool /usr/local/bin -COPY --from=dnsname /dnsname /opt/cni/bin -COPY rootless-cni-infra /usr/local/bin -ENV CNI_PATH=/opt/cni/bin -CMD ["sleep", "infinity"] - -ENV ROOTLESS_CNI_INFRA_VERSION=5 diff --git a/contrib/rootless-cni-infra/README.md b/contrib/rootless-cni-infra/README.md index c43b4cf491..dc21791a7e 100644 --- a/contrib/rootless-cni-infra/README.md +++ b/contrib/rootless-cni-infra/README.md @@ -7,19 +7,16 @@ Infra container for CNI-in-slirp4netns. 
When a CNI network is specified for `podman run` in rootless mode, Podman launches the `rootless-cni-infra` container to execute CNI plugins inside slirp4netns. The infra container is created per user, by executing an equivalent of: -`podman run -d --name rootless-cni-infra --pid=host --privileged -v $HOME/.config/cni/net.d:/etc/cni/net.d rootless-cni-infra`. +`podman run -d --name rootless-cni-infra --pid=host --privileged --rootfs /`. The infra container is automatically deleted when no CNI network is in use. Podman then allocates a CNI netns in the infra container, by executing an equivalent of: -`podman exec rootless-cni-infra rootless-cni-infra alloc $CONTAINER_ID $NETWORK_NAME $POD_NAME`. +`podman exec rootless-cni-infra rootless-cni-infra alloc $CONTAINER_ID $NETWORK_NAME`. The allocated netns is deallocated when the container is being removed, by executing an equivalent of: `podman exec rootless-cni-infra rootless-cni-infra dealloc $CONTAINER_ID $NETWORK_NAME`. -The container images live on `quay.io/libpod/rootless-cni-infra`. The tags have the format `$version-$architecture`. Please make sure to increase the version number in the Containerfile (i.e., `ROOTLESS_CNI_INFRA_VERSION`) when applying changes to this directory. After committing the changes, upload the image(s) with the corresponding tag. 
- ## Directory layout -* `/run/rootless-cni-infra/${CONTAINER_ID}/pid`: PID of the `sleep infinity` process that corresponds to the allocated netns -* `/run/rootless-cni-infra/${CONTAINER_ID}/attached/${NETWORK_NAME}`: CNI result +* `/run/rootless-cni-infra/${CONTAINER_ID}/pid`: PID of the `sleep` process that corresponds to the allocated netns * `/run/rootless-cni-infra/${CONTAINER_ID}/attached-args/${NETWORK_NAME}`: CNI args diff --git a/contrib/rootless-cni-infra/rootless-cni-infra b/contrib/rootless-cni-infra/rootless-cni-infra deleted file mode 100755 index cceb8d817a..0000000000 --- a/contrib/rootless-cni-infra/rootless-cni-infra +++ /dev/null @@ -1,181 +0,0 @@ -#!/bin/sh -set -eu - -ARG0="$0" -BASE="/run/rootless-cni-infra" - -wait_unshare_net() { - pid="$1" - # NOTE: busybox shell doesn't support the `for ((i=0; i < $MAX; i++)); do foo; done` statement - i=0 - while :; do - if [ "$(readlink /proc/self/ns/net)" != "$(readlink /proc/${pid}/ns/net)" ]; then - break - fi - sleep 0.1 - if [ $i -ge 10 ]; then - echo >&2 "/proc/${pid}/ns/net cannot be unshared" - exit 1 - fi - i=$((i + 1)) - done -} - -# CLI subcommand: "alloc $CONTAINER_ID $NETWORK_NAME $POD_NAME $IP $MAC $CAP_ARGS" -cmd_entrypoint_alloc() { - if [ "$#" -ne 6 ]; then - echo >&2 "Usage: $ARG0 alloc CONTAINER_ID NETWORK_NAME POD_NAME IP MAC CAP_ARGS" - exit 1 - fi - - ID="$1" - NET="$2" - K8S_POD_NAME="$3" - IP="$4" - MAC="$5" - CAP_ARGS="$6" - - dir="${BASE}/${ID}" - mkdir -p "${dir}/attached" "${dir}/attached-args" - - pid="" - if [ -f "${dir}/pid" ]; then - pid=$(cat "${dir}/pid") - else - unshare -n sleep infinity & - pid="$!" 
- wait_unshare_net "${pid}" - echo "${pid}" >"${dir}/pid" - nsenter -t "${pid}" -n ip link set lo up - fi - CNI_ARGS="IgnoreUnknown=1;K8S_POD_NAME=${K8S_POD_NAME}" - if [ "$IP" ]; then - CNI_ARGS="$CNI_ARGS;IP=${IP}" - fi - if [ "$MAC" ]; then - CNI_ARGS="$CNI_ARGS;MAC=${MAC}" - fi - if [ "$CAP_ARGS" ]; then - CAP_ARGS="$CAP_ARGS" - fi - nwcount=$(find "${dir}/attached" -type f | wc -l) - CNI_IFNAME="eth${nwcount}" - export CNI_ARGS CNI_IFNAME CAP_ARGS - cnitool add "${NET}" "/proc/${pid}/ns/net" >"${dir}/attached/${NET}" - echo "${CNI_ARGS}" >"${dir}/attached-args/${NET}" - - # return the result - ns="/proc/${pid}/ns/net" - echo "{\"ns\":\"${ns}\"}" -} - -# CLI subcommand: "dealloc $CONTAINER_ID $NETWORK_NAME" -cmd_entrypoint_dealloc() { - if [ "$#" -ne 2 ]; then - echo >&2 "Usage: $ARG0 dealloc CONTAINER_ID NETWORK_NAME" - exit 1 - fi - - ID=$1 - NET=$2 - - dir="${BASE}/${ID}" - if [ ! -f "${dir}/pid" ]; then - exit 0 - fi - pid=$(cat "${dir}/pid") - if [ -f "${dir}/attached-args/${NET}" ]; then - CNI_ARGS=$(cat "${dir}/attached-args/${NET}") - export CNI_ARGS - fi - cnitool del "${NET}" "/proc/${pid}/ns/net" - rm -f "${dir}/attached/${NET}" "${dir}/attached-args/${NET}" - - nwcount=$(find "${dir}/attached" -type f | wc -l) - if [ "${nwcount}" = 0 ]; then - kill -9 "${pid}" - rm -rf "${dir}" - fi - - # return empty json - echo "{}" -} - -# CLI subcommand: "is-idle" -cmd_entrypoint_is_idle() { - if [ ! 
-d ${BASE} ]; then - echo '{"idle": true}' - elif [ -z "$(ls -1 ${BASE})" ]; then - echo '{"idle": true}' - else - echo '{"idle": false}' - fi -} - -# CLI subcommand: "print-cni-result $CONTAINER_ID $NETWORK_NAME" -cmd_entrypoint_print_cni_result() { - if [ "$#" -ne 2 ]; then - echo >&2 "Usage: $ARG0 print-cni-result CONTAINER_ID NETWORK_NAME" - exit 1 - fi - - ID=$1 - NET=$2 - - # the result shall be CNI JSON - cat "${BASE}/${ID}/attached/${NET}" -} - -# CLI subcommand: "print-netns-path $CONTAINER_ID" -cmd_entrypoint_print_netns_path() { - if [ "$#" -ne 1 ]; then - echo >&2 "Usage: $ARG0 print-netns-path CONTAINER_ID" - exit 1 - fi - - ID=$1 - - pid=$(cat "${BASE}/${ID}/pid") - path="/proc/${pid}/ns/net" - - # return the result - echo "{\"path\":\"${path}\"}" -} - -# CLI subcommand: "help" -cmd_entrypoint_help() { - echo "Usage: ${ARG0} COMMAND" - echo - echo "Rootless CNI Infra container" - echo - echo "Commands:" - echo " alloc Allocate a netns" - echo " dealloc Deallocate a netns" - echo " is-idle Print whether the infra container is idle" - echo " print-cni-result Print CNI result" - echo " print-netns-path Print netns path" - echo " help Print help" - echo " version Print version" -} - -# CLI subcommand: "version" -cmd_entrypoint_version() { - echo "{\"version\": \"${ROOTLESS_CNI_INFRA_VERSION}\"}" -} - -# parse args -command="${1:-}" -if [ -z "$command" ]; then - echo >&2 "No command was specified. Run \`${ARG0} help\` to see the usage." - exit 1 -fi - -command_func=$(echo "cmd_entrypoint_${command}" | sed -e "s/-/_/g") -if ! command -v "${command_func}" >/dev/null 2>&1; then - echo >&2 "Unknown command: ${command}. Run \`${ARG0} help\` to see the usage." 
- exit 1 -fi - -# start the command func -shift -"${command_func}" "$@" diff --git a/libpod/network/create.go b/libpod/network/create.go index c58d625756..210b1a44b8 100644 --- a/libpod/network/create.go +++ b/libpod/network/create.go @@ -11,7 +11,6 @@ import ( "github.com/containernetworking/cni/pkg/version" "github.com/containers/common/pkg/config" "github.com/containers/podman/v2/pkg/domain/entities" - "github.com/containers/podman/v2/pkg/rootless" "github.com/containers/podman/v2/pkg/util" "github.com/pkg/errors" "github.com/sirupsen/logrus" @@ -223,9 +222,8 @@ func createBridge(name string, options entities.NetworkCreateOptions, runtimeCon plugins = append(plugins, NewPortMapPlugin()) plugins = append(plugins, NewFirewallPlugin()) plugins = append(plugins, NewTuningPlugin()) - // if we find the dnsname plugin or are rootless, we add configuration for it - // the rootless-cni-infra container has the dnsname plugin always installed - if (HasDNSNamePlugin(runtimeConfig.Network.CNIPluginDirs) || rootless.IsRootless()) && !options.DisableDNS { + // if we find the dnsname plugin installed include it in the config + if HasDNSNamePlugin(runtimeConfig.Network.CNIPluginDirs) && !options.DisableDNS { if options.Internal { logrus.Warnf("dnsname and --internal networks are incompatible. 
dnsname plugin not configured for network %s", name) } else { diff --git a/libpod/rootless_cni_linux.go b/libpod/rootless_cni_linux.go index 94ae062aa4..0d525989d2 100644 --- a/libpod/rootless_cni_linux.go +++ b/libpod/rootless_cni_linux.go @@ -3,19 +3,21 @@ package libpod import ( + "bufio" "bytes" "context" "io" + "os" "path/filepath" - "runtime" + "strconv" cnitypes "github.com/containernetworking/cni/pkg/types/current" "github.com/containernetworking/plugins/pkg/ns" "github.com/containers/podman/v2/libpod/define" - "github.com/containers/podman/v2/libpod/image" - "github.com/containers/podman/v2/pkg/env" - "github.com/containers/podman/v2/pkg/util" + "github.com/containers/podman/v2/libpod/network" + rootlesscni "github.com/containers/podman/v2/pkg/rootless/cni" "github.com/containers/storage/pkg/lockfile" + "github.com/containers/storage/pkg/mount" "github.com/hashicorp/go-multierror" spec "github.com/opencontainers/runtime-spec/specs-go" "github.com/opencontainers/runtime-tools/generate" @@ -23,16 +25,42 @@ import ( "github.com/sirupsen/logrus" ) -// Built from ../contrib/rootless-cni-infra. 
-var rootlessCNIInfraImage = map[string]string{ - "amd64": "quay.io/libpod/rootless-cni-infra@sha256:adf352454666f7ce9ca3e1098448b5ee18f89c4516471ec99447ec9ece917f36", // 5-amd64 -} - const ( - rootlessCNIInfraContainerNamespace = "podman-system" - rootlessCNIInfraContainerName = "rootless-cni-infra" + rootlessCNIInfraContainerNamespace = "podman-system" + rootlessCNIInfraContainerName = "rootless-cni-infra" + rootlessCNIInfraContainerVersionLabelName = "rootless-cni-infra-container-version" ) +func getRootlessCNIConfig(c *Container, network string) (*rootlesscni.Config, error) { + conf := rootlesscni.Config{ + ID: c.ID(), + Network: network, + CNIPodName: getCNIPodName(c), + PluginPaths: c.runtime.config.Network.CNIPluginDirs, + NetConfPath: c.runtime.config.Network.NetworkConfigDir, + } + // add static ip if given + if c.config.StaticIP != nil { + conf.IP = c.config.StaticIP.String() + } + // add static mac if given + if c.config.StaticMAC != nil { + conf.MAC = c.config.StaticMAC.String() + } + // add aliases as CapabilityArgs + aliases, err := c.runtime.state.GetAllNetworkAliases(c) + if err != nil { + return nil, err + } + conf.Aliases = aliases + if eth, exists := c.state.NetInterfaceDescriptions.getInterfaceByName(network); exists { + conf.InterfaceName = eth + } else { + return nil, errors.Errorf("no network interface name for %s", network) + } + return &conf, nil +} + // AllocRootlessCNI allocates a CNI netns inside the rootless CNI infra container. // Locks "rootless-cni-infra.lck". // @@ -47,6 +75,20 @@ func AllocRootlessCNI(ctx context.Context, c *Container) (ns.NetNS, []*cnitypes. 
if len(networks) == 0 { return nil, nil, errors.New("rootless CNI networking requires that the container has joined at least one CNI network") } + // check early that all given networks exists + for _, nw := range networks { + exists, err := network.Exists(c.runtime.config, nw) + if err != nil { + return nil, nil, err + } + if !exists { + return nil, nil, errors.Errorf("CNI network %q not found", nw) + } + } + // Update container map of interface descriptions + if err := c.setupNetworkDescriptions(networks); err != nil { + return nil, nil, err + } l, err := getRootlessCNIInfraLock(c.runtime) if err != nil { return nil, nil, err @@ -57,34 +99,14 @@ func AllocRootlessCNI(ctx context.Context, c *Container) (ns.NetNS, []*cnitypes. if err != nil { return nil, nil, err } - k8sPodName := getCNIPodName(c) // passed to CNI as K8S_POD_NAME - ip := "" - if c.config.StaticIP != nil { - ip = c.config.StaticIP.String() - } - mac := "" - if c.config.StaticMAC != nil { - mac = c.config.StaticMAC.String() - } - aliases, err := c.runtime.state.GetAllNetworkAliases(c) - if err != nil { - return nil, nil, err - } - capArgs := "" - // add network aliases json encoded as capabilityArgs for cni - if len(aliases) > 0 { - capabilityArgs := make(map[string]interface{}) - capabilityArgs["aliases"] = aliases - b, err := json.Marshal(capabilityArgs) + cniResults := make([]*cnitypes.Result, len(networks)) + for i, nw := range networks { + rootlessCNIConf, err := getRootlessCNIConfig(c, nw) if err != nil { return nil, nil, err } - capArgs = string(b) - } - cniResults := make([]*cnitypes.Result, len(networks)) - for i, nw := range networks { - cniRes, err := rootlessCNIInfraCallAlloc(infra, c.ID(), nw, k8sPodName, ip, mac, capArgs) + cniRes, err := rootlessCNIInfraCallAlloc(infra, rootlessCNIConf) if err != nil { return nil, nil, err } @@ -124,7 +146,11 @@ func DeallocRootlessCNI(ctx context.Context, c *Container) error { } var errs *multierror.Error for _, nw := range networks { - err := 
rootlessCNIInfraCallDealloc(infra, c.ID(), nw) + rootlessCNIConf, err := getRootlessCNIConfig(c, nw) + if err != nil { + return err + } + err = rootlessCNIInfraCallDealloc(infra, rootlessCNIConf) if err != nil { errs = multierror.Append(errs, err) } @@ -134,11 +160,21 @@ func DeallocRootlessCNI(ctx context.Context, c *Container) error { logrus.Warn(err) } logrus.Debugf("rootless CNI: removing infra container %q", infra.ID()) + // Kill the infra container. There is no need to cleanup files because + // are stored in tmpfs so we can just sigkill it. It is important to kill the + // container before we remove it otherwise we have locking issues. + if err := infra.Kill(9); err != nil { + logrus.Error(err) + } infra.lock.Lock() defer infra.lock.Unlock() if err := c.runtime.removeContainer(ctx, infra, true, false, true); err != nil { return err } + rootfs := filepath.Join(c.runtime.GetStore().RunRoot(), rootlessCNIInfraContainerName) + if err := mount.RecursiveUnmount(rootfs); err != nil { + return errors.Wrapf(err, "failed to unmount rootfs for %s", rootlessCNIInfraContainerName) + } logrus.Debugf("rootless CNI: removed infra container %q", infra.ID()) } return errs.ErrorOrNil() @@ -161,57 +197,94 @@ func getCNIPodName(c *Container) string { return c.Name() } -func rootlessCNIInfraCallAlloc(infra *Container, id, nw, k8sPodName, ip, mac, capArgs string) (*cnitypes.Result, error) { - logrus.Debugf("rootless CNI: alloc %q, %q, %q, %q, %q, %q", id, nw, k8sPodName, ip, mac, capArgs) +func rootlessCNIInfraCallAlloc(infra *Container, cfg *rootlesscni.Config) (*cnitypes.Result, error) { + logrus.Debugf("rootless CNI: alloc %v", cfg) var err error + var cniRes cnitypes.Result + var cniResBytes []byte + labels := infra.Labels() + // we might want to check for the version here but for now the existence of the label is fine + if _, ok := labels[rootlessCNIInfraContainerVersionLabelName]; ok { + bytes, err := json.Marshal(cfg) + if err != nil { + return nil, err + } + cniResBytes, 
err = rootlessCNIInfraExec(infra, bytes, "alloc") + if err != nil { + return nil, err + } + } else { + // old rootless-cni-infra container api + // keep for backwarts compatibility with previous version to support live migration + // TODO: remove this in a future release maybe 4.0? - _, err = rootlessCNIInfraExec(infra, "alloc", id, nw, k8sPodName, ip, mac, capArgs) - if err != nil { - return nil, err - } - cniResStr, err := rootlessCNIInfraExec(infra, "print-cni-result", id, nw) - if err != nil { - return nil, err + // add network aliases json encoded as capabilityArgs for cni + capArgs := "" + if len(cfg.Aliases) > 0 { + capabilityArgs := make(map[string]interface{}) + capabilityArgs["aliases"] = cfg.Aliases + b, err := json.Marshal(capabilityArgs) + if err != nil { + return nil, err + } + capArgs = string(b) + } + _, err = rootlessCNIInfraExec(infra, nil, "alloc", cfg.ID, cfg.Network, cfg.CNIPodName, cfg.IP, cfg.MAC, capArgs) + if err != nil { + return nil, err + } + cniResBytes, err = rootlessCNIInfraExec(infra, nil, "print-cni-result", cfg.ID, cfg.Network) + if err != nil { + return nil, err + } } - var cniRes cnitypes.Result - if err := json.Unmarshal([]byte(cniResStr), &cniRes); err != nil { - return nil, errors.Wrapf(err, "unmarshaling as cnitypes.Result: %q", cniResStr) + if err := json.Unmarshal(cniResBytes, &cniRes); err != nil { + return nil, errors.Wrapf(err, "unmarshaling as cnitypes.Result: %q", string(cniResBytes)) } return &cniRes, nil } -func rootlessCNIInfraCallDealloc(infra *Container, id, nw string) error { - logrus.Debugf("rootless CNI: dealloc %q, %q", id, nw) - _, err := rootlessCNIInfraExec(infra, "dealloc", id, nw) +func rootlessCNIInfraCallDealloc(infra *Container, cfg *rootlesscni.Config) error { + logrus.Debugf("rootless CNI: dealloc %v", cfg) + var err error + labels := infra.Labels() + // we might want to check for the version here but for now the existence of the label is fine + if _, ok := 
labels[rootlessCNIInfraContainerVersionLabelName]; ok { + var bytes []byte + bytes, err = json.Marshal(cfg) + if err != nil { + return err + } + _, err = rootlessCNIInfraExec(infra, bytes, "dealloc") + } else { + // old rootless-cni-infra container api + // keep for backwarts compatibility with previous version to support live migration + // TODO: remove this in a future release maybe 4.0? + _, err = rootlessCNIInfraExec(infra, nil, "dealloc", cfg.ID, cfg.Network) + } return err } func rootlessCNIInfraIsIdle(infra *Container) (bool, error) { - type isIdle struct { - Idle bool `json:"idle"` - } - resStr, err := rootlessCNIInfraExec(infra, "is-idle") + resBytes, err := rootlessCNIInfraExec(infra, nil, "is-idle") if err != nil { return false, err } - var res isIdle - if err := json.Unmarshal([]byte(resStr), &res); err != nil { - return false, errors.Wrapf(err, "unmarshaling as isIdle: %q", resStr) + var res rootlesscni.IsIdle + if err := json.Unmarshal(resBytes, &res); err != nil { + return false, errors.Wrapf(err, "unmarshaling as IsIdle: %q", string(resBytes)) } return res.Idle, nil } -func rootlessCNIInfraGetNS(infra *Container, id string) (ns.NetNS, error) { - type printNetnsPath struct { - Path string `json:"path"` - } - resStr, err := rootlessCNIInfraExec(infra, "print-netns-path", id) +func rootlessCNIInfraGetNS(infra *Container, cid string) (ns.NetNS, error) { + resBytes, err := rootlessCNIInfraExec(infra, nil, "print-netns-path", cid) if err != nil { return nil, err } - var res printNetnsPath - if err := json.Unmarshal([]byte(resStr), &res); err != nil { - return nil, errors.Wrapf(err, "unmarshaling as printNetnsPath: %q", resStr) + var res rootlesscni.PrintNetnsPath + if err := json.Unmarshal(resBytes, &res); err != nil { + return nil, errors.Wrapf(err, "unmarshaling as PrintNetnsPath: %q", string(resBytes)) } nsObj, err := ns.GetNS(res.Path) if err != nil { @@ -250,6 +323,12 @@ func ensureRootlessCNIInfraContainerRunning(ctx context.Context, r *Runtime) (*C 
logrus.Debugf("rootless CNI: infra container %q is already running", c.ID()) return c, nil } + // we have to mount the rootfs before we start it + rootfs := filepath.Join(r.GetStore().RunRoot(), rootlessCNIInfraContainerName) + err = mountRootlessCNIINfraRootfs(rootfs) + if err != nil { + return nil, err + } logrus.Debugf("rootless CNI: infra container %q is %q, being started", c.ID(), st.State) if err := c.initAndStart(ctx); err != nil { return nil, err @@ -259,18 +338,6 @@ func ensureRootlessCNIInfraContainerRunning(ctx context.Context, r *Runtime) (*C } func startRootlessCNIInfraContainer(ctx context.Context, r *Runtime) (*Container, error) { - imageName, ok := rootlessCNIInfraImage[runtime.GOARCH] - if !ok { - return nil, errors.Errorf("cannot find rootless-podman-network-sandbox image for %s", runtime.GOARCH) - } - logrus.Debugf("rootless CNI: ensuring image %q to exist", imageName) - newImage, err := r.ImageRuntime().New(ctx, imageName, "", "", nil, nil, - image.SigningOptions{}, nil, util.PullImageMissing) - if err != nil { - return nil, err - } - logrus.Debugf("rootless CNI: image %q is ready", imageName) - g, err := generate.New("linux") if err != nil { return nil, err @@ -281,46 +348,72 @@ func startRootlessCNIInfraContainer(ctx context.Context, r *Runtime) (*Container return nil, err } g.RemoveMount("/proc") - procMount := spec.Mount{ - Destination: "/proc", - Type: "bind", - Source: "/proc", - Options: []string{"rbind", "nosuid", "noexec", "nodev"}, + + // need writable /run + run := spec.Mount{ + Destination: "/run", + Type: "tmpfs", + Source: "none", + Options: []string{"rw", "nosuid", "nodev"}, + } + g.AddMount(run) + + // mount /var as tmpfs + // On ungraceful shutdown cni leaves the ip allocation files in place. + // This causes issues when we try to use containers with the same ip again. + // The best way to clean them up is using a tmpfs mount. 
These files do not have to + // be persistent since the network namespace is destroyed anyway if the container exits. + // CNI tries to write to /var/lib/cni however we cannot mount there because + // it might not exists and we have no permission to create this directory. + cni := spec.Mount{ + Destination: "/var", + Type: "tmpfs", + Source: "none", + Options: []string{"rw", "nosuid", "nodev"}, + } + g.AddMount(cni) + + g.SetProcessArgs([]string{rootlesscni.InfraCmd, "sleep"}) + + // get the current path this executable so we can mount it + podmanexe, err := os.Executable() + if err != nil { + return nil, err } - g.AddMount(procMount) - // Mount CNI networks - etcCNINetD := spec.Mount{ - Destination: "/etc/cni/net.d", + podman := spec.Mount{ + // mount with different name to trigger the reexec for rooless-cni-infra + Destination: rootlesscni.InfraCmd, Type: "bind", - Source: r.config.Network.NetworkConfigDir, + Source: podmanexe, Options: []string{"ro", "bind"}, } - g.AddMount(etcCNINetD) + g.AddMount(podman) - inspectData, err := newImage.Inspect(ctx) + rootfs := filepath.Join(r.GetStore().RunRoot(), rootlessCNIInfraContainerName) + err = mountRootlessCNIINfraRootfs(rootfs) if err != nil { return nil, err } - imageEnv, err := env.ParseSlice(inspectData.Config.Env) - if err != nil { - return nil, err - } - for k, v := range imageEnv { - g.AddProcessEnv(k, v) - } - if len(inspectData.Config.Cmd) == 0 { - return nil, errors.Errorf("rootless CNI infra image %q has no command specified", imageName) + + g.SetRootReadonly(true) + g.SetHostname(rootlessCNIInfraContainerName) + + infraLabels := map[string]string{ + rootlessCNIInfraContainerVersionLabelName: strconv.Itoa(rootlesscni.Version), } - g.SetProcessArgs(inspectData.Config.Cmd) - var options []CtrCreateOption - options = append(options, WithRootFSFromImage(newImage.ID(), imageName, imageName)) - options = append(options, WithCtrNamespace(rootlessCNIInfraContainerNamespace)) - options = append(options, 
WithName(rootlessCNIInfraContainerName)) - options = append(options, WithPrivileged(true)) - options = append(options, WithSecLabels([]string{"disable"})) - options = append(options, WithRestartPolicy("always")) - options = append(options, WithNetNS(nil, false, "slirp4netns", nil)) + options := []CtrCreateOption{ + WithRootFS(rootfs), + WithCtrNamespace(rootlessCNIInfraContainerNamespace), + WithName(rootlessCNIInfraContainerName), + WithPrivileged(true), + // label=disable doesn't work correct for a rootfs mount + // set labels manually to unconfined + WithSecLabels([]string{"user:unconfined_u", "role:system_r", "type:unconfined_t"}), + WithRestartPolicy("always"), + WithNetNS(nil, false, "slirp4netns", nil), + WithLabels(infraLabels), + } c, err := r.NewContainer(ctx, g.Config, options...) if err != nil { return nil, err @@ -334,14 +427,39 @@ func startRootlessCNIInfraContainer(ctx context.Context, r *Runtime) (*Container return c, nil } -func rootlessCNIInfraExec(c *Container, args ...string) (string, error) { - cmd := "rootless-cni-infra" +func mountRootlessCNIINfraRootfs(rootfs string) error { + if err := os.MkdirAll(rootfs, 0700); err != nil { + return err + } + // bind mount the rootfs recursive in the userns + // only the root will be read-only + if err := mount.Mount("/", rootfs, "bind", "rbind,rprivate,ro"); err != nil { + return errors.Wrapf(err, "failed to mount rootfs for %s", rootlessCNIInfraContainerName) + } + return nil +} + +func rootlessCNIInfraExec(c *Container, stdin []byte, args ...string) ([]byte, error) { + cmd := rootlesscni.InfraCmd + labels := c.Labels() + if _, ok := labels[rootlessCNIInfraContainerVersionLabelName]; !ok { + // the old infra container had a different exec cmd + // change it for backwarts compatibility + cmd = rootlessCNIInfraContainerName + } var ( outB bytes.Buffer errB bytes.Buffer streams define.AttachStreams config ExecConfig ) + + if len(stdin) > 0 { + logrus.Debugf("rootlessCNIInfraExec: stdin=%s", string(stdin)) + 
r := bufio.NewReader(bytes.NewReader(stdin)) + streams.InputStream = r + streams.AttachInput = true + } streams.OutputStream = &nopWriteCloser{Writer: &outB} streams.ErrorStream = &nopWriteCloser{Writer: &errB} streams.AttachOutput = true @@ -354,13 +472,13 @@ func rootlessCNIInfraExec(c *Container, args ...string) (string, error) { logrus.Debugf("rootlessCNIInfraExec: c.ID()=%s, config=%+v, streams=%v, end (code=%d, err=%v)", c.ID(), config, streams, code, err) if err != nil { - return "", err + return nil, err } if code != 0 { - return "", errors.Errorf("command %s %v in container %s failed with status %d, stdout=%q, stderr=%q", + return nil, errors.Errorf("command %s %v in container %s failed with status %d, stdout=%q, stderr=%q", cmd, args, c.ID(), code, outB.String(), errB.String()) } - return outB.String(), nil + return outB.Bytes(), nil } type nopWriteCloser struct { diff --git a/pkg/rootless/cni/rootless_cni.go b/pkg/rootless/cni/rootless_cni.go new file mode 100644 index 0000000000..9d1e3b23a1 --- /dev/null +++ b/pkg/rootless/cni/rootless_cni.go @@ -0,0 +1,391 @@ +package cni + +import ( + "context" + "encoding/json" + "fmt" + "io" + "io/ioutil" + "os" + "path" + "strconv" + "syscall" + "time" + + "github.com/containernetworking/cni/libcni" + "github.com/containers/storage/pkg/reexec" + "github.com/pkg/errors" + "github.com/vishvananda/netlink" +) + +const ( + // InfraCmd - always use a absolute path we should not rely on $PATH + // this also has to be a user writable location and /run is writable for the user + InfraCmd = "/run/rootless-cni-infra-exe" + infraCreateNetNSCmd = "rootless-cni-infra-create-netns" + basePath = "/run/rootless-cni-infra" + // Version - you should bump the Version if you do breaking changes to this script + Version = 6 +) + +// Config passed via stdin as json +type Config struct { + // ID - container ID + ID string + // Network - network name + Network string + // CNIPodName - name used for the dns entry by the dnsname plugin + 
// printErrorf writes a formatted message to stderr, prefixed with "Error: "
// and terminated by a newline.
func printErrorf(format string, a ...interface{}) {
	fmt.Fprintf(os.Stderr, "Error: "+format+"\n", a...)
}

// printJSONResult marshals v to JSON and prints it as a single line on stdout.
//
// If v cannot be marshaled, the error is reported on stderr and nothing is
// written to stdout. Previously an empty line was still printed after the
// error, which handed the reader of stdout a bogus, non-JSON result.
func printJSONResult(v interface{}) {
	b, err := json.Marshal(v)
	if err != nil {
		printErrorf("%s", err)
		return
	}
	fmt.Println(string(b))
}
// dirIsEmpty reports whether the directory called name has no entries.
//
// It opens the directory and asks for at most one entry name. Getting back no
// names together with io.EOF means the directory is empty. Any other outcome
// yields false along with whatever error the read produced (nil when an entry
// was found). A failure to open the directory is returned unchanged.
func dirIsEmpty(name string) (bool, error) {
	dir, openErr := os.Open(name)
	if openErr != nil {
		return false, openErr
	}
	defer dir.Close()

	// One entry is enough to decide; Readdirnames signals "no entries" via io.EOF.
	entries, readErr := dir.Readdirnames(1)
	isEmpty := readErr == io.EOF && len(entries) == 0
	if !isEmpty {
		return false, readErr
	}
	return true, nil
}
read RootlessCNIConfig json") + } + return &config, nil +} + +func getNetNamespacePath(pidfile string) (string, error) { + b, err := ioutil.ReadFile(pidfile) + if err != nil { + return "", errors.Wrap(err, "failed to read pid file") + } + pid := string(b) + return path.Join("/proc", pid, "ns", "net"), err +} + +func createNetNamespace(pidfile string) (string, error) { + rcmd := reexec.Command(infraCreateNetNSCmd, pidfile) + rcmd.Stderr = os.Stderr + rcmd.Stdout = os.Stdout + if err := rcmd.Run(); err != nil { + return "", errors.Wrap(err, "failed to create network namespace") + } + return getNetNamespacePath(pidfile) +} + +func createCNIconfigs(cfg *Config) (*libcni.CNIConfig, *libcni.NetworkConfigList, *libcni.RuntimeConf) { + args := [][2]string{ + {"IgnoreUnknown", "1"}, + {"K8S_POD_NAME", cfg.CNIPodName}, + } + // add static ip if given + if cfg.IP != "" { + args = append(args, [2]string{"IP", cfg.IP}) + } + // add static mac if given + if cfg.MAC != "" { + args = append(args, [2]string{"MAC", cfg.MAC}) + } + + // add aliases + capabilityArgs := make(map[string]interface{}) + if len(cfg.Aliases) > 0 { + capabilityArgs["aliases"] = cfg.Aliases + } + + rt := &libcni.RuntimeConf{ + ContainerID: cfg.ID, + IfName: cfg.InterfaceName, + Args: args, + CapabilityArgs: capabilityArgs, + } + + netconf, err := libcni.LoadConfList(cfg.NetConfPath, cfg.Network) + if err != nil { + cleanupErr := cleanupFiles(getPaths(cfg.ID, cfg.Network)) + printErrorf("%v", cleanupErr) + exit(err) + } + + cninet := libcni.NewCNIConfig(cfg.PluginPaths, nil) + + return cninet, netconf, rt +} + +func getPaths(cid, net string) (string, string) { + base := path.Join(basePath, cid) + pidfile := path.Join(base, "pid") + netfile := path.Join(base, "networks", net) + return pidfile, netfile +} + +func alloc() { + conf, err := readConfigFromStdin() + if err != nil { + exit(err) + } + pidfile, netfile := getPaths(conf.ID, conf.Network) + ns, err := getNetNamespacePath(pidfile) + if err != nil && 
!os.IsNotExist(errors.Cause(err)) { + exit(err) + } + // if namespace path does not exists create new namespace + if os.IsNotExist(errors.Cause(err)) { + ns, err = createNetNamespace(pidfile) + if err != nil { + exit(err) + } + } + + if err := os.MkdirAll(path.Dir(netfile), 0700); err != nil { + exit(err) + } + // create a file to keep track of the attached networks + _, err = os.Create(netfile) + if err != nil { + exit(err) + } + + // prepare the cni configs + cninet, netconf, rt := createCNIconfigs(conf) + rt.NetNS = ns + + // call cni to add the network + res, err := cninet.AddNetworkList(context.TODO(), netconf, rt) + if err != nil { + // cleanup to make sure we don't have dangling files + // this is important to detect is-idle correctly + cleanupErr := cleanupFiles(pidfile, netfile) + if cleanupErr != nil { + printErrorf("%v", cleanupErr) + } + exit(errors.Wrapf(err, "failed to attach to cni network %s", conf.Network)) + } + // print res to stdout + res.Print() +} + +func dealloc() { + conf, err := readConfigFromStdin() + if err != nil { + exit(err) + } + pidfile, netfile := getPaths(conf.ID, conf.Network) + ns, err := getNetNamespacePath(pidfile) + if err != nil && !os.IsNotExist(err) { + exit(err) + } + if os.IsNotExist(err) { + // if the file does not exists the namespace is probably already deleted + // exit without error + exit(nil) + } + + // prepare the cni configs + cninet, netconf, rt := createCNIconfigs(conf) + rt.NetNS = ns + + // call cni to remove the network + err = cninet.DelNetworkList(context.TODO(), netconf, rt) + if err != nil { + exit(errors.Wrapf(err, "failed to detach cni network %s", conf.Network)) + } + + err = cleanupFiles(pidfile, netfile) + if err != nil { + exit(err) + } + + // print empty json result + // we have no information to return + fmt.Println("{}") +} + +func cleanupFiles(pidfile, netfile string) error { + // remove the config file + err := os.Remove(netfile) + if err != nil && !os.IsNotExist(err) { + return err + } + + // 
check if the config directory is empty + empty, err := dirIsEmpty(path.Dir(netfile)) + if err != nil && !os.IsNotExist(err) { + return err + } + if empty { + // if it is empty no more networks are attached to this container + // therefore kill the net namespace + var piderr error + b, err := ioutil.ReadFile(pidfile) + if err == nil { + pid, err := strconv.Atoi(string(b)) + if err == nil { + // kill the pause process which keeps the net ns alive + err = syscall.Kill(pid, syscall.SIGKILL) + if err != nil { + piderr = errors.Wrap(err, "ailed to kill the pause process") + } + } else { + piderr = errors.Wrap(err, "failed to parse the pid") + } + } else { + piderr = errors.Wrap(err, "failed to read the pid file") + } + // remove all remaining configuration files for this container + // always remove even if the pidfile parsing failed to ensure we do not have dangling files + err = os.RemoveAll(path.Dir(pidfile)) + if err != nil { + return err + } + return piderr + } + return nil +} diff --git a/test/system/500-networking.bats b/test/system/500-networking.bats index 0d976a6af5..121149f835 100644 --- a/test/system/500-networking.bats +++ b/test/system/500-networking.bats @@ -144,13 +144,6 @@ load helpers run_podman network rm $mynetname run_podman 1 network rm $mynetname - - # rootless CNI leaves behind an image pulled by SHA, hence with no tag. - # Remove it if present; we can only remove it by ID. - run_podman images --format '{{.Id}}' rootless-cni-infra - if [ -n "$output" ]; then - run_podman rmi $output - fi } @test "podman network reload" {