From 0e8ae5997e810a590c113457f4eec22dc7c56b19 Mon Sep 17 00:00:00 2001
From: Thomas Ferrandiz
Date: Mon, 12 Feb 2024 15:23:23 +0000
Subject: [PATCH] Implement nftables masquerading for flannel

This PR allows flannel to use nftables natively instead of iptables.
nftables is primarily used to masquerade the traffic coming from the pods.
The PR also fixes the clean-up mechanism in the iptables implementation.
---
 Documentation/configuration.md               |  22 ++
 Documentation/kube-flannel.yml               |   1 +
 Makefile                                     |   2 +-
 e2e/Dockerfile                               |   1 +
 e2e/run-e2e-tests.sh                         | 123 ++++++-
 go.mod                                       |   1 +
 go.sum                                       |   4 +
 images/Dockerfile                            |   2 +-
 main.go                                      |  27 +-
 pkg/subnet/config.go                         |  29 +-
 pkg/trafficmngr/iptables/iptables.go         | 141 ++++++---
 pkg/trafficmngr/iptables/iptables_restore.go |   8 +
 pkg/trafficmngr/iptables/iptables_windows.go |  15 +-
 pkg/trafficmngr/nftables/nftables.go         | 317 +++++++++++++++++++
 pkg/trafficmngr/nftables/nftables_windows.go |  48 +++
 pkg/trafficmngr/nftables/utils.go            |  58 ++++
 pkg/trafficmngr/trafficmngr.go               |  16 +-
 17 files changed, 744 insertions(+), 71 deletions(-)
 create mode 100644 pkg/trafficmngr/nftables/nftables.go
 create mode 100644 pkg/trafficmngr/nftables/nftables_windows.go
 create mode 100644 pkg/trafficmngr/nftables/utils.go

diff --git a/Documentation/configuration.md b/Documentation/configuration.md
index a5ea6637f0..2a59019640 100644
--- a/Documentation/configuration.md
+++ b/Documentation/configuration.md
@@ -19,6 +19,9 @@ The value of the config is a JSON dictionary with the following keys:
 * `EnableIPv6` (bool): Enables ipv6 support
   Defaults to `false`
 
+* `EnableNFTables` (bool): (EXPERIMENTAL) If set to true, flannel uses nftables instead of iptables to masquerade the traffic.
+  Defaults to `false`
+
 * `SubnetLen` (integer): The size of the subnet allocated to each host.
   Defaults to 24 (i.e. /24) unless `Network` was configured to be smaller than a /22 in which case it is two less than the network.
@@ -128,3 +131,22 @@ FLANNEL_IPMASQ=true
 ## IPv6 only
 To use an IPv6-only environment use the same configuration of the Dual-stack section to enable IPv6 and add "EnableIPv4": false in the net-conf.json of the kube-flannel-cfg ConfigMap.
 In case of IPv6-only setup, please use the docker.io IPv6-only endpoint as described in the following link: https://www.docker.com/blog/beta-ipv6-support-on-docker-hub-registry/
+
+## nftables mode
+To enable `nftables` mode in flannel, set `EnableNFTables` to true in the flannel configuration.
+
+Note: to test with kube-proxy, use kubeadm with the following configuration:
+```yaml
+apiVersion: kubeadm.k8s.io/v1beta3
+kind: ClusterConfiguration
+kubernetesVersion: v1.29.0
+controllerManager:
+  extraArgs:
+    feature-gates: NFTablesProxyMode=true
+---
+apiVersion: kubeproxy.config.k8s.io/v1alpha1
+kind: KubeProxyConfiguration
+mode: "nftables"
+featureGates:
+  NFTablesProxyMode: true
+```
diff --git a/Documentation/kube-flannel.yml b/Documentation/kube-flannel.yml
index e3a9c816bd..be0266e28c 100644
--- a/Documentation/kube-flannel.yml
+++ b/Documentation/kube-flannel.yml
@@ -98,6 +98,7 @@ data:
   net-conf.json: |
     {
       "Network": "10.244.0.0/16",
+      "EnableNFTables": false,
      "Backend": {
        "Type": "vxlan"
      }
diff --git a/Makefile b/Makefile
index eaf698a58a..6ef7823fcf 100644
--- a/Makefile
+++ b/Makefile
@@ -6,7 +6,7 @@ QEMU_VERSION=v3.0.0
 BASH_UNIT_VERSION=v2.3.0
 
 # Default tag and architecture.
Can be overridden -TAG?=$(shell git describe --tags --always) +TAG?=$(shell git describe --tags --dirty --always) ARCH?=amd64 # Only enable CGO (and build the UDP backend) on AMD64 ifeq ($(ARCH),amd64) diff --git a/e2e/Dockerfile b/e2e/Dockerfile index 41f6bf6916..09cf9bba6c 100644 --- a/e2e/Dockerfile +++ b/e2e/Dockerfile @@ -11,6 +11,7 @@ RUN set -x \ curl \ tar gzip\ iptables \ + nftables \ iproute2 \ iputils \ && if [ "${ARCH?required}" != "amd64" ]; then \ diff --git a/e2e/run-e2e-tests.sh b/e2e/run-e2e-tests.sh index d23dfa3bbe..bb4a51eb50 100644 --- a/e2e/run-e2e-tests.sh +++ b/e2e/run-e2e-tests.sh @@ -38,11 +38,12 @@ EOF write-flannel-conf(){ local backend=$1 + local enable_nftables=$2 cp ../Documentation/kube-flannel.yml ./kube-flannel.yml yq -i 'select(.kind == "DaemonSet").spec.template.spec.containers[0].image |= strenv(FLANNEL_IMAGE)' ./kube-flannel.yml yq -i 'select(.kind == "DaemonSet").spec.template.spec.initContainers[1].image |= strenv(FLANNEL_IMAGE)' ./kube-flannel.yml - export flannel_conf="{ \"Network\": \"$FLANNEL_NET\", \"Backend\": { \"Type\": \"${backend}\" } }" + export flannel_conf="{ \"Network\": \"$FLANNEL_NET\", \"Backend\": { \"Type\": \"${backend}\" }, \"EnableNFTables\": ${enable_nftables} }" yq -i 'select(.metadata.name == "kube-flannel-cfg").data."net-conf.json" |= strenv(flannel_conf)' ./kube-flannel.yml @@ -55,10 +56,11 @@ write-flannel-conf(){ # This is not used at the moment since github runners don't support dual-stack networking write-flannel-conf-dual-stack(){ local backend=$1 + local enable_nftables=$2 cp ../Documentation/kube-flannel.yml ./kube-flannel.yml yq -i 'select(.kind == "DaemonSet").spec.template.spec.containers[0].image |= strenv(FLANNEL_IMAGE)' ./kube-flannel.yml - export flannel_conf="{ \"EnableIPv6\": true, \"Network\": \"$FLANNEL_NET\", \"IPv6Network\":\"${FLANNEL_IP6NET}\", \"Backend\": { \"Type\": \"${backend}\" } }" + export flannel_conf="{ \"EnableIPv6\": true, \"Network\": \"$FLANNEL_NET\", \"IPv6Network\":\"${FLANNEL_IP6NET}\", \"Backend\": { \"Type\": \"${backend}\" }, \"EnableNFTables\": ${enable_nftables} }" yq -i 'select(.metadata.name == "kube-flannel-cfg").data."net-conf.json" |= strenv(flannel_conf)' ./kube-flannel.yml } @@ -67,6 +69,10 @@ install-flannel() { kubectl --kubeconfig="${HOME}/.kube/config" apply -f ./kube-flannel.yml } +delete-flannel() { + kubectl --kubeconfig="${HOME}/.kube/config" delete -f ./kube-flannel.yml +} + get_pod_ip() { local pod_name=$1 kubectl --kubeconfig="${HOME}/.kube/config" get pod ${pod_name} --template '{{.status.podIP}}' @@ -125,8 +131,9 @@ perf() { prepare_test() { local backend=$1 + local enable_nftables=${2:-false} # install flannel version to test - write-flannel-conf ${backend} + write-flannel-conf ${backend} ${enable_nftables} install-flannel # wait for nodes to be ready @@ -150,18 +157,32 @@ test_vxlan() { prepare_test vxlan pings check_iptables + delete-flannel + check_iptables_removed +} + +test_vxlan_nft() { + prepare_test vxlan true + pings + check_nftables + delete-flannel + check_nftables_removed } test_wireguard() { prepare_test wireguard pings check_iptables + delete-flannel + check_iptables_removed } test_host-gw() { prepare_test host-gw pings check_iptables + delete-flannel + check_iptables_removed } if [[ ${ARCH} == "amd64" ]]; then @@ -169,6 +190,8 @@ test_udp() { prepare_test udp pings check_iptables + delete-flannel + check_iptables_removed } fi @@ -176,6 +199,8 @@ test_ipip() { prepare_test ipip pings check_iptables + delete-flannel + check_iptables_removed } 
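The `"EnableNFTables": ${enable_nftables}` field that `write-flannel-conf` injects into net-conf.json above is plain JSON, decoded by flannel's subnet configuration parser (the `EnableNFTables` field added to `pkg/subnet.Config` later in this patch). A minimal sketch of the decoding semantics; `netConf` here is a hypothetical stand-in for the real `Config` type, not part of the PR:

```go
package main

import (
	"encoding/json"
	"fmt"
)

// netConf mirrors only the pkg/subnet.Config fields relevant here.
type netConf struct {
	Network        string
	EnableNFTables bool
	Backend        json.RawMessage
}

func main() {
	// The shape produced by write-flannel-conf above.
	raw := `{ "Network": "10.42.0.0/16", "Backend": { "Type": "vxlan" }, "EnableNFTables": true }`
	var c netConf
	if err := json.Unmarshal([]byte(raw), &c); err != nil {
		panic(err)
	}
	// An absent field decodes to false, matching the documented default.
	fmt.Printf("network=%s nftables=%v\n", c.Network, c.EnableNFTables)
}
```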
 test_perf_vxlan() {
@@ -260,3 +285,95 @@ $(docker exec --privileged local-worker /usr/sbin/iptables -t filter -S FLANNEL-
 "$(docker exec --privileged local-leader /usr/sbin/iptables -t filter -S FORWARD)
$(docker exec --privileged local-leader /usr/sbin/iptables -t filter -S FLANNEL-FWD)" "Host 2 has not expected forward rules"
 }
+
+check_iptables_removed() {
+  local worker_podcidr=$(get_pod_cidr local-worker)
+  local leader_podcidr=$(get_pod_cidr local-leader)
+  read -r -d '' POSTROUTING_RULES_WORKER << EOM
+-N FLANNEL-POSTRTG
+EOM
+  read -r -d '' POSTROUTING_RULES_LEADER << EOM
+-N FLANNEL-POSTRTG
+EOM
+  read -r -d '' FORWARD_RULES << EOM
+-P FORWARD ACCEPT
+-A FORWARD -m conntrack --ctstate NEW -m comment --comment "kubernetes load balancer firewall" -j KUBE-PROXY-FIREWALL
+-A FORWARD -m comment --comment "kubernetes forwarding rules" -j KUBE-FORWARD
+-A FORWARD -m conntrack --ctstate NEW -m comment --comment "kubernetes service portals" -j KUBE-SERVICES
+-A FORWARD -m conntrack --ctstate NEW -m comment --comment "kubernetes externally-visible service portals" -j KUBE-EXTERNAL-SERVICES
+-N FLANNEL-FWD
+EOM
+  # check that masquerade & forward rules have been removed
+  assert_equals "$POSTROUTING_RULES_WORKER" \
+    "$(docker exec --privileged local-worker /usr/sbin/iptables -t nat -S POSTROUTING | grep FLANNEL)$(docker exec --privileged local-worker /usr/sbin/iptables -t nat -S FLANNEL-POSTRTG)" "Host 1 does not have the expected postrouting rules"
+  assert_equals "$POSTROUTING_RULES_LEADER" \
+    "$(docker exec --privileged local-leader /usr/sbin/iptables -t nat -S POSTROUTING | grep FLANNEL)$(docker exec --privileged local-leader /usr/sbin/iptables -t nat -S FLANNEL-POSTRTG)" "Host 2 does not have the expected postrouting rules"
+  assert_equals "$FORWARD_RULES" \
+    "$(docker exec --privileged local-worker /usr/sbin/iptables -t filter -S FORWARD)
+$(docker exec --privileged local-worker /usr/sbin/iptables -t filter -S FLANNEL-FWD -w 5)" "Host 1 does not have the expected forward rules"
+  assert_equals "$FORWARD_RULES" \
+    "$(docker exec --privileged local-leader /usr/sbin/iptables -t filter -S FORWARD)
+$(docker exec --privileged local-leader /usr/sbin/iptables -t filter -S FLANNEL-FWD)" "Host 2 does not have the expected forward rules"
+}
+
+###nftables
+check_nftables() {
+  local worker_podcidr=$(get_pod_cidr local-worker)
+  local leader_podcidr=$(get_pod_cidr local-leader)
+  read -r -d '' POSTROUTING_RULES_WORKER << EOM
+table ip flannel-ipv4 {
+	chain postrtg {
+		type nat hook postrouting priority srcnat; policy accept;
+		meta mark 0x00004000 return
+		ip saddr ${worker_podcidr} ip daddr 10.42.0.0/16 return
+		ip saddr 10.42.0.0/16 ip daddr ${worker_podcidr} return
+		ip saddr != ${worker_podcidr} ip daddr 10.42.0.0/16 return
+		ip saddr 10.42.0.0/16 ip daddr != 224.0.0.0/4 masquerade fully-random
+		ip saddr != 10.42.0.0/16 ip daddr 10.42.0.0/16 masquerade fully-random
+	}
+}
+EOM
+  read -r -d '' POSTROUTING_RULES_LEADER << EOM
+table ip flannel-ipv4 {
+	chain postrtg {
+		type nat hook postrouting priority srcnat; policy accept;
+		meta mark 0x00004000 return
+		ip saddr ${leader_podcidr} ip daddr 10.42.0.0/16 return
+		ip saddr 10.42.0.0/16 ip daddr ${leader_podcidr} return
+		ip saddr != ${leader_podcidr} ip daddr 10.42.0.0/16 return
+		ip saddr 10.42.0.0/16 ip daddr != 224.0.0.0/4 masquerade fully-random
+		ip saddr != 10.42.0.0/16 ip daddr 10.42.0.0/16 masquerade fully-random
+	}
+}
+EOM
+  read -r -d '' FORWARD_RULES << EOM
+table ip flannel-ipv4 {
+	chain forward {
+		type filter hook input priority filter; policy accept;
+		ip saddr
10.42.0.0/16 accept + ip daddr 10.42.0.0/16 accept + } +} +EOM + # check masquerade & forward rules + assert_equals "$POSTROUTING_RULES_WORKER" \ + "$(docker exec --privileged local-worker /usr/sbin/nft list chain flannel-ipv4 postrtg)" "Node worker does not have expected postrouting rules" + assert_equals "$POSTROUTING_RULES_LEADER" \ + "$(docker exec --privileged local-leader /usr/sbin/nft list chain flannel-ipv4 postrtg)" "Node leader does not have expected postrouting rules" + assert_equals "$FORWARD_RULES" \ + "$(docker exec --privileged local-worker /usr/sbin/nft list chain flannel-ipv4 forward)" "Node worker does not have expected forward rules" + assert_equals "$FORWARD_RULES" \ + "$(docker exec --privileged local-leader /usr/sbin/nft list chain flannel-ipv4 forward)" "Node leader does not have expected forward rules" +} + +check_nftables_removed() { + # check masquerade & forward rules + assert_equals "" \ + "$(docker exec --privileged local-worker /usr/sbin/nft list chain flannel-ipv4 postrtg)" "Node worker has unexpected postrouting rules" + assert_equals "" \ + "$(docker exec --privileged local-leader /usr/sbin/nft list chain flannel-ipv4 postrtg)" "Node leader has unexpected postrouting rules" + assert_equals "" \ + "$(docker exec --privileged local-worker /usr/sbin/nft list chain flannel-ipv4 forward)" "Node worker has unexpected forward rules" + assert_equals "" \ + "$(docker exec --privileged local-leader /usr/sbin/nft list chain flannel-ipv4 forward)" "Node leader has unexpected forward rules" +} \ No newline at end of file diff --git a/go.mod b/go.mod index 6d9205d63d..d41b5a0674 100644 --- a/go.mod +++ b/go.mod @@ -35,6 +35,7 @@ require ( github.com/avast/retry-go/v4 v4.5.1 github.com/tencentcloud/tencentcloud-sdk-go/tencentcloud/common v1.0.872 github.com/tencentcloud/tencentcloud-sdk-go/tencentcloud/vpc v1.0.872 + sigs.k8s.io/knftables v0.0.14 ) require ( diff --git a/go.sum b/go.sum index 520673ae92..8293d45f32 100644 --- a/go.sum +++ b/go.sum @@ -255,6 +255,8 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/lithammer/dedent v1.1.0 h1:VNzHMVCBNG1j0fh3OrsFRkVUwStdDArbgBWoPAffktY= +github.com/lithammer/dedent v1.1.0/go.mod h1:jrXYCQtgg0nJiN+StA2KgR7w6CiQNv9Fd/Z9BP0jIOc= github.com/mailru/easyjson v0.0.0-20190614124828-94de47d64c63/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= github.com/mailru/easyjson v0.0.0-20190626092158-b2ccc519800e/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= github.com/mailru/easyjson v0.7.6 h1:8yTIVnZgCoiM1TgqoeTl+LfU5Jg6/xL3QhGQnimLYnA= @@ -781,6 +783,8 @@ rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= rsc.io/sampler v1.3.0/go.mod h1:T1hPZKmBbMNahiBKFy5HrXp6adAjACjK9JXDnKaTXpA= sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2 h1:iXTIw73aPyC+oRdyqqvVJuloN1p0AC/kzH07hu3NE+k= sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= +sigs.k8s.io/knftables v0.0.14 h1:VzKQoDMCGBOH8c85sGrWSXSPCS0XrIpEfOlcCLBXiC0= +sigs.k8s.io/knftables v0.0.14/go.mod h1:f/5ZLKYEUPUhVjUCg6l80ACdL7CIIyeL0DxfgojGRTk= sigs.k8s.io/structured-merge-diff/v4 v4.2.3 h1:PRbqxJClWWYMNV1dhaG4NsibJbArud9kFxnAMREiWFE= sigs.k8s.io/structured-merge-diff/v4 v4.2.3/go.mod 
h1:qjx8mGObPmV2aSZepjQjbmb2ihdVs8cGKBraizNC69E= sigs.k8s.io/yaml v1.3.0 h1:a2VclLzOGrwOHDiV8EfBGhvjHvP46CtW5j6POvhYGGo= diff --git a/images/Dockerfile b/images/Dockerfile index fd60008878..3c8d248120 100644 --- a/images/Dockerfile +++ b/images/Dockerfile @@ -27,7 +27,7 @@ RUN export GOOS=$(xx-info os) &&\ FROM alpine:20231219 RUN apk update && apk upgrade -RUN apk add --no-cache iproute2 ca-certificates iptables strongswan iptables-legacy && update-ca-certificates +RUN apk add --no-cache iproute2 ca-certificates nftables iptables strongswan iptables-legacy && update-ca-certificates RUN apk add wireguard-tools --no-cache --repository http://dl-cdn.alpinelinux.org/alpine/edge/community COPY --from=build /build/dist/flanneld /opt/bin/flanneld COPY dist/mk-docker-opts.sh /opt/bin/ diff --git a/main.go b/main.go index 94a95ba80e..702a2b476a 100644 --- a/main.go +++ b/main.go @@ -36,6 +36,7 @@ import ( "github.com/flannel-io/flannel/pkg/subnet/kube" "github.com/flannel-io/flannel/pkg/trafficmngr" "github.com/flannel-io/flannel/pkg/trafficmngr/iptables" + "github.com/flannel-io/flannel/pkg/trafficmngr/nftables" "github.com/flannel-io/flannel/pkg/version" "golang.org/x/net/context" log "k8s.io/klog/v2" @@ -336,7 +337,15 @@ func main() { } //Create TrafficManager and instanciate it based on whether we use iptables or nftables - trafficMngr := newTrafficManager() + trafficMngr := newTrafficManager(config.EnableNFTables) + err = trafficMngr.Init(ctx, &wg) + if err != nil { + log.Error(err) + cancel() + wg.Wait() + os.Exit(1) + } + flannelIPv4Net := ip.IP4Net{} flannelIpv6Net := ip.IP6Net{} if config.EnableIPv4 { @@ -365,7 +374,8 @@ func main() { prevIPv6Networks := ReadIP6CIDRsFromSubnetFile(opts.subnetFile, "FLANNEL_IPV6_NETWORK") prevIPv6Subnet := ReadIP6CIDRFromSubnetFile(opts.subnetFile, "FLANNEL_IPV6_SUBNET") - err = trafficMngr.SetupAndEnsureMasqRules(flannelIPv4Net, prevSubnet, + err = trafficMngr.SetupAndEnsureMasqRules(ctx, + flannelIPv4Net, prevSubnet, prevNetworks, flannelIpv6Net, prevIPv6Subnet, prevIPv6Networks, @@ -383,7 +393,7 @@ func main() { // In Docker 1.12 and earlier, the default FORWARD chain policy was ACCEPT. // In Docker 1.13 and later, Docker sets the default policy of the FORWARD chain to DROP. 
if opts.iptablesForwardRules { - trafficMngr.SetupAndEnsureForwardRules( + trafficMngr.SetupAndEnsureForwardRules(ctx, flannelIPv4Net, flannelIpv6Net, opts.iptablesResyncSeconds) @@ -569,6 +579,13 @@ func ReadIP6CIDRsFromSubnetFile(path string, CIDRKey string) []ip.IP6Net { return prevCIDRs } -func newTrafficManager() trafficmngr.TrafficManager { - return iptables.IPTablesManager{} +func newTrafficManager(useNftables bool) trafficmngr.TrafficManager { + if useNftables { + log.Info("Starting flannel in nftables mode") + return &nftables.NFTablesManager{} + } else { + log.Info("Starting flannel in iptables mode") + return &iptables.IPTablesManager{} + + } } diff --git a/pkg/subnet/config.go b/pkg/subnet/config.go index 5b8357cec4..b7a1afe270 100644 --- a/pkg/subnet/config.go +++ b/pkg/subnet/config.go @@ -27,20 +27,21 @@ import ( ) type Config struct { - EnableIPv4 bool - EnableIPv6 bool - Network ip.IP4Net - IPv6Network ip.IP6Net - Networks []ip.IP4Net - IPv6Networks []ip.IP6Net - SubnetMin ip.IP4 - SubnetMax ip.IP4 - IPv6SubnetMin *ip.IP6 - IPv6SubnetMax *ip.IP6 - SubnetLen uint - IPv6SubnetLen uint - BackendType string `json:"-"` - Backend json.RawMessage `json:",omitempty"` + EnableIPv4 bool + EnableIPv6 bool + EnableNFTables bool + Network ip.IP4Net + IPv6Network ip.IP6Net + Networks []ip.IP4Net + IPv6Networks []ip.IP6Net + SubnetMin ip.IP4 + SubnetMax ip.IP4 + IPv6SubnetMin *ip.IP6 + IPv6SubnetMax *ip.IP6 + SubnetLen uint + IPv6SubnetLen uint + BackendType string `json:"-"` + Backend json.RawMessage `json:",omitempty"` } func parseBackendType(be json.RawMessage) (string, error) { diff --git a/pkg/trafficmngr/iptables/iptables.go b/pkg/trafficmngr/iptables/iptables.go index d2a9dd4bea..8126940d4d 100644 --- a/pkg/trafficmngr/iptables/iptables.go +++ b/pkg/trafficmngr/iptables/iptables.go @@ -17,7 +17,9 @@ package iptables import ( + "context" "fmt" + "sync" "time" "github.com/coreos/go-iptables/iptables" @@ -40,11 +42,67 @@ type IPTablesError interface { Error() string } -type IPTablesManager struct{} +type IPTablesManager struct { + ipv4Rules []trafficmngr.IPTablesRule + ipv6Rules []trafficmngr.IPTablesRule +} + +func (iptm *IPTablesManager) Init(ctx context.Context, wg *sync.WaitGroup) error { + iptm.ipv4Rules = make([]trafficmngr.IPTablesRule, 0, 10) + iptm.ipv6Rules = make([]trafficmngr.IPTablesRule, 0, 10) + wg.Add(1) + go func() { + <-ctx.Done() + time.Sleep(time.Second) + err := iptm.cleanUp() + if err != nil { + log.Errorf("iptables: error while cleaning-up: %v", err) + } + wg.Done() + }() -const kubeProxyMark string = "0x4000/0x4000" + return nil +} + +func (iptm *IPTablesManager) cleanUp() error { + if len(iptm.ipv4Rules) > 0 { + ipt, err := iptables.New() + if err != nil { + // if we can't find iptables, give up and return + return fmt.Errorf("failed to setup IPTables. iptables binary was not found: %v", err) + } + iptRestore, err := NewIPTablesRestoreWithProtocol(iptables.ProtocolIPv4) + if err != nil { + // if we can't find iptables-restore, give up and return + return fmt.Errorf("failed to setup IPTables. iptables-restore binary was not found: %v", err) + } + log.Info("iptables (ipv4): cleaning-up before exiting flannel...") + err = teardownIPTables(ipt, iptRestore, iptm.ipv4Rules) + if err != nil { + log.Errorf("Failed to tear down IPTables: %v", err) + } + } + if len(iptm.ipv6Rules) > 0 { + ipt, err := iptables.NewWithProtocol(iptables.ProtocolIPv6) + if err != nil { + // if we can't find iptables, give up and return + return fmt.Errorf("failed to setup IPTables. 
iptables binary was not found: %v", err) + } + iptRestore, err := NewIPTablesRestoreWithProtocol(iptables.ProtocolIPv6) + if err != nil { + // if we can't find iptables-restore, give up and return + return fmt.Errorf("failed to setup IPTables. iptables-restore binary was not found: %v", err) + } + log.Info("iptables (ipv6): cleaning-up before exiting flannel...") + err = teardownIPTables(ipt, iptRestore, iptm.ipv6Rules) + if err != nil { + log.Errorf("Failed to tear down IPTables: %v", err) + } + } + return nil +} -func (iptm IPTablesManager) SetupAndEnsureMasqRules(flannelIPv4Net, prevSubnet ip.IP4Net, +func (iptm *IPTablesManager) SetupAndEnsureMasqRules(ctx context.Context, flannelIPv4Net, prevSubnet ip.IP4Net, prevNetworks []ip.IP4Net, flannelIPv6Net, prevIPv6Subnet ip.IP6Net, prevIPv6Networks []ip.IP6Net, @@ -78,7 +136,7 @@ func (iptm IPTablesManager) SetupAndEnsureMasqRules(flannelIPv4Net, prevSubnet i getRules := func() []trafficmngr.IPTablesRule { return iptm.masqRules([]ip.IP4Net{flannelIPv4Net}, currentlease) } - go iptm.setupAndEnsureIP4Tables(getRules, resyncPeriod) + go iptm.setupAndEnsureIP4Tables(ctx, getRules, resyncPeriod) } if !flannelIPv6Net.Empty() { //Find the cidr in FLANNEL_IPV6_NETWORK which contains the podCIDR (i.e. FLANNEL_IPV6_SUBNET) of this node @@ -107,12 +165,12 @@ func (iptm IPTablesManager) SetupAndEnsureMasqRules(flannelIPv4Net, prevSubnet i getRules := func() []trafficmngr.IPTablesRule { return iptm.masqIP6Rules([]ip.IP6Net{flannelIPv6Net}, currentlease) } - go iptm.setupAndEnsureIP6Tables(getRules, resyncPeriod) + go iptm.setupAndEnsureIP6Tables(ctx, getRules, resyncPeriod) } return nil } -func (iptm IPTablesManager) masqRules(cluster_cidrs []ip.IP4Net, lease *lease.Lease) []trafficmngr.IPTablesRule { +func (iptm *IPTablesManager) masqRules(cluster_cidrs []ip.IP4Net, lease *lease.Lease) []trafficmngr.IPTablesRule { pod_cidr := lease.Subnet.String() ipt, err := iptables.New() supports_random_fully := false @@ -123,7 +181,7 @@ func (iptm IPTablesManager) masqRules(cluster_cidrs []ip.IP4Net, lease *lease.Le // This rule ensure that the flannel iptables rules are executed before other rules on the node rules[0] = trafficmngr.IPTablesRule{Table: "nat", Action: "-A", Chain: "POSTROUTING", Rulespec: []string{"-m", "comment", "--comment", "flanneld masq", "-j", "FLANNEL-POSTRTG"}} // This rule will not masquerade traffic marked by the kube-proxy to avoid double NAT bug on some kernel version - rules[1] = trafficmngr.IPTablesRule{Table: "nat", Action: "-A", Chain: "FLANNEL-POSTRTG", Rulespec: []string{"-m", "mark", "--mark", kubeProxyMark, "-m", "comment", "--comment", "flanneld masq", "-j", "RETURN"}} + rules[1] = trafficmngr.IPTablesRule{Table: "nat", Action: "-A", Chain: "FLANNEL-POSTRTG", Rulespec: []string{"-m", "mark", "--mark", trafficmngr.KubeProxyMark, "-m", "comment", "--comment", "flanneld masq", "-j", "RETURN"}} for _, ccidr := range cluster_cidrs { cluster_cidr := ccidr.String() // This rule makes sure we don't NAT traffic within overlay network (e.g. 
coming out of docker0), for any of the cluster_cidrs @@ -158,7 +216,7 @@ func (iptm IPTablesManager) masqRules(cluster_cidrs []ip.IP4Net, lease *lease.Le return rules } -func (iptm IPTablesManager) masqIP6Rules(cluster_cidrs []ip.IP6Net, lease *lease.Lease) []trafficmngr.IPTablesRule { +func (iptm *IPTablesManager) masqIP6Rules(cluster_cidrs []ip.IP6Net, lease *lease.Lease) []trafficmngr.IPTablesRule { pod_cidr := lease.IPv6Subnet.String() ipt, err := iptables.NewWithProtocol(iptables.ProtocolIPv6) supports_random_fully := false @@ -170,7 +228,7 @@ func (iptm IPTablesManager) masqIP6Rules(cluster_cidrs []ip.IP6Net, lease *lease // This rule ensure that the flannel iptables rules are executed before other rules on the node rules[0] = trafficmngr.IPTablesRule{Table: "nat", Action: "-A", Chain: "POSTROUTING", Rulespec: []string{"-m", "comment", "--comment", "flanneld masq", "-j", "FLANNEL-POSTRTG"}} // This rule will not masquerade traffic marked by the kube-proxy to avoid double NAT bug on some kernel version - rules[1] = trafficmngr.IPTablesRule{Table: "nat", Action: "-A", Chain: "FLANNEL-POSTRTG", Rulespec: []string{"-m", "mark", "--mark", kubeProxyMark, "-m", "comment", "--comment", "flanneld masq", "-j", "RETURN"}} + rules[1] = trafficmngr.IPTablesRule{Table: "nat", Action: "-A", Chain: "FLANNEL-POSTRTG", Rulespec: []string{"-m", "mark", "--mark", trafficmngr.KubeProxyMark, "-m", "comment", "--comment", "flanneld masq", "-j", "RETURN"}} for _, ccidr := range cluster_cidrs { cluster_cidr := ccidr.String() @@ -209,14 +267,14 @@ func (iptm IPTablesManager) masqIP6Rules(cluster_cidrs []ip.IP6Net, lease *lease return rules } -func (iptm IPTablesManager) SetupAndEnsureForwardRules(flannelIPv4Network ip.IP4Net, flannelIPv6Network ip.IP6Net, resyncPeriod int) { +func (iptm *IPTablesManager) SetupAndEnsureForwardRules(ctx context.Context, flannelIPv4Network ip.IP4Net, flannelIPv6Network ip.IP6Net, resyncPeriod int) { if !flannelIPv4Network.Empty() { log.Infof("Changing default FORWARD chain policy to ACCEPT") iptm.CreateIP4Chain("filter", "FLANNEL-FWD") getRules := func() []trafficmngr.IPTablesRule { return iptm.forwardRules(flannelIPv4Network.String()) } - go iptm.setupAndEnsureIP4Tables(getRules, resyncPeriod) + go iptm.setupAndEnsureIP4Tables(ctx, getRules, resyncPeriod) } if !flannelIPv6Network.Empty() { log.Infof("IPv6: Changing default FORWARD chain policy to ACCEPT") @@ -224,11 +282,11 @@ func (iptm IPTablesManager) SetupAndEnsureForwardRules(flannelIPv4Network ip.IP4 getRules := func() []trafficmngr.IPTablesRule { return iptm.forwardRules(flannelIPv6Network.String()) } - go iptm.setupAndEnsureIP6Tables(getRules, resyncPeriod) + go iptm.setupAndEnsureIP6Tables(ctx, getRules, resyncPeriod) } } -func (iptm IPTablesManager) forwardRules(flannelNetwork string) []trafficmngr.IPTablesRule { +func (iptm *IPTablesManager) forwardRules(flannelNetwork string) []trafficmngr.IPTablesRule { return []trafficmngr.IPTablesRule{ // This rule ensure that the flannel iptables rules are executed before other rules on the node {Table: "filter", Action: "-A", Chain: "FORWARD", Rulespec: []string{"-m", "comment", "--comment", "flanneld forward", "-j", "FLANNEL-FWD"}}, @@ -238,7 +296,7 @@ func (iptm IPTablesManager) forwardRules(flannelNetwork string) []trafficmngr.IP } } -func (iptm IPTablesManager) CreateIP4Chain(table, chain string) { +func (iptm *IPTablesManager) CreateIP4Chain(table, chain string) { ipt, err := iptables.New() if err != nil { // if we can't find iptables, give up and return @@ -253,7 +311,7 
@@ func (iptm IPTablesManager) CreateIP4Chain(table, chain string) {
 	}
 }
 
-func (iptm IPTablesManager) CreateIP6Chain(table, chain string) {
+func (iptm *IPTablesManager) CreateIP6Chain(table, chain string) {
 	ipt, err := iptables.NewWithProtocol(iptables.ProtocolIPv6)
 	if err != nil {
 		// if we can't find iptables, give up and return
@@ -368,9 +426,8 @@ func ipTablesBootstrap(ipt IPTables, iptRestore IPTablesRestore, rules []traffic
 	return nil
 }
 
-func (iptm IPTablesManager) setupAndEnsureIP4Tables(getRules func() []trafficmngr.IPTablesRule, resyncPeriod int) {
+func (iptm *IPTablesManager) setupAndEnsureIP4Tables(ctx context.Context, getRules func() []trafficmngr.IPTablesRule, resyncPeriod int) {
 	rules := getRules()
-	log.Infof("generated %d rules", len(rules))
 	ipt, err := iptables.New()
 	if err != nil {
 		// if we can't find iptables, give up and return
@@ -390,24 +447,23 @@
 		log.Errorf("Failed to bootstrap IPTables: %v", err)
 	}
 
-	defer func() {
-		err := teardownIPTables(ipt, iptRestore, rules)
-		if err != nil {
-			log.Errorf("Failed to tear down IPTables: %v", err)
-		}
-	}()
-
+	iptm.ipv4Rules = append(iptm.ipv4Rules, rules...)
 	for {
-		// Ensure that all the iptables rules exist every 5 seconds
-		if err := ensureIPTables(ipt, iptRestore, getRules()); err != nil {
-			log.Errorf("Failed to ensure iptables rules: %v", err)
+		select {
+		case <-ctx.Done():
+			// clean-up is set up in Init
+			return
+		case <-time.After(time.Duration(resyncPeriod) * time.Second):
+			// Ensure that all the iptables rules exist every resyncPeriod seconds
+			if err := ensureIPTables(ipt, iptRestore, rules); err != nil {
+				log.Errorf("Failed to ensure iptables rules: %v", err)
+			}
 		}
-		time.Sleep(time.Duration(resyncPeriod) * time.Second)
 	}
 }
 
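The IPv6 twin of this function follows below with the same change: the old `time.Sleep` loop with a deferred teardown is replaced by a cancellable `select`, so the goroutine exits promptly on shutdown and teardown is centralized in the clean-up goroutine registered by `Init`. A self-contained sketch of the pattern; `resyncLoop`, `ensure` and `period` are illustrative names, not part of the PR:

```go
package main

import (
	"context"
	"fmt"
	"time"
)

// resyncLoop re-applies rules every period until ctx is cancelled.
// Teardown deliberately does not happen here; it belongs to the
// clean-up goroutine that watches the same context (as Init does in this patch).
func resyncLoop(ctx context.Context, period time.Duration, ensure func() error) {
	for {
		select {
		case <-ctx.Done():
			return // exit promptly on shutdown
		case <-time.After(period):
			if err := ensure(); err != nil {
				fmt.Println("failed to ensure rules:", err)
			}
		}
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
	defer cancel()
	resyncLoop(ctx, time.Second, func() error {
		fmt.Println("rules verified")
		return nil
	})
}
```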
-func (iptm IPTablesManager) setupAndEnsureIP6Tables(getRules func() []trafficmngr.IPTablesRule, resyncPeriod int) {
+func (iptm *IPTablesManager) setupAndEnsureIP6Tables(ctx context.Context, getRules func() []trafficmngr.IPTablesRule, resyncPeriod int) {
 	rules := getRules()
 	ipt, err := iptables.NewWithProtocol(iptables.ProtocolIPv6)
 	if err != nil {
@@ -427,26 +483,24 @@
 		// if we can't find iptables, give up and return
 		log.Errorf("Failed to bootstrap IPTables: %v", err)
 	}
-
-	defer func() {
-		err := teardownIPTables(ipt, iptRestore, rules)
-		if err != nil {
-			log.Errorf("Failed to tear down IPTables: %v", err)
-		}
-	}()
+	iptm.ipv6Rules = append(iptm.ipv6Rules, rules...)
 	for {
-		// Ensure that all the iptables rules exist every 5 seconds
-		if err := ensureIPTables(ipt, iptRestore, getRules()); err != nil {
-			log.Errorf("Failed to ensure iptables rules: %v", err)
+		select {
+		case <-ctx.Done():
+			// clean-up is set up in Init
+			return
+		case <-time.After(time.Duration(resyncPeriod) * time.Second):
+			// Ensure that all the iptables rules exist every resyncPeriod seconds
+			if err := ensureIPTables(ipt, iptRestore, getRules()); err != nil {
+				log.Errorf("Failed to ensure iptables rules: %v", err)
+			}
 		}
-
-		time.Sleep(time.Duration(resyncPeriod) * time.Second)
 	}
 }
 
 // deleteIP4Tables delete specified iptables rules
-func (iptm IPTablesManager) deleteIP4Tables(rules []trafficmngr.IPTablesRule) error {
+func (iptm *IPTablesManager) deleteIP4Tables(rules []trafficmngr.IPTablesRule) error {
 	ipt, err := iptables.New()
 	if err != nil {
 		// if we can't find iptables, give up and return
@@ -468,7 +522,7 @@
 }
 
 // deleteIP6Tables delete specified iptables rules
-func (iptm IPTablesManager) deleteIP6Tables(rules []trafficmngr.IPTablesRule) error {
+func (iptm *IPTablesManager) deleteIP6Tables(rules []trafficmngr.IPTablesRule) error {
 	ipt, err := iptables.NewWithProtocol(iptables.ProtocolIPv6)
 	if err != nil {
 		// if we can't find iptables, give up and return
@@ -539,6 +593,7 @@ func teardownIPTables(ipt IPTables, iptr IPTablesRestore, rules []trafficmngr.IP
 		// this shouldn't ever happen
 		return fmt.Errorf("failed to check rule existence: %v", err)
 	}
+
 	if exists {
 		if _, ok := tablesRules[rule.Table]; !ok {
 			tablesRules[rule.Table] = []IPTablesRestoreRuleSpec{}
diff --git a/pkg/trafficmngr/iptables/iptables_restore.go b/pkg/trafficmngr/iptables/iptables_restore.go
index 75973a6212..aa6e42a587 100644
--- a/pkg/trafficmngr/iptables/iptables_restore.go
+++ b/pkg/trafficmngr/iptables/iptables_restore.go
@@ -23,6 +23,7 @@ import (
 	"os/exec"
 	"regexp"
 	"strconv"
+	"sync"
 
 	"github.com/coreos/go-iptables/iptables"
 	log "k8s.io/klog/v2"
@@ -46,6 +47,10 @@ type ipTablesRestore struct {
 	path    string
 	proto   iptables.Protocol
 	hasWait bool
+	// ipTablesRestore needs a mutex to avoid
+	// collisions between two goroutines calling ApplyWithoutFlush in parallel.
+ // This could result in the second call accidentally restoring a rule removed by the first + mu sync.Mutex } // IPTablesRestoreRules represents iptables-restore table block @@ -75,12 +80,15 @@ func NewIPTablesRestoreWithProtocol(protocol iptables.Protocol) (IPTablesRestore path: path, proto: protocol, hasWait: hasWait, + mu: sync.Mutex{}, } return &ipt, nil } // ApplyWithoutFlush apply without flush chains func (iptr *ipTablesRestore) ApplyWithoutFlush(rules IPTablesRestoreRules) error { + iptr.mu.Lock() + defer iptr.mu.Unlock() payload := buildIPTablesRestorePayload(rules) log.V(6).Infof("trying to run with payload %s", payload) diff --git a/pkg/trafficmngr/iptables/iptables_windows.go b/pkg/trafficmngr/iptables/iptables_windows.go index c63900b4b9..4d55415a06 100644 --- a/pkg/trafficmngr/iptables/iptables_windows.go +++ b/pkg/trafficmngr/iptables/iptables_windows.go @@ -15,8 +15,14 @@ package iptables import ( + "context" + "sync" + + log "k8s.io/klog/v2" + "github.com/flannel-io/flannel/pkg/ip" "github.com/flannel-io/flannel/pkg/lease" + "github.com/flannel-io/flannel/pkg/trafficmngr" ) type IPTablesManager struct{} @@ -29,14 +35,19 @@ type IPTables interface { Exists(table string, chain string, rulespec ...string) (bool, error) } -func (iptm IPTablesManager) SetupAndEnsureForwardRules(flannelIPv4Network ip.IP4Net, flannelIPv6Network ip.IP6Net, resyncPeriod int) { +func (iptm IPTablesManager) Init(ctx context.Context, wg *sync.WaitGroup) error { + return trafficmngr.ErrUnimplemented +} + +func (iptm *IPTablesManager) SetupAndEnsureForwardRules(ctx context.Context, flannelIPv4Network ip.IP4Net, flannelIPv6Network ip.IP6Net, resyncPeriod int) { } -func (iptm IPTablesManager) SetupAndEnsureMasqRules(flannelIPv4Net, prevSubnet ip.IP4Net, +func (iptm *IPTablesManager) SetupAndEnsureMasqRules(ctx context.Context, flannelIPv4Net, prevSubnet ip.IP4Net, prevNetworks []ip.IP4Net, flannelIPv6Net, prevIPv6Subnet ip.IP6Net, prevIPv6Networks []ip.IP6Net, currentlease *lease.Lease, resyncPeriod int) error { + log.Error(trafficmngr.ErrUnimplemented) return nil } diff --git a/pkg/trafficmngr/nftables/nftables.go b/pkg/trafficmngr/nftables/nftables.go new file mode 100644 index 0000000000..41ab505ca6 --- /dev/null +++ b/pkg/trafficmngr/nftables/nftables.go @@ -0,0 +1,317 @@ +// Copyright 2024 flannel authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+//go:build !windows
+// +build !windows
+
+package nftables
+
+import (
+	"context"
+	"fmt"
+	"sync"
+	"time"
+
+	log "k8s.io/klog/v2"
+
+	"github.com/flannel-io/flannel/pkg/ip"
+	"github.com/flannel-io/flannel/pkg/lease"
+	"sigs.k8s.io/knftables"
+)
+
+const (
+	ipv4Table    = "flannel-ipv4"
+	ipv6Table    = "flannel-ipv6"
+	forwardChain = "forward"
+	postrtgChain = "postrtg"
+	// maximum delay in seconds for the clean-up when the context is cancelled
+	cleanUpDeadline = 15
+)
+
+type NFTablesManager struct {
+	nftv4 knftables.Interface
+	nftv6 knftables.Interface
+}
+
+func (nftm *NFTablesManager) Init(ctx context.Context, wg *sync.WaitGroup) error {
+	var err error
+	nftm.nftv4, err = initTable(ctx, knftables.IPv4Family, ipv4Table)
+	if err != nil {
+		return err
+	}
+	nftm.nftv6, err = initTable(ctx, knftables.IPv6Family, ipv6Table)
+	if err != nil {
+		return err
+	}
+
+	wg.Add(1)
+	go func() {
+		<-ctx.Done()
+		log.Info("Cleaning-up flannel tables...")
+
+		cleanupCtx, cleanUpCancelFunc := context.WithTimeout(context.Background(), cleanUpDeadline*time.Second)
+		defer cleanUpCancelFunc()
+		if err := nftm.cleanUp(cleanupCtx); err != nil {
+			log.Errorf("nftables: error while cleaning-up: %v", err)
+		}
+		wg.Done()
+	}()
+	return nil
+}
+
+// creates a new table and returns the interface to interact with it
+func initTable(ctx context.Context, ipFamily knftables.Family, name string) (knftables.Interface, error) {
+	nft, err := knftables.New(ipFamily, name)
+	if err != nil {
+		return nil, fmt.Errorf("no nftables support: %v", err)
+	}
+	tx := nft.NewTransaction()
+
+	tx.Add(&knftables.Table{
+		Comment: knftables.PtrTo("rules for " + name),
+	})
+	err = nft.Run(ctx, tx)
+	if err != nil {
+		return nil, fmt.Errorf("nftables: couldn't initialise table %s: %v", name, err)
+	}
+	return nft, nil
+}
+
+// Is this still needed when using nftables? accept seems to be the default.
+// Warning: never add a default 'drop' policy on the forwardChain as it breaks connectivity to the node
+func (nftm *NFTablesManager) SetupAndEnsureForwardRules(ctx context.Context,
+	flannelIPv4Network ip.IP4Net, flannelIPv6Network ip.IP6Net, resyncPeriod int) {
+	if !flannelIPv4Network.Empty() {
+		log.Infof("Changing default FORWARD chain policy to ACCEPT")
+		tx := nftm.nftv4.NewTransaction()
+
+		//TODO how to express that the default is drop?
+		tx.Add(&knftables.Chain{
+			Name:     forwardChain,
+			Comment:  knftables.PtrTo("chain to accept flannel traffic"),
+			Type:     knftables.PtrTo(knftables.FilterType),
+			Hook:     knftables.PtrTo(knftables.InputHook),
+			Priority: knftables.PtrTo(knftables.FilterPriority),
+		})
+		tx.Flush(&knftables.Chain{
+			Name: forwardChain,
+		})
+
+		tx.Add(&knftables.Rule{
+			Chain: forwardChain,
+			Rule: knftables.Concat(
+				"ip saddr", flannelIPv4Network.String(),
+				"accept",
+			),
+		})
+		tx.Add(&knftables.Rule{
+			Chain: forwardChain,
+			Rule: knftables.Concat(
+				"ip daddr", flannelIPv4Network.String(),
+				"accept",
+			),
+		})
+		err := nftm.nftv4.Run(ctx, tx)
+		if err != nil {
+			log.Errorf("nftables: couldn't setup forward rules: %v", err)
+		}
+	}
+	if !flannelIPv6Network.Empty() {
+		log.Infof("Changing default FORWARD chain policy to ACCEPT (ipv6)")
+		tx := nftm.nftv6.NewTransaction()
+
+		//TODO how to express that the default is drop?
+		tx.Add(&knftables.Chain{
+			Name:     forwardChain,
+			Comment:  knftables.PtrTo("chain to accept flannel traffic"),
+			Type:     knftables.PtrTo(knftables.FilterType),
+			Hook:     knftables.PtrTo(knftables.InputHook),
+			Priority: knftables.PtrTo(knftables.FilterPriority),
+		})
+		tx.Flush(&knftables.Chain{
+			Name: forwardChain,
+		})
+
+		tx.Add(&knftables.Rule{
+			Chain: forwardChain,
+			Rule: knftables.Concat(
+				"ip6 saddr", flannelIPv6Network.String(),
+				"accept",
+			),
+		})
+		tx.Add(&knftables.Rule{
+			Chain: forwardChain,
+			Rule: knftables.Concat(
+				"ip6 daddr", flannelIPv6Network.String(),
+				"accept",
+			),
+		})
+		err := nftm.nftv6.Run(ctx, tx)
+		if err != nil {
+			log.Errorf("nftables: couldn't setup forward rules (ipv6): %v", err)
+		}
+	}
+}
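The forward-rule function above and the masquerade function below share the same knftables transaction idiom: add the table and chain, flush the chain, re-add every rule, then run the whole transaction atomically, which makes each refresh idempotent without the per-rule existence checks the iptables code needs. A sketch of that idiom, runnable without root against the in-memory fake bundled with knftables (assuming `NewFake` and `Dump` are available in the pinned v0.0.14; table and rule contents are illustrative):

```go
package main

import (
	"context"
	"fmt"

	"sigs.k8s.io/knftables"
)

func main() {
	// Fake implements knftables.Interface entirely in memory.
	nft := knftables.NewFake(knftables.IPv4Family, "flannel-ipv4")

	tx := nft.NewTransaction()
	tx.Add(&knftables.Table{})
	tx.Add(&knftables.Chain{
		Name:     "forward",
		Type:     knftables.PtrTo(knftables.FilterType),
		Hook:     knftables.PtrTo(knftables.InputHook),
		Priority: knftables.PtrTo(knftables.FilterPriority),
	})
	// Flushing first keeps the refresh idempotent: the chain is rebuilt
	// from scratch instead of being checked rule by rule.
	tx.Flush(&knftables.Chain{Name: "forward"})
	tx.Add(&knftables.Rule{Chain: "forward", Rule: "ip saddr 10.42.0.0/16 accept"})

	if err := nft.Run(context.Background(), tx); err != nil {
		panic(err)
	}
	fmt.Println(nft.Dump()) // rendered ruleset, convenient in unit tests
}
```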
+
+func (nftm *NFTablesManager) SetupAndEnsureMasqRules(ctx context.Context, flannelIPv4Net, prevSubnet ip.IP4Net,
+	prevNetworks []ip.IP4Net,
+	flannelIPv6Net, prevIPv6Subnet ip.IP6Net,
+	prevIPv6Networks []ip.IP6Net,
+	currentlease *lease.Lease,
+	resyncPeriod int) error {
+	if !flannelIPv4Net.Empty() {
+		log.Infof("nftables: setting up masquerading rules (ipv4)")
+		tx := nftm.nftv4.NewTransaction()
+
+		tx.Add(&knftables.Chain{
+			Name:     postrtgChain,
+			Comment:  knftables.PtrTo("chain to manage traffic masquerading by flannel"),
+			Type:     knftables.PtrTo(knftables.NATType),
+			Hook:     knftables.PtrTo(knftables.PostroutingHook),
+			Priority: knftables.PtrTo(knftables.SNATPriority),
+		})
+		// make sure that the chain is empty before adding our rules
+		// => no need for the check and recycle part of iptables.go
+		tx.Flush(&knftables.Chain{
+			Name: postrtgChain,
+		})
+		err := nftm.addMasqRules(ctx, tx, flannelIPv4Net.String(), currentlease.Subnet.String(), knftables.IPv4Family)
+		if err != nil {
+			return fmt.Errorf("nftables: couldn't setup masq rules: %v", err)
+		}
+		err = nftm.nftv4.Run(ctx, tx)
+		if err != nil {
+			return fmt.Errorf("nftables: couldn't setup masq rules: %v", err)
+		}
+	}
+	if !flannelIPv6Net.Empty() {
+		log.Infof("nftables: setting up masquerading rules (ipv6)")
+		tx := nftm.nftv6.NewTransaction()
+
+		tx.Add(&knftables.Chain{
+			Name:     postrtgChain,
+			Comment:  knftables.PtrTo("chain to manage traffic masquerading by flannel"),
+			Type:     knftables.PtrTo(knftables.NATType),
+			Hook:     knftables.PtrTo(knftables.PostroutingHook),
+			Priority: knftables.PtrTo(knftables.SNATPriority),
+		})
+		// make sure that the chain is empty before adding our rules
+		// => no need for the check and recycle part of iptables.go
+		tx.Flush(&knftables.Chain{
+			Name: postrtgChain,
+		})
+		err := nftm.addMasqRules(ctx, tx, flannelIPv6Net.String(), currentlease.IPv6Subnet.String(), knftables.IPv6Family)
+		if err != nil {
+			return fmt.Errorf("nftables: couldn't setup masq rules: %v", err)
+		}
+		err = nftm.nftv6.Run(ctx, tx)
+		if err != nil {
+			return fmt.Errorf("nftables: couldn't setup masq rules: %v", err)
+		}
+	}
+	return nil
+}
+
+// adds the required masquerading rules to transaction tx
+func (nftm *NFTablesManager) addMasqRules(ctx context.Context,
+	tx *knftables.Transaction,
+	clusterCidr, podCidr string,
+	family knftables.Family) error {
+	masquerade := "masquerade fully-random"
+	if !nftm.checkRandomfully(ctx) {
+		masquerade = "masquerade"
+	}
+
+	multicastCidr := "224.0.0.0/4"
+	if family == knftables.IPv6Family {
+		multicastCidr = "ff00::/8"
+	}
+	// This rule will not masquerade traffic marked
+	// by the kube-proxy to avoid the double NAT bug on some kernel versions
+	tx.Add(&knftables.Rule{
+		Chain: postrtgChain,
+		Rule: knftables.Concat(
+			"meta mark", "0x4000", //TODO_TF: check the correct value when deploying kube-proxy
+			"return",
+		),
+	})
+	// don't NAT traffic within overlay network
+	tx.Add(&knftables.Rule{
+		Chain: postrtgChain,
+		Rule: knftables.Concat(
+			family, "saddr", podCidr,
+			family, "daddr", clusterCidr,
+			"return",
+		),
+	})
+	tx.Add(&knftables.Rule{
+		Chain: postrtgChain,
+		Rule: knftables.Concat(
+			family, "saddr", clusterCidr,
+			family, "daddr", podCidr,
+			"return",
+		),
+	})
+	// Prevent performing Masquerade on external traffic which arrives from a Node that owns the container/pod IP address
+	tx.Add(&knftables.Rule{
+		Chain: postrtgChain,
+		Rule: knftables.Concat(
+			family, "saddr", "!=", podCidr,
+			family, "daddr", clusterCidr,
+			"return",
+		),
+	})
+	// NAT if it's not multicast traffic
+	tx.Add(&knftables.Rule{
+		Chain: postrtgChain,
+		Rule: knftables.Concat(
+			family, "saddr", clusterCidr,
+			family, "daddr", "!=", multicastCidr,
+			masquerade,
+		),
+	})
+	// Masquerade anything headed towards flannel from the host
+	tx.Add(&knftables.Rule{
+		Chain: postrtgChain,
+		Rule: knftables.Concat(
+			family, "saddr", "!=", clusterCidr,
+			family, "daddr", clusterCidr,
+			masquerade,
+		),
+	})
+	return nil
+}
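On the `TODO_TF` above: the iptables path matches the kube-proxy mark with the value/mask spec `trafficmngr.KubeProxyMark` (`"0x4000/0x4000"`, introduced later in this patch), whereas the nftables rule matches the bare value `0x4000` and would therefore miss packets carrying additional mark bits. If the masked semantics turn out to matter, one way to derive an equivalent nftables match from the shared constant; a hypothetical helper, not part of the PR, assuming nft's bitwise `and` operator:

```go
package main

import (
	"fmt"
	"strings"
)

// nftMarkMatch translates an iptables-style "value/mask" mark spec into
// an nftables match expression. Hypothetical; only the input format
// ("0x4000/0x4000") comes from this patch.
func nftMarkMatch(iptablesMark string) string {
	value, mask, found := strings.Cut(iptablesMark, "/")
	if !found {
		return fmt.Sprintf("meta mark %s", value)
	}
	return fmt.Sprintf("meta mark and %s == %s", mask, value)
}

func main() {
	fmt.Println(nftMarkMatch("0x4000/0x4000")) // meta mark and 0x4000 == 0x4000
}
```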
+
+// clean up all nftables state created by flannel by deleting all related tables
+func (nftm *NFTablesManager) cleanUp(ctx context.Context) error {
+	nft, err := knftables.New(knftables.IPv4Family, ipv4Table)
+	if err == nil {
+		tx := nft.NewTransaction()
+		tx.Delete(&knftables.Table{})
+		err = nft.Run(ctx, tx)
+	}
+	if err != nil {
+		return fmt.Errorf("nftables: couldn't delete table: %v", err)
+	}
+
+	nft, err = knftables.New(knftables.IPv6Family, ipv6Table)
+	if err == nil {
+		tx := nft.NewTransaction()
+		tx.Delete(&knftables.Table{})
+		err = nft.Run(ctx, tx)
+	}
+	if err != nil {
+		return fmt.Errorf("nftables (ipv6): couldn't delete table: %v", err)
+	}
+
+	return nil
+}
diff --git a/pkg/trafficmngr/nftables/nftables_windows.go b/pkg/trafficmngr/nftables/nftables_windows.go
new file mode 100644
index 0000000000..b66034b5b7
--- /dev/null
+++ b/pkg/trafficmngr/nftables/nftables_windows.go
@@ -0,0 +1,48 @@
+// Copyright 2024 flannel authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package nftables
+
+import (
+	"context"
+	"sync"
+
+	log "k8s.io/klog/v2"
+
+	"github.com/flannel-io/flannel/pkg/ip"
+	"github.com/flannel-io/flannel/pkg/lease"
+	"github.com/flannel-io/flannel/pkg/trafficmngr"
+)
+
+type NFTablesManager struct {
+}
+
+func (nftm *NFTablesManager) Init(ctx context.Context, wg *sync.WaitGroup) error {
+	log.Error(trafficmngr.ErrUnimplemented)
+	return nil
+}
+
+func (nftm *NFTablesManager) SetupAndEnsureForwardRules(ctx context.Context,
+	flannelIPv4Network ip.IP4Net, flannelIPv6Network ip.IP6Net, resyncPeriod int) {
+}
+
+func (nftm *NFTablesManager) SetupAndEnsureMasqRules(ctx context.Context, flannelIPv4Net, prevSubnet ip.IP4Net,
+	prevNetworks []ip.IP4Net,
+	flannelIPv6Net, prevIPv6Subnet ip.IP6Net,
+	prevIPv6Networks []ip.IP6Net,
+	currentlease *lease.Lease,
+	resyncPeriod int) error {
+	log.Error(trafficmngr.ErrUnimplemented)
+	return nil
+}
diff --git a/pkg/trafficmngr/nftables/utils.go b/pkg/trafficmngr/nftables/utils.go
new file mode 100644
index 0000000000..fa3f269361
--- /dev/null
+++ b/pkg/trafficmngr/nftables/utils.go
@@ -0,0 +1,58 @@
+// Copyright 2024 flannel authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//go:build !windows
+// +build !windows
+
+package nftables
+
+import (
+	"context"
+
+	log "k8s.io/klog/v2"
+	"sigs.k8s.io/knftables"
+)
+
+const (
+	masqueradeTestTable = "masqueradeTest"
+)
+
+// check whether masquerade fully-random is supported by the kernel
+func (nftm *NFTablesManager) checkRandomfully(ctx context.Context) bool {
+	result := true
+	tx := nftm.nftv4.NewTransaction()
+	tx.Add(&knftables.Chain{
+		Name:     masqueradeTestTable,
+		Comment:  knftables.PtrTo("chain to test if masquerade random fully is supported"),
+		Type:     knftables.PtrTo(knftables.NATType),
+		Hook:     knftables.PtrTo(knftables.PostroutingHook),
+		Priority: knftables.PtrTo(knftables.SNATPriority),
+	})
+	tx.Flush(&knftables.Chain{
+		Name: masqueradeTestTable,
+	})
+	// test rule: only dry-run via Check below, never committed to the ruleset
+	tx.Add(&knftables.Rule{
+		Chain: masqueradeTestTable,
+		Rule: knftables.Concat(
+			"ip saddr", "!=", "127.0.0.1",
+			"masquerade fully-random",
+		),
+	})
+	err := nftm.nftv4.Check(ctx, tx)
+	if err != nil {
+		log.Warningf("nftables: masquerade fully-random not supported")
+		result = false
+	}
+	return result
+}
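`checkRandomfully` above probes kernel support by asking nft to dry-run a throwaway postrouting chain that uses `masquerade fully-random`: `knftables.Interface.Check` validates the transaction without committing it. A self-contained sketch of the same probe; the table and chain names are illustrative, and the `nft` binary must be present:

```go
package main

import (
	"context"
	"fmt"

	"sigs.k8s.io/knftables"
)

// supportsFullyRandom reports whether "masquerade fully-random" passes a
// dry run; nothing is committed to the live ruleset.
func supportsFullyRandom(ctx context.Context) bool {
	nft, err := knftables.New(knftables.IPv4Family, "capability-probe")
	if err != nil {
		return false // no nftables support at all
	}
	tx := nft.NewTransaction()
	tx.Add(&knftables.Table{})
	tx.Add(&knftables.Chain{
		Name:     "probe",
		Type:     knftables.PtrTo(knftables.NATType),
		Hook:     knftables.PtrTo(knftables.PostroutingHook),
		Priority: knftables.PtrTo(knftables.SNATPriority),
	})
	tx.Add(&knftables.Rule{Chain: "probe", Rule: "ip saddr != 127.0.0.1 masquerade fully-random"})
	return nft.Check(ctx, tx) == nil
}

func main() {
	fmt.Println("masquerade fully-random supported:", supportsFullyRandom(context.Background()))
}
```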
diff --git a/pkg/trafficmngr/trafficmngr.go b/pkg/trafficmngr/trafficmngr.go
index 7ae2ef7ae0..99f401c4c5 100644
--- a/pkg/trafficmngr/trafficmngr.go
+++ b/pkg/trafficmngr/trafficmngr.go
@@ -15,6 +15,10 @@
 package trafficmngr
 
 import (
+	"context"
+	"errors"
+	"sync"
+
 	"github.com/flannel-io/flannel/pkg/ip"
 	"github.com/flannel-io/flannel/pkg/lease"
 )
@@ -26,19 +30,27 @@ type IPTablesRule struct {
 	Rulespec []string
 }
 
+var (
+	ErrUnimplemented = errors.New("unimplemented")
+)
+
+const KubeProxyMark string = "0x4000/0x4000"
+
 type TrafficManager interface {
+	// Initialize the TrafficManager, including the goroutine used to clean up when flanneld is closed
+	Init(ctx context.Context, wg *sync.WaitGroup) error
 	// Install kernel rules to forward the traffic to and from the flannel network range.
 	// This is done for IPv4 and/or IPv6 based on whether flannelIPv4Network and flannelIPv6Network are set.
 	// SetupAndEnsureForwardRules starts a go routine that
 	// rewrites these rules every resyncPeriod seconds if needed
-	SetupAndEnsureForwardRules(flannelIPv4Network ip.IP4Net, flannelIPv6Network ip.IP6Net, resyncPeriod int)
+	SetupAndEnsureForwardRules(ctx context.Context, flannelIPv4Network ip.IP4Net, flannelIPv6Network ip.IP6Net, resyncPeriod int)
 	// Install kernel rules to setup NATing of packets sent to the flannel interface
 	// This is done for IPv4 and/or IPv6 based on whether flannelIPv4Network and flannelIPv6Network are set.
 	// prevSubnet, prevNetworks, prevIPv6Subnet, prevIPv6Networks are used
 	// to determine whether the existing rules need to be replaced.
 	// SetupAndEnsureMasqRules starts a go routine that
 	// rewrites these rules every resyncPeriod seconds if needed
-	SetupAndEnsureMasqRules(
+	SetupAndEnsureMasqRules(ctx context.Context,
 		flannelIPv4Net, prevSubnet ip.IP4Net,
 		prevNetworks []ip.IP4Net,
 		flannelIPv6Net, prevIPv6Subnet ip.IP6Net,