diff --git a/Documentation/configuration.md b/Documentation/configuration.md
index a5ea6637f0..2a59019640 100644
--- a/Documentation/configuration.md
+++ b/Documentation/configuration.md
@@ -19,6 +19,9 @@ The value of the config is a JSON dictionary with the following keys:
 * `EnableIPv6` (bool): Enables ipv6 support
   Defaults to `false`
+* `EnableNFTables` (bool): (EXPERIMENTAL) If set to true, flannel uses nftables instead of iptables to masquerade the traffic.
+  Defaults to `false`
+
 * `SubnetLen` (integer): The size of the subnet allocated to each host.
   Defaults to 24 (i.e. /24) unless `Network` was configured to be smaller than a /22 in which case it is two less than the network.
@@ -128,3 +131,22 @@ FLANNEL_IPMASQ=true
 ## IPv6 only
 To use an IPv6-only environment use the same configuration of the Dual-stack section to enable IPv6 and add "EnableIPv4": false in the net-conf.json of the kube-flannel-cfg ConfigMap.
 In case of IPv6-only setup, please use the docker.io IPv6-only endpoint as described in the following link: https://www.docker.com/blog/beta-ipv6-support-on-docker-hub-registry/
+
+## nftables mode
+To enable `nftables` mode in flannel, set `EnableNFTables` to `true` in the flannel configuration.
+
+Note: to test with kube-proxy, use kubeadm with the following configuration:
+```yaml
+apiVersion: kubeadm.k8s.io/v1beta3
+kind: ClusterConfiguration
+kubernetesVersion: v1.29.0
+controllerManager:
+  extraArgs:
+    feature-gates: NFTablesProxyMode=true
+---
+apiVersion: kubeproxy.config.k8s.io/v1alpha1
+kind: KubeProxyConfiguration
+mode: "nftables"
+featureGates:
+  NFTablesProxyMode: true
+```
diff --git a/Documentation/kube-flannel.yml b/Documentation/kube-flannel.yml
index e3a9c816bd..be0266e28c 100644
--- a/Documentation/kube-flannel.yml
+++ b/Documentation/kube-flannel.yml
@@ -98,6 +98,7 @@ data:
   net-conf.json: |
     {
      "Network": "10.244.0.0/16",
+      "EnableNFTables": false,
      "Backend": {
        "Type": "vxlan"
      }
diff --git a/Makefile b/Makefile
index eaf698a58a..6ef7823fcf 100644
--- a/Makefile
+++ b/Makefile
@@ -6,7 +6,7 @@ QEMU_VERSION=v3.0.0
 BASH_UNIT_VERSION=v2.3.0
 
 # Default tag and architecture.
Can be overridden -TAG?=$(shell git describe --tags --always) +TAG?=$(shell git describe --tags --dirty --always) ARCH?=amd64 # Only enable CGO (and build the UDP backend) on AMD64 ifeq ($(ARCH),amd64) diff --git a/e2e/Dockerfile b/e2e/Dockerfile index 41f6bf6916..09cf9bba6c 100644 --- a/e2e/Dockerfile +++ b/e2e/Dockerfile @@ -11,6 +11,7 @@ RUN set -x \ curl \ tar gzip\ iptables \ + nftables \ iproute2 \ iputils \ && if [ "${ARCH?required}" != "amd64" ]; then \ diff --git a/e2e/run-e2e-tests.sh b/e2e/run-e2e-tests.sh index d23dfa3bbe..bb4a51eb50 100644 --- a/e2e/run-e2e-tests.sh +++ b/e2e/run-e2e-tests.sh @@ -38,11 +38,12 @@ EOF write-flannel-conf(){ local backend=$1 + local enable_nftables=$2 cp ../Documentation/kube-flannel.yml ./kube-flannel.yml yq -i 'select(.kind == "DaemonSet").spec.template.spec.containers[0].image |= strenv(FLANNEL_IMAGE)' ./kube-flannel.yml yq -i 'select(.kind == "DaemonSet").spec.template.spec.initContainers[1].image |= strenv(FLANNEL_IMAGE)' ./kube-flannel.yml - export flannel_conf="{ \"Network\": \"$FLANNEL_NET\", \"Backend\": { \"Type\": \"${backend}\" } }" + export flannel_conf="{ \"Network\": \"$FLANNEL_NET\", \"Backend\": { \"Type\": \"${backend}\" }, \"EnableNFTables\": ${enable_nftables} }" yq -i 'select(.metadata.name == "kube-flannel-cfg").data."net-conf.json" |= strenv(flannel_conf)' ./kube-flannel.yml @@ -55,10 +56,11 @@ write-flannel-conf(){ # This is not used at the moment since github runners don't support dual-stack networking write-flannel-conf-dual-stack(){ local backend=$1 + local enable_nftables=$2 cp ../Documentation/kube-flannel.yml ./kube-flannel.yml yq -i 'select(.kind == "DaemonSet").spec.template.spec.containers[0].image |= strenv(FLANNEL_IMAGE)' ./kube-flannel.yml - export flannel_conf="{ \"EnableIPv6\": true, \"Network\": \"$FLANNEL_NET\", \"IPv6Network\":\"${FLANNEL_IP6NET}\", \"Backend\": { \"Type\": \"${backend}\" } }" + export flannel_conf="{ \"EnableIPv6\": true, \"Network\": \"$FLANNEL_NET\", \"IPv6Network\":\"${FLANNEL_IP6NET}\", \"Backend\": { \"Type\": \"${backend}\" }, \"EnableNFTables\": ${enable_nftables} }" yq -i 'select(.metadata.name == "kube-flannel-cfg").data."net-conf.json" |= strenv(flannel_conf)' ./kube-flannel.yml } @@ -67,6 +69,10 @@ install-flannel() { kubectl --kubeconfig="${HOME}/.kube/config" apply -f ./kube-flannel.yml } +delete-flannel() { + kubectl --kubeconfig="${HOME}/.kube/config" delete -f ./kube-flannel.yml +} + get_pod_ip() { local pod_name=$1 kubectl --kubeconfig="${HOME}/.kube/config" get pod ${pod_name} --template '{{.status.podIP}}' @@ -125,8 +131,9 @@ perf() { prepare_test() { local backend=$1 + local enable_nftables=${2:-false} # install flannel version to test - write-flannel-conf ${backend} + write-flannel-conf ${backend} ${enable_nftables} install-flannel # wait for nodes to be ready @@ -150,18 +157,32 @@ test_vxlan() { prepare_test vxlan pings check_iptables + delete-flannel + check_iptables_removed +} + +test_vxlan_nft() { + prepare_test vxlan true + pings + check_nftables + delete-flannel + check_nftables_removed } test_wireguard() { prepare_test wireguard pings check_iptables + delete-flannel + check_iptables_removed } test_host-gw() { prepare_test host-gw pings check_iptables + delete-flannel + check_iptables_removed } if [[ ${ARCH} == "amd64" ]]; then @@ -169,6 +190,8 @@ test_udp() { prepare_test udp pings check_iptables + delete-flannel + check_iptables_removed } fi @@ -176,6 +199,8 @@ test_ipip() { prepare_test ipip pings check_iptables + delete-flannel + check_iptables_removed } 
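 # (Editorial note, hedged, not part of the original patch: each backend test
 # above now ends with delete-flannel followed by check_iptables_removed or
 # check_nftables_removed, so rule clean-up on flannel shutdown is exercised
 # end-to-end for both the iptables and the new nftables paths.)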
test_perf_vxlan() {
@@ -260,3 +285,95 @@ $(docker exec --privileged local-worker /usr/sbin/iptables -t filter -S FLANNEL-
 "$(docker exec --privileged local-leader /usr/sbin/iptables -t filter -S FORWARD)
 $(docker exec --privileged local-leader /usr/sbin/iptables -t filter -S FLANNEL-FWD)" "Host 2 has not expected forward rules"
 }
+
+check_iptables_removed() {
+    local worker_podcidr=$(get_pod_cidr local-worker)
+    local leader_pod_cidr=$(get_pod_cidr local-leader)
+    read -r -d '' POSTROUTING_RULES_WORKER << EOM
+-N FLANNEL-POSTRTG
+EOM
+    read -r -d '' POSTROUTING_RULES_LEADER << EOM
+-N FLANNEL-POSTRTG
+EOM
+    read -r -d '' FORWARD_RULES << EOM
+-P FORWARD ACCEPT
+-A FORWARD -m conntrack --ctstate NEW -m comment --comment "kubernetes load balancer firewall" -j KUBE-PROXY-FIREWALL
+-A FORWARD -m comment --comment "kubernetes forwarding rules" -j KUBE-FORWARD
+-A FORWARD -m conntrack --ctstate NEW -m comment --comment "kubernetes service portals" -j KUBE-SERVICES
+-A FORWARD -m conntrack --ctstate NEW -m comment --comment "kubernetes externally-visible service portals" -j KUBE-EXTERNAL-SERVICES
+-N FLANNEL-FWD
+EOM
+# check that masquerade & forward rules have been removed
+    assert_equals "$POSTROUTING_RULES_WORKER" \
+        "$(docker exec --privileged local-worker /usr/sbin/iptables -t nat -S POSTROUTING | grep FLANNEL)$(docker exec --privileged local-worker /usr/sbin/iptables -t nat -S FLANNEL-POSTRTG)" "Host 1 does not have expected postrouting rules"
+    assert_equals "$POSTROUTING_RULES_LEADER" \
+        "$(docker exec --privileged local-leader /usr/sbin/iptables -t nat -S POSTROUTING | grep FLANNEL)$(docker exec --privileged local-leader /usr/sbin/iptables -t nat -S FLANNEL-POSTRTG)" "Host 2 does not have expected postrouting rules"
+    assert_equals "$FORWARD_RULES" \
+        "$(docker exec --privileged local-worker /usr/sbin/iptables -t filter -S FORWARD)
+$(docker exec --privileged local-worker /usr/sbin/iptables -t filter -S FLANNEL-FWD -w 5)" "Host 1 does not have expected forward rules"
+    assert_equals "$FORWARD_RULES" \
+        "$(docker exec --privileged local-leader /usr/sbin/iptables -t filter -S FORWARD)
+$(docker exec --privileged local-leader /usr/sbin/iptables -t filter -S FLANNEL-FWD)" "Host 2 does not have expected forward rules"
+}
+
+###nftables
+check_nftables() {
+    local worker_podcidr=$(get_pod_cidr local-worker)
+    local leader_podcidr=$(get_pod_cidr local-leader)
+    read -r -d '' POSTROUTING_RULES_WORKER << EOM
+table ip flannel-ipv4 {
+	chain postrtg {
+		type nat hook postrouting priority srcnat; policy accept;
+		meta mark 0x00004000 return
+		ip saddr ${worker_podcidr} ip daddr 10.42.0.0/16 return
+		ip saddr 10.42.0.0/16 ip daddr ${worker_podcidr} return
+		ip saddr != ${worker_podcidr} ip daddr 10.42.0.0/16 return
+		ip saddr 10.42.0.0/16 ip daddr != 224.0.0.0/4 masquerade fully-random
+		ip saddr != 10.42.0.0/16 ip daddr 10.42.0.0/16 masquerade fully-random
+	}
+}
+EOM
+    read -r -d '' POSTROUTING_RULES_LEADER << EOM
+table ip flannel-ipv4 {
+	chain postrtg {
+		type nat hook postrouting priority srcnat; policy accept;
+		meta mark 0x00004000 return
+		ip saddr ${leader_podcidr} ip daddr 10.42.0.0/16 return
+		ip saddr 10.42.0.0/16 ip daddr ${leader_podcidr} return
+		ip saddr != ${leader_podcidr} ip daddr 10.42.0.0/16 return
+		ip saddr 10.42.0.0/16 ip daddr != 224.0.0.0/4 masquerade fully-random
+		ip saddr != 10.42.0.0/16 ip daddr 10.42.0.0/16 masquerade fully-random
+	}
+}
+EOM
+    read -r -d '' FORWARD_RULES << EOM
+table ip flannel-ipv4 {
+	chain forward {
+		type filter hook input priority filter; policy accept;
+		ip saddr
10.42.0.0/16 accept + ip daddr 10.42.0.0/16 accept + } +} +EOM + # check masquerade & forward rules + assert_equals "$POSTROUTING_RULES_WORKER" \ + "$(docker exec --privileged local-worker /usr/sbin/nft list chain flannel-ipv4 postrtg)" "Node worker does not have expected postrouting rules" + assert_equals "$POSTROUTING_RULES_LEADER" \ + "$(docker exec --privileged local-leader /usr/sbin/nft list chain flannel-ipv4 postrtg)" "Node leader does not have expected postrouting rules" + assert_equals "$FORWARD_RULES" \ + "$(docker exec --privileged local-worker /usr/sbin/nft list chain flannel-ipv4 forward)" "Node worker does not have expected forward rules" + assert_equals "$FORWARD_RULES" \ + "$(docker exec --privileged local-leader /usr/sbin/nft list chain flannel-ipv4 forward)" "Node leader does not have expected forward rules" +} + +check_nftables_removed() { + # check masquerade & forward rules + assert_equals "" \ + "$(docker exec --privileged local-worker /usr/sbin/nft list chain flannel-ipv4 postrtg)" "Node worker has unexpected postrouting rules" + assert_equals "" \ + "$(docker exec --privileged local-leader /usr/sbin/nft list chain flannel-ipv4 postrtg)" "Node leader has unexpected postrouting rules" + assert_equals "" \ + "$(docker exec --privileged local-worker /usr/sbin/nft list chain flannel-ipv4 forward)" "Node worker has unexpected forward rules" + assert_equals "" \ + "$(docker exec --privileged local-leader /usr/sbin/nft list chain flannel-ipv4 forward)" "Node leader has unexpected forward rules" +} \ No newline at end of file diff --git a/go.mod b/go.mod index 6d9205d63d..d41b5a0674 100644 --- a/go.mod +++ b/go.mod @@ -35,6 +35,7 @@ require ( github.com/avast/retry-go/v4 v4.5.1 github.com/tencentcloud/tencentcloud-sdk-go/tencentcloud/common v1.0.872 github.com/tencentcloud/tencentcloud-sdk-go/tencentcloud/vpc v1.0.872 + sigs.k8s.io/knftables v0.0.14 ) require ( diff --git a/go.sum b/go.sum index 520673ae92..8293d45f32 100644 --- a/go.sum +++ b/go.sum @@ -255,6 +255,8 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/lithammer/dedent v1.1.0 h1:VNzHMVCBNG1j0fh3OrsFRkVUwStdDArbgBWoPAffktY= +github.com/lithammer/dedent v1.1.0/go.mod h1:jrXYCQtgg0nJiN+StA2KgR7w6CiQNv9Fd/Z9BP0jIOc= github.com/mailru/easyjson v0.0.0-20190614124828-94de47d64c63/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= github.com/mailru/easyjson v0.0.0-20190626092158-b2ccc519800e/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= github.com/mailru/easyjson v0.7.6 h1:8yTIVnZgCoiM1TgqoeTl+LfU5Jg6/xL3QhGQnimLYnA= @@ -781,6 +783,8 @@ rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= rsc.io/sampler v1.3.0/go.mod h1:T1hPZKmBbMNahiBKFy5HrXp6adAjACjK9JXDnKaTXpA= sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2 h1:iXTIw73aPyC+oRdyqqvVJuloN1p0AC/kzH07hu3NE+k= sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= +sigs.k8s.io/knftables v0.0.14 h1:VzKQoDMCGBOH8c85sGrWSXSPCS0XrIpEfOlcCLBXiC0= +sigs.k8s.io/knftables v0.0.14/go.mod h1:f/5ZLKYEUPUhVjUCg6l80ACdL7CIIyeL0DxfgojGRTk= sigs.k8s.io/structured-merge-diff/v4 v4.2.3 h1:PRbqxJClWWYMNV1dhaG4NsibJbArud9kFxnAMREiWFE= sigs.k8s.io/structured-merge-diff/v4 v4.2.3/go.mod 
h1:qjx8mGObPmV2aSZepjQjbmb2ihdVs8cGKBraizNC69E=
sigs.k8s.io/yaml v1.3.0 h1:a2VclLzOGrwOHDiV8EfBGhvjHvP46CtW5j6POvhYGGo=
diff --git a/images/Dockerfile b/images/Dockerfile
index fd60008878..3c8d248120 100644
--- a/images/Dockerfile
+++ b/images/Dockerfile
@@ -27,7 +27,7 @@ RUN export GOOS=$(xx-info os) &&\
 FROM alpine:20231219
 RUN apk update && apk upgrade
-RUN apk add --no-cache iproute2 ca-certificates iptables strongswan iptables-legacy && update-ca-certificates
+RUN apk add --no-cache iproute2 ca-certificates nftables iptables strongswan iptables-legacy && update-ca-certificates
 RUN apk add wireguard-tools --no-cache --repository http://dl-cdn.alpinelinux.org/alpine/edge/community
 COPY --from=build /build/dist/flanneld /opt/bin/flanneld
 COPY dist/mk-docker-opts.sh /opt/bin/
diff --git a/main.go b/main.go
index 94a95ba80e..702a2b476a 100644
--- a/main.go
+++ b/main.go
@@ -36,6 +36,7 @@ import (
 	"github.com/flannel-io/flannel/pkg/subnet/kube"
 	"github.com/flannel-io/flannel/pkg/trafficmngr"
 	"github.com/flannel-io/flannel/pkg/trafficmngr/iptables"
+	"github.com/flannel-io/flannel/pkg/trafficmngr/nftables"
 	"github.com/flannel-io/flannel/pkg/version"
 	"golang.org/x/net/context"
 	log "k8s.io/klog/v2"
@@ -336,7 +337,15 @@ func main() {
 	}
 
 	//Create TrafficManager and instantiate it based on whether we use iptables or nftables
-	trafficMngr := newTrafficManager()
+	trafficMngr := newTrafficManager(config.EnableNFTables)
+	err = trafficMngr.Init(ctx, &wg)
+	if err != nil {
+		log.Error(err)
+		cancel()
+		wg.Wait()
+		os.Exit(1)
+	}
+
 	flannelIPv4Net := ip.IP4Net{}
 	flannelIpv6Net := ip.IP6Net{}
 	if config.EnableIPv4 {
@@ -365,7 +374,8 @@
 	prevIPv6Networks := ReadIP6CIDRsFromSubnetFile(opts.subnetFile, "FLANNEL_IPV6_NETWORK")
 	prevIPv6Subnet := ReadIP6CIDRFromSubnetFile(opts.subnetFile, "FLANNEL_IPV6_SUBNET")
 
-	err = trafficMngr.SetupAndEnsureMasqRules(flannelIPv4Net, prevSubnet,
+	err = trafficMngr.SetupAndEnsureMasqRules(ctx,
+		flannelIPv4Net, prevSubnet,
 		prevNetworks,
 		flannelIpv6Net, prevIPv6Subnet,
 		prevIPv6Networks,
@@ -383,7 +393,7 @@
 	// In Docker 1.12 and earlier, the default FORWARD chain policy was ACCEPT.
 	// In Docker 1.13 and later, Docker sets the default policy of the FORWARD chain to DROP.
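 	// (Editorial note, hedged, not part of the original patch: the ctx threaded
 	// through SetupAndEnsureForwardRules below is the same cancellable context
 	// passed to trafficMngr.Init above, so the resync goroutines stop on
 	// shutdown and the clean-up registered in Init can then remove the rules.)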
if opts.iptablesForwardRules { - trafficMngr.SetupAndEnsureForwardRules( + trafficMngr.SetupAndEnsureForwardRules(ctx, flannelIPv4Net, flannelIpv6Net, opts.iptablesResyncSeconds) @@ -569,6 +579,13 @@ func ReadIP6CIDRsFromSubnetFile(path string, CIDRKey string) []ip.IP6Net { return prevCIDRs } -func newTrafficManager() trafficmngr.TrafficManager { - return iptables.IPTablesManager{} +func newTrafficManager(useNftables bool) trafficmngr.TrafficManager { + if useNftables { + log.Info("Starting flannel in nftables mode") + return &nftables.NFTablesManager{} + } else { + log.Info("Starting flannel in iptables mode") + return &iptables.IPTablesManager{} + + } } diff --git a/pkg/subnet/config.go b/pkg/subnet/config.go index 5b8357cec4..b7a1afe270 100644 --- a/pkg/subnet/config.go +++ b/pkg/subnet/config.go @@ -27,20 +27,21 @@ import ( ) type Config struct { - EnableIPv4 bool - EnableIPv6 bool - Network ip.IP4Net - IPv6Network ip.IP6Net - Networks []ip.IP4Net - IPv6Networks []ip.IP6Net - SubnetMin ip.IP4 - SubnetMax ip.IP4 - IPv6SubnetMin *ip.IP6 - IPv6SubnetMax *ip.IP6 - SubnetLen uint - IPv6SubnetLen uint - BackendType string `json:"-"` - Backend json.RawMessage `json:",omitempty"` + EnableIPv4 bool + EnableIPv6 bool + EnableNFTables bool + Network ip.IP4Net + IPv6Network ip.IP6Net + Networks []ip.IP4Net + IPv6Networks []ip.IP6Net + SubnetMin ip.IP4 + SubnetMax ip.IP4 + IPv6SubnetMin *ip.IP6 + IPv6SubnetMax *ip.IP6 + SubnetLen uint + IPv6SubnetLen uint + BackendType string `json:"-"` + Backend json.RawMessage `json:",omitempty"` } func parseBackendType(be json.RawMessage) (string, error) { diff --git a/pkg/trafficmngr/iptables/iptables.go b/pkg/trafficmngr/iptables/iptables.go index d2a9dd4bea..8126940d4d 100644 --- a/pkg/trafficmngr/iptables/iptables.go +++ b/pkg/trafficmngr/iptables/iptables.go @@ -17,7 +17,9 @@ package iptables import ( + "context" "fmt" + "sync" "time" "github.com/coreos/go-iptables/iptables" @@ -40,11 +42,67 @@ type IPTablesError interface { Error() string } -type IPTablesManager struct{} +type IPTablesManager struct { + ipv4Rules []trafficmngr.IPTablesRule + ipv6Rules []trafficmngr.IPTablesRule +} + +func (iptm *IPTablesManager) Init(ctx context.Context, wg *sync.WaitGroup) error { + iptm.ipv4Rules = make([]trafficmngr.IPTablesRule, 0, 10) + iptm.ipv6Rules = make([]trafficmngr.IPTablesRule, 0, 10) + wg.Add(1) + go func() { + <-ctx.Done() + time.Sleep(time.Second) + err := iptm.cleanUp() + if err != nil { + log.Errorf("iptables: error while cleaning-up: %v", err) + } + wg.Done() + }() -const kubeProxyMark string = "0x4000/0x4000" + return nil +} + +func (iptm *IPTablesManager) cleanUp() error { + if len(iptm.ipv4Rules) > 0 { + ipt, err := iptables.New() + if err != nil { + // if we can't find iptables, give up and return + return fmt.Errorf("failed to setup IPTables. iptables binary was not found: %v", err) + } + iptRestore, err := NewIPTablesRestoreWithProtocol(iptables.ProtocolIPv4) + if err != nil { + // if we can't find iptables-restore, give up and return + return fmt.Errorf("failed to setup IPTables. iptables-restore binary was not found: %v", err) + } + log.Info("iptables (ipv4): cleaning-up before exiting flannel...") + err = teardownIPTables(ipt, iptRestore, iptm.ipv4Rules) + if err != nil { + log.Errorf("Failed to tear down IPTables: %v", err) + } + } + if len(iptm.ipv6Rules) > 0 { + ipt, err := iptables.NewWithProtocol(iptables.ProtocolIPv6) + if err != nil { + // if we can't find iptables, give up and return + return fmt.Errorf("failed to setup IPTables. 
iptables binary was not found: %v", err) + } + iptRestore, err := NewIPTablesRestoreWithProtocol(iptables.ProtocolIPv6) + if err != nil { + // if we can't find iptables-restore, give up and return + return fmt.Errorf("failed to setup IPTables. iptables-restore binary was not found: %v", err) + } + log.Info("iptables (ipv6): cleaning-up before exiting flannel...") + err = teardownIPTables(ipt, iptRestore, iptm.ipv6Rules) + if err != nil { + log.Errorf("Failed to tear down IPTables: %v", err) + } + } + return nil +} -func (iptm IPTablesManager) SetupAndEnsureMasqRules(flannelIPv4Net, prevSubnet ip.IP4Net, +func (iptm *IPTablesManager) SetupAndEnsureMasqRules(ctx context.Context, flannelIPv4Net, prevSubnet ip.IP4Net, prevNetworks []ip.IP4Net, flannelIPv6Net, prevIPv6Subnet ip.IP6Net, prevIPv6Networks []ip.IP6Net, @@ -78,7 +136,7 @@ func (iptm IPTablesManager) SetupAndEnsureMasqRules(flannelIPv4Net, prevSubnet i getRules := func() []trafficmngr.IPTablesRule { return iptm.masqRules([]ip.IP4Net{flannelIPv4Net}, currentlease) } - go iptm.setupAndEnsureIP4Tables(getRules, resyncPeriod) + go iptm.setupAndEnsureIP4Tables(ctx, getRules, resyncPeriod) } if !flannelIPv6Net.Empty() { //Find the cidr in FLANNEL_IPV6_NETWORK which contains the podCIDR (i.e. FLANNEL_IPV6_SUBNET) of this node @@ -107,12 +165,12 @@ func (iptm IPTablesManager) SetupAndEnsureMasqRules(flannelIPv4Net, prevSubnet i getRules := func() []trafficmngr.IPTablesRule { return iptm.masqIP6Rules([]ip.IP6Net{flannelIPv6Net}, currentlease) } - go iptm.setupAndEnsureIP6Tables(getRules, resyncPeriod) + go iptm.setupAndEnsureIP6Tables(ctx, getRules, resyncPeriod) } return nil } -func (iptm IPTablesManager) masqRules(cluster_cidrs []ip.IP4Net, lease *lease.Lease) []trafficmngr.IPTablesRule { +func (iptm *IPTablesManager) masqRules(cluster_cidrs []ip.IP4Net, lease *lease.Lease) []trafficmngr.IPTablesRule { pod_cidr := lease.Subnet.String() ipt, err := iptables.New() supports_random_fully := false @@ -123,7 +181,7 @@ func (iptm IPTablesManager) masqRules(cluster_cidrs []ip.IP4Net, lease *lease.Le // This rule ensure that the flannel iptables rules are executed before other rules on the node rules[0] = trafficmngr.IPTablesRule{Table: "nat", Action: "-A", Chain: "POSTROUTING", Rulespec: []string{"-m", "comment", "--comment", "flanneld masq", "-j", "FLANNEL-POSTRTG"}} // This rule will not masquerade traffic marked by the kube-proxy to avoid double NAT bug on some kernel version - rules[1] = trafficmngr.IPTablesRule{Table: "nat", Action: "-A", Chain: "FLANNEL-POSTRTG", Rulespec: []string{"-m", "mark", "--mark", kubeProxyMark, "-m", "comment", "--comment", "flanneld masq", "-j", "RETURN"}} + rules[1] = trafficmngr.IPTablesRule{Table: "nat", Action: "-A", Chain: "FLANNEL-POSTRTG", Rulespec: []string{"-m", "mark", "--mark", trafficmngr.KubeProxyMark, "-m", "comment", "--comment", "flanneld masq", "-j", "RETURN"}} for _, ccidr := range cluster_cidrs { cluster_cidr := ccidr.String() // This rule makes sure we don't NAT traffic within overlay network (e.g. 
coming out of docker0), for any of the cluster_cidrs @@ -158,7 +216,7 @@ func (iptm IPTablesManager) masqRules(cluster_cidrs []ip.IP4Net, lease *lease.Le return rules } -func (iptm IPTablesManager) masqIP6Rules(cluster_cidrs []ip.IP6Net, lease *lease.Lease) []trafficmngr.IPTablesRule { +func (iptm *IPTablesManager) masqIP6Rules(cluster_cidrs []ip.IP6Net, lease *lease.Lease) []trafficmngr.IPTablesRule { pod_cidr := lease.IPv6Subnet.String() ipt, err := iptables.NewWithProtocol(iptables.ProtocolIPv6) supports_random_fully := false @@ -170,7 +228,7 @@ func (iptm IPTablesManager) masqIP6Rules(cluster_cidrs []ip.IP6Net, lease *lease // This rule ensure that the flannel iptables rules are executed before other rules on the node rules[0] = trafficmngr.IPTablesRule{Table: "nat", Action: "-A", Chain: "POSTROUTING", Rulespec: []string{"-m", "comment", "--comment", "flanneld masq", "-j", "FLANNEL-POSTRTG"}} // This rule will not masquerade traffic marked by the kube-proxy to avoid double NAT bug on some kernel version - rules[1] = trafficmngr.IPTablesRule{Table: "nat", Action: "-A", Chain: "FLANNEL-POSTRTG", Rulespec: []string{"-m", "mark", "--mark", kubeProxyMark, "-m", "comment", "--comment", "flanneld masq", "-j", "RETURN"}} + rules[1] = trafficmngr.IPTablesRule{Table: "nat", Action: "-A", Chain: "FLANNEL-POSTRTG", Rulespec: []string{"-m", "mark", "--mark", trafficmngr.KubeProxyMark, "-m", "comment", "--comment", "flanneld masq", "-j", "RETURN"}} for _, ccidr := range cluster_cidrs { cluster_cidr := ccidr.String() @@ -209,14 +267,14 @@ func (iptm IPTablesManager) masqIP6Rules(cluster_cidrs []ip.IP6Net, lease *lease return rules } -func (iptm IPTablesManager) SetupAndEnsureForwardRules(flannelIPv4Network ip.IP4Net, flannelIPv6Network ip.IP6Net, resyncPeriod int) { +func (iptm *IPTablesManager) SetupAndEnsureForwardRules(ctx context.Context, flannelIPv4Network ip.IP4Net, flannelIPv6Network ip.IP6Net, resyncPeriod int) { if !flannelIPv4Network.Empty() { log.Infof("Changing default FORWARD chain policy to ACCEPT") iptm.CreateIP4Chain("filter", "FLANNEL-FWD") getRules := func() []trafficmngr.IPTablesRule { return iptm.forwardRules(flannelIPv4Network.String()) } - go iptm.setupAndEnsureIP4Tables(getRules, resyncPeriod) + go iptm.setupAndEnsureIP4Tables(ctx, getRules, resyncPeriod) } if !flannelIPv6Network.Empty() { log.Infof("IPv6: Changing default FORWARD chain policy to ACCEPT") @@ -224,11 +282,11 @@ func (iptm IPTablesManager) SetupAndEnsureForwardRules(flannelIPv4Network ip.IP4 getRules := func() []trafficmngr.IPTablesRule { return iptm.forwardRules(flannelIPv6Network.String()) } - go iptm.setupAndEnsureIP6Tables(getRules, resyncPeriod) + go iptm.setupAndEnsureIP6Tables(ctx, getRules, resyncPeriod) } } -func (iptm IPTablesManager) forwardRules(flannelNetwork string) []trafficmngr.IPTablesRule { +func (iptm *IPTablesManager) forwardRules(flannelNetwork string) []trafficmngr.IPTablesRule { return []trafficmngr.IPTablesRule{ // This rule ensure that the flannel iptables rules are executed before other rules on the node {Table: "filter", Action: "-A", Chain: "FORWARD", Rulespec: []string{"-m", "comment", "--comment", "flanneld forward", "-j", "FLANNEL-FWD"}}, @@ -238,7 +296,7 @@ func (iptm IPTablesManager) forwardRules(flannelNetwork string) []trafficmngr.IP } } -func (iptm IPTablesManager) CreateIP4Chain(table, chain string) { +func (iptm *IPTablesManager) CreateIP4Chain(table, chain string) { ipt, err := iptables.New() if err != nil { // if we can't find iptables, give up and return @@ -253,7 +311,7 
@@ func (iptm IPTablesManager) CreateIP4Chain(table, chain string) {
 	}
 }
 
-func (iptm IPTablesManager) CreateIP6Chain(table, chain string) {
+func (iptm *IPTablesManager) CreateIP6Chain(table, chain string) {
 	ipt, err := iptables.NewWithProtocol(iptables.ProtocolIPv6)
 	if err != nil {
 		// if we can't find iptables, give up and return
@@ -368,9 +426,8 @@ func ipTablesBootstrap(ipt IPTables, iptRestore IPTablesRestore, rules []traffic
 	return nil
 }
 
-func (iptm IPTablesManager) setupAndEnsureIP4Tables(getRules func() []trafficmngr.IPTablesRule, resyncPeriod int) {
+func (iptm *IPTablesManager) setupAndEnsureIP4Tables(ctx context.Context, getRules func() []trafficmngr.IPTablesRule, resyncPeriod int) {
 	rules := getRules()
-	log.Infof("generated %d rules", len(rules))
 	ipt, err := iptables.New()
 	if err != nil {
 		// if we can't find iptables, give up and return
@@ -390,24 +447,23 @@
 		log.Errorf("Failed to bootstrap IPTables: %v", err)
 	}
 
-	defer func() {
-		err := teardownIPTables(ipt, iptRestore, rules)
-		if err != nil {
-			log.Errorf("Failed to tear down IPTables: %v", err)
-		}
-	}()
-
+	iptm.ipv4Rules = append(iptm.ipv4Rules, rules...)
 	for {
-		// Ensure that all the iptables rules exist every 5 seconds
-		if err := ensureIPTables(ipt, iptRestore, getRules()); err != nil {
-			log.Errorf("Failed to ensure iptables rules: %v", err)
+		select {
+		case <-ctx.Done():
+			//clean-up is set up in Init
+			return
+		case <-time.After(time.Duration(resyncPeriod) * time.Second):
+			// Ensure that all the iptables rules exist every resyncPeriod seconds
+			if err := ensureIPTables(ipt, iptRestore, rules); err != nil {
+				log.Errorf("Failed to ensure iptables rules: %v", err)
+			}
 		}
-		time.Sleep(time.Duration(resyncPeriod) * time.Second)
 	}
 }
 
-func (iptm IPTablesManager) setupAndEnsureIP6Tables(getRules func() []trafficmngr.IPTablesRule, resyncPeriod int) {
+func (iptm *IPTablesManager) setupAndEnsureIP6Tables(ctx context.Context, getRules func() []trafficmngr.IPTablesRule, resyncPeriod int) {
 	rules := getRules()
 	ipt, err := iptables.NewWithProtocol(iptables.ProtocolIPv6)
 	if err != nil {
@@ -427,26 +483,24 @@
 		// if we can't find iptables, give up and return
 		log.Errorf("Failed to bootstrap IPTables: %v", err)
 	}
-
-	defer func() {
-		err := teardownIPTables(ipt, iptRestore, rules)
-		if err != nil {
-			log.Errorf("Failed to tear down IPTables: %v", err)
-		}
-	}()
+	iptm.ipv6Rules = append(iptm.ipv6Rules, rules...)
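+	// (Editorial note, hedged, not part of the original patch: as in the IPv4
+	// variant above, the generated rules are cached on the manager so that
+	// cleanUp(), registered in Init, can tear them down once the context is
+	// cancelled; the loop below only re-ensures them periodically.)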
 	for {
-		// Ensure that all the iptables rules exist every 5 seconds
-		if err := ensureIPTables(ipt, iptRestore, getRules()); err != nil {
-			log.Errorf("Failed to ensure iptables rules: %v", err)
+		select {
+		case <-ctx.Done():
+			//clean-up is set up in Init
+			return
+		case <-time.After(time.Duration(resyncPeriod) * time.Second):
+			// Ensure that all the iptables rules exist every resyncPeriod seconds
+			if err := ensureIPTables(ipt, iptRestore, getRules()); err != nil {
+				log.Errorf("Failed to ensure iptables rules: %v", err)
+			}
 		}
-
-		time.Sleep(time.Duration(resyncPeriod) * time.Second)
 	}
 }
 
 // deleteIP4Tables delete specified iptables rules
-func (iptm IPTablesManager) deleteIP4Tables(rules []trafficmngr.IPTablesRule) error {
+func (iptm *IPTablesManager) deleteIP4Tables(rules []trafficmngr.IPTablesRule) error {
 	ipt, err := iptables.New()
 	if err != nil {
 		// if we can't find iptables, give up and return
@@ -468,7 +522,7 @@ func (iptm IPTablesManager) deleteIP4Tables(rules []trafficmngr.IPTablesRule) er
 }
 
 // deleteIP6Tables delete specified iptables rules
-func (iptm IPTablesManager) deleteIP6Tables(rules []trafficmngr.IPTablesRule) error {
+func (iptm *IPTablesManager) deleteIP6Tables(rules []trafficmngr.IPTablesRule) error {
 	ipt, err := iptables.NewWithProtocol(iptables.ProtocolIPv6)
 	if err != nil {
 		// if we can't find iptables, give up and return
@@ -539,6 +593,7 @@ func teardownIPTables(ipt IPTables, iptr IPTablesRestore, rules []trafficmngr.IP
 		// this shouldn't ever happen
 		return fmt.Errorf("failed to check rule existence: %v", err)
 	}
+
 	if exists {
 		if _, ok := tablesRules[rule.Table]; !ok {
 			tablesRules[rule.Table] = []IPTablesRestoreRuleSpec{}
diff --git a/pkg/trafficmngr/iptables/iptables_restore.go b/pkg/trafficmngr/iptables/iptables_restore.go
index 75973a6212..aa6e42a587 100644
--- a/pkg/trafficmngr/iptables/iptables_restore.go
+++ b/pkg/trafficmngr/iptables/iptables_restore.go
@@ -23,6 +23,7 @@ import (
 	"os/exec"
 	"regexp"
 	"strconv"
+	"sync"
 
 	"github.com/coreos/go-iptables/iptables"
 	log "k8s.io/klog/v2"
@@ -46,6 +47,10 @@ type ipTablesRestore struct {
 	path    string
 	proto   iptables.Protocol
 	hasWait bool
+	// ipTablesRestore needs a mutex to avoid collisions between two
+	// goroutines calling ApplyWithoutFlush in parallel.
+ // This could result in the second call accidentally restoring a rule removed by the first + mu sync.Mutex } // IPTablesRestoreRules represents iptables-restore table block @@ -75,12 +80,15 @@ func NewIPTablesRestoreWithProtocol(protocol iptables.Protocol) (IPTablesRestore path: path, proto: protocol, hasWait: hasWait, + mu: sync.Mutex{}, } return &ipt, nil } // ApplyWithoutFlush apply without flush chains func (iptr *ipTablesRestore) ApplyWithoutFlush(rules IPTablesRestoreRules) error { + iptr.mu.Lock() + defer iptr.mu.Unlock() payload := buildIPTablesRestorePayload(rules) log.V(6).Infof("trying to run with payload %s", payload) diff --git a/pkg/trafficmngr/iptables/iptables_windows.go b/pkg/trafficmngr/iptables/iptables_windows.go index c63900b4b9..4d55415a06 100644 --- a/pkg/trafficmngr/iptables/iptables_windows.go +++ b/pkg/trafficmngr/iptables/iptables_windows.go @@ -15,8 +15,14 @@ package iptables import ( + "context" + "sync" + + log "k8s.io/klog/v2" + "github.com/flannel-io/flannel/pkg/ip" "github.com/flannel-io/flannel/pkg/lease" + "github.com/flannel-io/flannel/pkg/trafficmngr" ) type IPTablesManager struct{} @@ -29,14 +35,19 @@ type IPTables interface { Exists(table string, chain string, rulespec ...string) (bool, error) } -func (iptm IPTablesManager) SetupAndEnsureForwardRules(flannelIPv4Network ip.IP4Net, flannelIPv6Network ip.IP6Net, resyncPeriod int) { +func (iptm IPTablesManager) Init(ctx context.Context, wg *sync.WaitGroup) error { + return trafficmngr.ErrUnimplemented +} + +func (iptm *IPTablesManager) SetupAndEnsureForwardRules(ctx context.Context, flannelIPv4Network ip.IP4Net, flannelIPv6Network ip.IP6Net, resyncPeriod int) { } -func (iptm IPTablesManager) SetupAndEnsureMasqRules(flannelIPv4Net, prevSubnet ip.IP4Net, +func (iptm *IPTablesManager) SetupAndEnsureMasqRules(ctx context.Context, flannelIPv4Net, prevSubnet ip.IP4Net, prevNetworks []ip.IP4Net, flannelIPv6Net, prevIPv6Subnet ip.IP6Net, prevIPv6Networks []ip.IP6Net, currentlease *lease.Lease, resyncPeriod int) error { + log.Error(trafficmngr.ErrUnimplemented) return nil } diff --git a/pkg/trafficmngr/nftables/nftables.go b/pkg/trafficmngr/nftables/nftables.go new file mode 100644 index 0000000000..41ab505ca6 --- /dev/null +++ b/pkg/trafficmngr/nftables/nftables.go @@ -0,0 +1,317 @@ +// Copyright 2024 flannel authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
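Editor's aside, for reviewers new to `knftables`: the manager added below drives nftables exclusively through transactions that are applied atomically by `Run`. A minimal standalone sketch of that pattern, assuming `sigs.k8s.io/knftables` as pinned in `go.mod` above (the table name `demo-table` is hypothetical):

```go
package main

import (
	"context"
	"fmt"

	"sigs.k8s.io/knftables"
)

func main() {
	// New only binds to a (family, table) pair and checks that nft is usable;
	// it does not create the table yet.
	nft, err := knftables.New(knftables.IPv4Family, "demo-table")
	if err != nil {
		fmt.Println("no nftables support:", err)
		return
	}
	// Changes are collected in a transaction and applied atomically by Run,
	// which is how initTable below creates the flannel tables.
	tx := nft.NewTransaction()
	tx.Add(&knftables.Table{Comment: knftables.PtrTo("demo table")})
	if err := nft.Run(context.TODO(), tx); err != nil {
		fmt.Println("couldn't create table:", err)
	}
}
```

The pattern is always the same: bind, batch the changes, run, and inspect the single returned error.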
+//go:build !windows
+// +build !windows
+
+package nftables
+
+import (
+	"context"
+	"fmt"
+	"sync"
+	"time"
+
+	log "k8s.io/klog/v2"
+
+	"github.com/flannel-io/flannel/pkg/ip"
+	"github.com/flannel-io/flannel/pkg/lease"
+	"sigs.k8s.io/knftables"
+)
+
+const (
+	ipv4Table    = "flannel-ipv4"
+	ipv6Table    = "flannel-ipv6"
+	forwardChain = "forward"
+	postrtgChain = "postrtg"
+	//maximum delay in seconds to clean-up when the context is cancelled
+	cleanUpDeadline = 15
+)
+
+type NFTablesManager struct {
+	nftv4 knftables.Interface
+	nftv6 knftables.Interface
+}
+
+func (nftm *NFTablesManager) Init(ctx context.Context, wg *sync.WaitGroup) error {
+	var err error
+	nftm.nftv4, err = initTable(ctx, knftables.IPv4Family, ipv4Table)
+	if err != nil {
+		return err
+	}
+	nftm.nftv6, err = initTable(ctx, knftables.IPv6Family, ipv6Table)
+	if err != nil {
+		return err
+	}
+
+	wg.Add(1)
+	go func() {
+		<-ctx.Done()
+		log.Info("Cleaning-up flannel tables...")
+
+		cleanupCtx, cleanUpCancelFunc := context.WithTimeout(context.Background(), cleanUpDeadline*time.Second)
+		defer cleanUpCancelFunc()
+		err := nftm.cleanUp(cleanupCtx)
+		if err != nil {
+			log.Errorf("nftables: error while cleaning-up: %v", err)
+		}
+		wg.Done()
+	}()
+	return nil
+}
+
+// creates a new table and returns the interface to interact with it
+func initTable(ctx context.Context, ipFamily knftables.Family, name string) (knftables.Interface, error) {
+	nft, err := knftables.New(ipFamily, name)
+	if err != nil {
+		return nil, fmt.Errorf("no nftables support: %v", err)
+	}
+	tx := nft.NewTransaction()
+
+	tx.Add(&knftables.Table{
+		Comment: knftables.PtrTo("rules for " + name),
+	})
+	err = nft.Run(ctx, tx)
+	if err != nil {
+		return nil, fmt.Errorf("nftables: couldn't initialise table %s: %v", name, err)
+	}
+	return nft, nil
+}
+
+// Is this needed when using nftables? accept seems to be the default
+// warning: never add a default 'drop' policy on the forwardChain as it breaks connectivity to the node
+func (nftm *NFTablesManager) SetupAndEnsureForwardRules(ctx context.Context,
+	flannelIPv4Network ip.IP4Net, flannelIPv6Network ip.IP6Net, resyncPeriod int) {
+	if !flannelIPv4Network.Empty() {
+		log.Infof("Changing default FORWARD chain policy to ACCEPT")
+		tx := nftm.nftv4.NewTransaction()
+
+		//TODO how to express that the default is drop?
+		tx.Add(&knftables.Chain{
+			Name:     forwardChain,
+			Comment:  knftables.PtrTo("chain to accept flannel traffic"),
+			Type:     knftables.PtrTo(knftables.FilterType),
+			Hook:     knftables.PtrTo(knftables.InputHook),
+			Priority: knftables.PtrTo(knftables.FilterPriority),
+		})
+		tx.Flush(&knftables.Chain{
+			Name: forwardChain,
+		})
+
+		tx.Add(&knftables.Rule{
+			Chain: forwardChain,
+			Rule: knftables.Concat(
+				"ip saddr", flannelIPv4Network.String(),
+				"accept",
+			),
+		})
+		tx.Add(&knftables.Rule{
+			Chain: forwardChain,
+			Rule: knftables.Concat(
+				"ip daddr", flannelIPv4Network.String(),
+				"accept",
+			),
+		})
+		err := nftm.nftv4.Run(ctx, tx)
+		if err != nil {
+			log.Errorf("nftables: couldn't setup forward rules: %v", err)
+		}
+	}
+	if !flannelIPv6Network.Empty() {
+		log.Infof("Changing default FORWARD chain policy to ACCEPT (ipv6)")
+		tx := nftm.nftv6.NewTransaction()
+
+		//TODO how to express that the default is drop?
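+		// (Editorial note, hedged, not part of the original patch: nft base
+		// chains created without an explicit policy default to "policy accept";
+		// also note that an accept verdict here does not exempt a packet from
+		// a drop in another table's base chain, which may be what the TODO
+		// above is getting at.)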
+		tx.Add(&knftables.Chain{
+			Name:     forwardChain,
+			Comment:  knftables.PtrTo("chain to accept flannel traffic"),
+			Type:     knftables.PtrTo(knftables.FilterType),
+			Hook:     knftables.PtrTo(knftables.InputHook),
+			Priority: knftables.PtrTo(knftables.FilterPriority),
+		})
+		tx.Flush(&knftables.Chain{
+			Name: forwardChain,
+		})
+
+		tx.Add(&knftables.Rule{
+			Chain: forwardChain,
+			Rule: knftables.Concat(
+				"ip6 saddr", flannelIPv6Network.String(),
+				"accept",
+			),
+		})
+		tx.Add(&knftables.Rule{
+			Chain: forwardChain,
+			Rule: knftables.Concat(
+				"ip6 daddr", flannelIPv6Network.String(),
+				"accept",
+			),
+		})
+		err := nftm.nftv6.Run(ctx, tx)
+		if err != nil {
+			log.Errorf("nftables: couldn't setup forward rules (ipv6): %v", err)
+		}
+	}
+}
+
+func (nftm *NFTablesManager) SetupAndEnsureMasqRules(ctx context.Context, flannelIPv4Net, prevSubnet ip.IP4Net,
+	prevNetworks []ip.IP4Net,
+	flannelIPv6Net, prevIPv6Subnet ip.IP6Net,
+	prevIPv6Networks []ip.IP6Net,
+	currentlease *lease.Lease,
+	resyncPeriod int) error {
+	if !flannelIPv4Net.Empty() {
+		log.Infof("nftables: setting up masquerade rules (ipv4)")
+		tx := nftm.nftv4.NewTransaction()
+
+		tx.Add(&knftables.Chain{
+			Name:     postrtgChain,
+			Comment:  knftables.PtrTo("chain to manage traffic masquerading by flannel"),
+			Type:     knftables.PtrTo(knftables.NATType),
+			Hook:     knftables.PtrTo(knftables.PostroutingHook),
+			Priority: knftables.PtrTo(knftables.SNATPriority),
+		})
+		// make sure that the chain is empty before adding our rules
+		// => no need for the check and recycle part of iptables.go
+		tx.Flush(&knftables.Chain{
+			Name: postrtgChain,
+		})
+		err := nftm.addMasqRules(ctx, tx, flannelIPv4Net.String(), currentlease.Subnet.String(), knftables.IPv4Family)
+		if err != nil {
+			return fmt.Errorf("nftables: couldn't setup masq rules: %v", err)
+		}
+		err = nftm.nftv4.Run(ctx, tx)
+		if err != nil {
+			return fmt.Errorf("nftables: couldn't setup masq rules: %v", err)
+		}
+	}
+	if !flannelIPv6Net.Empty() {
+		log.Infof("nftables: setting up masquerade rules (ipv6)")
+		tx := nftm.nftv6.NewTransaction()
+
+		tx.Add(&knftables.Chain{
+			Name:     postrtgChain,
+			Comment:  knftables.PtrTo("chain to manage traffic masquerading by flannel"),
+			Type:     knftables.PtrTo(knftables.NATType),
+			Hook:     knftables.PtrTo(knftables.PostroutingHook),
+			Priority: knftables.PtrTo(knftables.SNATPriority),
+		})
+		// make sure that the chain is empty before adding our rules
+		// => no need for the check and recycle part of iptables.go
+		tx.Flush(&knftables.Chain{
+			Name: postrtgChain,
+		})
+		err := nftm.addMasqRules(ctx, tx, flannelIPv6Net.String(), currentlease.IPv6Subnet.String(), knftables.IPv6Family)
+		if err != nil {
+			return fmt.Errorf("nftables: couldn't setup masq rules: %v", err)
+		}
+		err = nftm.nftv6.Run(ctx, tx)
+		if err != nil {
+			return fmt.Errorf("nftables: couldn't setup masq rules: %v", err)
+		}
+	}
+	return nil
+}
+
+// add the required masquerade rules to transaction tx
+func (nftm *NFTablesManager) addMasqRules(ctx context.Context,
+	tx *knftables.Transaction,
+	clusterCidr, podCidr string,
+	family knftables.Family) error {
+	masquerade := "masquerade fully-random"
+	if !nftm.checkRandomfully(ctx) {
+		masquerade = "masquerade"
+	}
+
+	multicastCidr := "224.0.0.0/4"
+	if family == knftables.IPv6Family {
+		multicastCidr = "ff00::/8"
+	}
+	// This rule will not masquerade traffic marked
+	// by the kube-proxy to avoid double NAT bug on some kernel version
+	tx.Add(&knftables.Rule{
+		Chain: postrtgChain,
+		Rule: knftables.Concat(
+			"meta mark", "0x4000", //TODO_TF: check the correct value when
deploying kube-proxy + "return", + ), + }) + // don't NAT traffic within overlay network + tx.Add(&knftables.Rule{ + Chain: postrtgChain, + Rule: knftables.Concat( + family, "saddr", podCidr, + family, "daddr", clusterCidr, + "return", + ), + }) + tx.Add(&knftables.Rule{ + Chain: postrtgChain, + Rule: knftables.Concat( + family, "saddr", clusterCidr, + family, "daddr", podCidr, + "return", + ), + }) + // Prevent performing Masquerade on external traffic which arrives from a Node that owns the container/pod IP address + tx.Add(&knftables.Rule{ + Chain: postrtgChain, + Rule: knftables.Concat( + family, "saddr", "!=", podCidr, + family, "daddr", clusterCidr, + "return", + ), + }) + // NAT if it's not multicast traffic + tx.Add(&knftables.Rule{ + Chain: postrtgChain, + Rule: knftables.Concat( + family, "saddr", clusterCidr, + family, "daddr", "!=", multicastCidr, + masquerade, + ), + }) + // Masquerade anything headed towards flannel from the host + tx.Add(&knftables.Rule{ + Chain: postrtgChain, + Rule: knftables.Concat( + family, "saddr", "!=", clusterCidr, + family, "daddr", clusterCidr, + masquerade, + ), + }) + return nil +} + +// clean-up all nftables states created by flannel by deleting all related tables +func (nftm *NFTablesManager) cleanUp(ctx context.Context) error { + nft, err := knftables.New(knftables.IPv4Family, ipv4Table) + if err == nil { + tx := nft.NewTransaction() + tx.Delete(&knftables.Table{}) + err = nft.Run(ctx, tx) + } + if err != nil { + return fmt.Errorf("nftables: couldn't delete table: %v", err) + } + + nft, err = knftables.New(knftables.IPv6Family, ipv6Table) + if err == nil { + tx := nft.NewTransaction() + tx.Delete(&knftables.Table{}) + err = nft.Run(ctx, tx) + } + if err != nil { + return fmt.Errorf("nftables (ipv6): couldn't delete table: %v", err) + } + + return nil +} diff --git a/pkg/trafficmngr/nftables/nftables_windows.go b/pkg/trafficmngr/nftables/nftables_windows.go new file mode 100644 index 0000000000..b66034b5b7 --- /dev/null +++ b/pkg/trafficmngr/nftables/nftables_windows.go @@ -0,0 +1,48 @@ +// Copyright 2024 flannel authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
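Editor's aside: `cleanUp` above leans on a convenient nftables property that the iptables path lacks: deleting a table atomically removes every chain and rule inside it, so no per-rule bookkeeping is needed. A hedged sketch of that one-shot deletion, with a hypothetical table name:

```go
package main

import (
	"context"
	"log"

	"sigs.k8s.io/knftables"
)

// deleteTable drops an entire nftables table and, with it, every chain and
// rule created there; this is the nftables counterpart of teardownIPTables.
func deleteTable(ctx context.Context, family knftables.Family, name string) error {
	nft, err := knftables.New(family, name)
	if err != nil {
		return err
	}
	tx := nft.NewTransaction()
	// An empty Table object refers to the table this interface is bound to.
	tx.Delete(&knftables.Table{})
	return nft.Run(ctx, tx)
}

func main() {
	if err := deleteTable(context.TODO(), knftables.IPv4Family, "demo-table"); err != nil {
		log.Printf("delete failed: %v", err)
	}
}
```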
+
+package nftables
+
+import (
+	"context"
+	"sync"
+
+	log "k8s.io/klog/v2"
+
+	"github.com/flannel-io/flannel/pkg/ip"
+	"github.com/flannel-io/flannel/pkg/lease"
+	"github.com/flannel-io/flannel/pkg/trafficmngr"
+)
+
+type NFTablesManager struct {
+}
+
+func (nftm *NFTablesManager) Init(ctx context.Context, wg *sync.WaitGroup) error {
+	log.Error(trafficmngr.ErrUnimplemented)
+	return nil
+}
+
+func (nftm *NFTablesManager) SetupAndEnsureForwardRules(ctx context.Context,
+	flannelIPv4Network ip.IP4Net, flannelIPv6Network ip.IP6Net, resyncPeriod int) {
+}
+
+func (nftm *NFTablesManager) SetupAndEnsureMasqRules(ctx context.Context, flannelIPv4Net, prevSubnet ip.IP4Net,
+	prevNetworks []ip.IP4Net,
+	flannelIPv6Net, prevIPv6Subnet ip.IP6Net,
+	prevIPv6Networks []ip.IP6Net,
+	currentlease *lease.Lease,
+	resyncPeriod int) error {
+	log.Error(trafficmngr.ErrUnimplemented)
+	return nil
+}
diff --git a/pkg/trafficmngr/nftables/utils.go b/pkg/trafficmngr/nftables/utils.go
new file mode 100644
index 0000000000..fa3f269361
--- /dev/null
+++ b/pkg/trafficmngr/nftables/utils.go
@@ -0,0 +1,58 @@
+// Copyright 2024 flannel authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//go:build !windows
+// +build !windows
+
+package nftables
+
+import (
+	"context"
+
+	log "k8s.io/klog/v2"
+	"sigs.k8s.io/knftables"
+)
+
+const (
+	masqueradeTestChain = "masqueradeTest"
+)
+
+// check whether masquerade fully-random is supported by the kernel
+func (nftm *NFTablesManager) checkRandomfully(ctx context.Context) bool {
+	result := true
+	tx := nftm.nftv4.NewTransaction()
+	tx.Add(&knftables.Chain{
+		Name:     masqueradeTestChain,
+		Comment:  knftables.PtrTo("chain to test if masquerade random fully is supported"),
+		Type:     knftables.PtrTo(knftables.NATType),
+		Hook:     knftables.PtrTo(knftables.PostroutingHook),
+		Priority: knftables.PtrTo(knftables.SNATPriority),
+	})
+	tx.Flush(&knftables.Chain{
+		Name: masqueradeTestChain,
+	})
+	// dummy rule used only to probe whether fully-random is accepted
+	tx.Add(&knftables.Rule{
+		Chain: masqueradeTestChain,
+		Rule: knftables.Concat(
+			"ip saddr", "!=", "127.0.0.1",
+			"masquerade fully-random",
+		),
+	})
+	err := nftm.nftv4.Check(ctx, tx)
+	if err != nil {
+		log.Warningf("nftables: masquerade fully-random unsupported")
+		result = false
+	}
+	return result
+}
diff --git a/pkg/trafficmngr/trafficmngr.go b/pkg/trafficmngr/trafficmngr.go
index 7ae2ef7ae0..99f401c4c5 100644
--- a/pkg/trafficmngr/trafficmngr.go
+++ b/pkg/trafficmngr/trafficmngr.go
@@ -15,6 +15,10 @@
 package trafficmngr
 
 import (
+	"context"
+	"errors"
+	"sync"
+
 	"github.com/flannel-io/flannel/pkg/ip"
 	"github.com/flannel-io/flannel/pkg/lease"
 )
@@ -26,19 +30,27 @@ type IPTablesRule struct {
 	Rulespec []string
 }
 
+var (
+	ErrUnimplemented = errors.New("unimplemented")
+)
+
+const KubeProxyMark string = "0x4000/0x4000"
+
 type TrafficManager interface {
+	// Initialize the TrafficManager, including the goroutine to clean up when flanneld is closed
+	Init(ctx context.Context, wg *sync.WaitGroup) error
 	// Install kernel rules to
forward the traffic to and from the flannel network range. // This is done for IPv4 and/or IPv6 based on whether flannelIPv4Network and flannelIPv6Network are set. // SetupAndEnsureForwardRules starts a go routine that // rewrites these rules every resyncPeriod seconds if needed - SetupAndEnsureForwardRules(flannelIPv4Network ip.IP4Net, flannelIPv6Network ip.IP6Net, resyncPeriod int) + SetupAndEnsureForwardRules(ctx context.Context, flannelIPv4Network ip.IP4Net, flannelIPv6Network ip.IP6Net, resyncPeriod int) // Install kernel rules to setup NATing of packets sent to the flannel interface // This is done for IPv4 and/or IPv6 based on whether flannelIPv4Network and flannelIPv6Network are set. // prevSubnet,prevNetworks, prevIPv6Subnet, prevIPv6Networks are used // to determine whether the existing rules need to be replaced. // SetupAndEnsureMasqRules starts a go routine that // rewrites these rules every resyncPeriod seconds if needed - SetupAndEnsureMasqRules( + SetupAndEnsureMasqRules(ctx context.Context, flannelIPv4Net, prevSubnet ip.IP4Net, prevNetworks []ip.IP4Net, flannelIPv6Net, prevIPv6Subnet ip.IP6Net,
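Editor's closing aside: the excerpt is cut off mid-signature above; the full `SetupAndEnsureMasqRules` parameter list is visible in the windows stubs earlier in the patch. For reference, a hedged sketch of a minimal no-op implementation of the extended `TrafficManager` contract (the `NoopManager` name is hypothetical, e.g. for tests, and the real interface may declare methods not shown in this excerpt):

```go
package noop

import (
	"context"
	"sync"

	"github.com/flannel-io/flannel/pkg/ip"
	"github.com/flannel-io/flannel/pkg/lease"
)

// NoopManager satisfies the TrafficManager contract without touching the
// kernel, which can be handy when exercising flanneld wiring in tests.
type NoopManager struct{}

// Init would normally register a clean-up goroutine on wg; nothing to do here.
func (m *NoopManager) Init(ctx context.Context, wg *sync.WaitGroup) error { return nil }

func (m *NoopManager) SetupAndEnsureForwardRules(ctx context.Context,
	flannelIPv4Network ip.IP4Net, flannelIPv6Network ip.IP6Net, resyncPeriod int) {
}

func (m *NoopManager) SetupAndEnsureMasqRules(ctx context.Context, flannelIPv4Net, prevSubnet ip.IP4Net,
	prevNetworks []ip.IP4Net,
	flannelIPv6Net, prevIPv6Subnet ip.IP6Net,
	prevIPv6Networks []ip.IP6Net,
	currentlease *lease.Lease,
	resyncPeriod int) error {
	return nil
}
```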