Skip to content

Commit

Permalink
Implement nftables masquerading for flannel
Browse files Browse the repository at this point in the history
This PR allows flannel to use nftables natively instead of iptables.
This is used essentially to masquerade traffic coming from the pods.

The PR also fixes the clean-up mechanism in the iptables implementation.
  • Loading branch information
thomasferrandiz committed Mar 18, 2024
1 parent 4a496c9 commit 0e8ae59
Show file tree
Hide file tree
Showing 17 changed files with 744 additions and 71 deletions.
22 changes: 22 additions & 0 deletions Documentation/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ The value of the config is a JSON dictionary with the following keys:
* `EnableIPv6` (bool): Enables ipv6 support
Defaults to `false`

* `EnableNFTables` (bool): (EXPERIMENTAL) If set to true, flannel uses nftables instead of iptables to masquerade the traffic.
Defaults to `false`

* `SubnetLen` (integer): The size of the subnet allocated to each host.
Defaults to 24 (i.e. /24) unless `Network` was configured to be smaller than a /22 in which case it is two less than the network.

Expand Down Expand Up @@ -128,3 +131,22 @@ FLANNEL_IPMASQ=true
## IPv6 only
To use an IPv6-only environment use the same configuration of the Dual-stack section to enable IPv6 and add "EnableIPv4": false in the net-conf.json of the kube-flannel-cfg ConfigMap. In case of IPv6-only setup, please use the docker.io IPv6-only endpoint as described in the following link: https://www.docker.com/blog/beta-ipv6-support-on-docker-hub-registry/
## nftables mode
To enable `nftables` mode in flannel, set `EnableNFTables` to `true` in the flannel configuration.
Note: to test with kube-proxy, use kubeadm with the following configuration:
```yaml
apiVersion: kubeadm.k8s.io/v1beta3
kind: ClusterConfiguration
kubernetesVersion: v1.29.0
controllerManager:
  extraArgs:
    feature-gates: NFTablesProxyMode=true
---
apiVersion: kubeproxy.config.k8s.io/v1alpha1
kind: KubeProxyConfiguration
mode: "nftables"
featureGates:
  NFTablesProxyMode: true
```
1 change: 1 addition & 0 deletions Documentation/kube-flannel.yml
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ data:
net-conf.json: |
{
"Network": "10.244.0.0/16",
"EnableNFTables": false,
"Backend": {
"Type": "vxlan"
}
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ QEMU_VERSION=v3.0.0
BASH_UNIT_VERSION=v2.3.0

# Default tag and architecture. Can be overridden
TAG?=$(shell git describe --tags --always)
TAG?=$(shell git describe --tags --dirty --always)
ARCH?=amd64
# Only enable CGO (and build the UDP backend) on AMD64
ifeq ($(ARCH),amd64)
Expand Down
1 change: 1 addition & 0 deletions e2e/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ RUN set -x \
curl \
tar gzip\
iptables \
nftables \
iproute2 \
iputils \
&& if [ "${ARCH?required}" != "amd64" ]; then \
Expand Down
123 changes: 120 additions & 3 deletions e2e/run-e2e-tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,12 @@ EOF

write-flannel-conf(){
local backend=$1
local enable_nftables=$2
cp ../Documentation/kube-flannel.yml ./kube-flannel.yml
yq -i 'select(.kind == "DaemonSet").spec.template.spec.containers[0].image |= strenv(FLANNEL_IMAGE)' ./kube-flannel.yml
yq -i 'select(.kind == "DaemonSet").spec.template.spec.initContainers[1].image |= strenv(FLANNEL_IMAGE)' ./kube-flannel.yml

export flannel_conf="{ \"Network\": \"$FLANNEL_NET\", \"Backend\": { \"Type\": \"${backend}\" } }"
export flannel_conf="{ \"Network\": \"$FLANNEL_NET\", \"Backend\": { \"Type\": \"${backend}\" }, \"EnableNFTables\": ${enable_nftables} }"

yq -i 'select(.metadata.name == "kube-flannel-cfg").data."net-conf.json" |= strenv(flannel_conf)' ./kube-flannel.yml

Expand All @@ -55,10 +56,11 @@ write-flannel-conf(){
# This is not used at the moment since github runners don't support dual-stack networking
write-flannel-conf-dual-stack(){
local backend=$1
local enable_nftables=$2
cp ../Documentation/kube-flannel.yml ./kube-flannel.yml
yq -i 'select(.kind == "DaemonSet").spec.template.spec.containers[0].image |= strenv(FLANNEL_IMAGE)' ./kube-flannel.yml

export flannel_conf="{ \"EnableIPv6\": true, \"Network\": \"$FLANNEL_NET\", \"IPv6Network\":\"${FLANNEL_IP6NET}\", \"Backend\": { \"Type\": \"${backend}\" } }"
export flannel_conf="{ \"EnableIPv6\": true, \"Network\": \"$FLANNEL_NET\", \"IPv6Network\":\"${FLANNEL_IP6NET}\", \"Backend\": { \"Type\": \"${backend}\" }, \"EnableNFTables\": ${enable_nftables} }"

yq -i 'select(.metadata.name == "kube-flannel-cfg").data."net-conf.json" |= strenv(flannel_conf)' ./kube-flannel.yml
}
Expand All @@ -67,6 +69,10 @@ install-flannel() {
kubectl --kubeconfig="${HOME}/.kube/config" apply -f ./kube-flannel.yml
}

# Delete every resource declared in the generated ./kube-flannel.yml manifest,
# using the kubeconfig stored under the current user's home directory.
# Counterpart of install-flannel, which applies the same manifest.
delete-flannel() {
    local kubeconfig_path="${HOME}/.kube/config"
    kubectl --kubeconfig="${kubeconfig_path}" delete -f ./kube-flannel.yml
}

get_pod_ip() {
local pod_name=$1
kubectl --kubeconfig="${HOME}/.kube/config" get pod ${pod_name} --template '{{.status.podIP}}'
Expand Down Expand Up @@ -125,8 +131,9 @@ perf() {

prepare_test() {
local backend=$1
local enable_nftables=${2:-false}
# install flannel version to test
write-flannel-conf ${backend}
write-flannel-conf ${backend} ${enable_nftables}

install-flannel
# wait for nodes to be ready
Expand All @@ -150,32 +157,50 @@ test_vxlan() {
prepare_test vxlan
pings
check_iptables
delete-flannel
check_iptables_removed
}

# e2e test: vxlan backend with nftables mode enabled.
# The second argument to prepare_test ("true") is forwarded as the
# EnableNFTables value of the generated flannel configuration.
# Order matters: the rules are asserted while flannel is running, then flannel
# is deleted and the clean-up of those rules is asserted.
test_vxlan_nft() {
prepare_test vxlan true
pings
check_nftables
delete-flannel
check_nftables_removed
}

# e2e test: wireguard backend in iptables mode (prepare_test defaults its
# nftables argument to false when it is omitted).
# Asserts the iptables rules while flannel runs, then deletes flannel and
# asserts that the rules have been cleaned up.
test_wireguard() {
prepare_test wireguard
pings
check_iptables
delete-flannel
check_iptables_removed
}

# e2e test: host-gw backend in iptables mode (prepare_test defaults its
# nftables argument to false when it is omitted).
# Asserts the iptables rules while flannel runs, then deletes flannel and
# asserts that the rules have been cleaned up.
test_host-gw() {
prepare_test host-gw
pings
check_iptables
delete-flannel
check_iptables_removed
}

# The udp backend test is only defined when ARCH is amd64; on any other
# architecture the function does not exist, so the test is never run.
if [[ ${ARCH} == "amd64" ]]; then
# e2e test: udp backend in iptables mode (prepare_test defaults its nftables
# argument to false when it is omitted).
# Asserts the iptables rules while flannel runs, then deletes flannel and
# asserts that the rules have been cleaned up.
test_udp() {
prepare_test udp
pings
check_iptables
delete-flannel
check_iptables_removed
}
fi

# e2e test: ipip backend in iptables mode (prepare_test defaults its nftables
# argument to false when it is omitted).
# Asserts the iptables rules while flannel runs, then deletes flannel and
# asserts that the rules have been cleaned up.
test_ipip() {
prepare_test ipip
pings
check_iptables
delete-flannel
check_iptables_removed
}

test_perf_vxlan() {
Expand Down Expand Up @@ -260,3 +285,95 @@ $(docker exec --privileged local-worker /usr/sbin/iptables -t filter -S FLANNEL-
"$(docker exec --privileged local-leader /usr/sbin/iptables -t filter -S FORWARD)
$(docker exec --privileged local-leader /usr/sbin/iptables -t filter -S FLANNEL-FWD)" "Host 2 has not expected forward rules"
}

# Assert that flannel's iptables rules are gone from both nodes once flannel
# has been deleted.
#
# Expected state on each node (local-worker and local-leader):
#   * nat table: no POSTROUTING rule mentions FLANNEL, and the FLANNEL-POSTRTG
#     chain is still declared but empty (only its "-N" line is listed);
#   * filter table: FORWARD holds only the kube-proxy rules listed below, and
#     the FLANNEL-FWD chain is still declared but empty.
#
# The expected rule sets do not depend on the pod CIDRs, so no pod CIDR lookup
# is performed here.
#
# Every iptables call uses "-w 5" so that a transient xtables lock held by
# another process makes the command wait instead of returning empty output
# and failing the comparison.
check_iptables_removed() {
    read -r -d '' POSTROUTING_RULES_WORKER << EOM
-N FLANNEL-POSTRTG
EOM
    read -r -d '' POSTROUTING_RULES_LEADER << EOM
-N FLANNEL-POSTRTG
EOM
    read -r -d '' FORWARD_RULES << EOM
-P FORWARD ACCEPT
-A FORWARD -m conntrack --ctstate NEW -m comment --comment "kubernetes load balancer firewall" -j KUBE-PROXY-FIREWALL
-A FORWARD -m comment --comment "kubernetes forwarding rules" -j KUBE-FORWARD
-A FORWARD -m conntrack --ctstate NEW -m comment --comment "kubernetes service portals" -j KUBE-SERVICES
-A FORWARD -m conntrack --ctstate NEW -m comment --comment "kubernetes externally-visible service portals" -j KUBE-EXTERNAL-SERVICES
-N FLANNEL-FWD
EOM
    # check that masquerade & forward rules have been removed
    assert_equals "$POSTROUTING_RULES_WORKER" \
        "$(docker exec --privileged local-worker /usr/sbin/iptables -t nat -S POSTROUTING -w 5 | grep FLANNEL)$(docker exec --privileged local-worker /usr/sbin/iptables -t nat -S FLANNEL-POSTRTG -w 5)" "Host 1 has not expected postrouting rules"
    assert_equals "$POSTROUTING_RULES_LEADER" \
        "$(docker exec --privileged local-leader /usr/sbin/iptables -t nat -S POSTROUTING -w 5 | grep FLANNEL)$(docker exec --privileged local-leader /usr/sbin/iptables -t nat -S FLANNEL-POSTRTG -w 5)" "Host 2 has not expected postrouting rules"
    assert_equals "$FORWARD_RULES" \
        "$(docker exec --privileged local-worker /usr/sbin/iptables -t filter -S FORWARD -w 5)
$(docker exec --privileged local-worker /usr/sbin/iptables -t filter -S FLANNEL-FWD -w 5)" "Host 1 has not expected forward rules"
    assert_equals "$FORWARD_RULES" \
        "$(docker exec --privileged local-leader /usr/sbin/iptables -t filter -S FORWARD -w 5)
$(docker exec --privileged local-leader /usr/sbin/iptables -t filter -S FLANNEL-FWD -w 5)" "Host 2 has not expected forward rules"
}

###nftables
# Assert that, in nftables mode, both nodes carry the expected flannel
# masquerade (postrtg) and forward chains in the "flannel-ipv4" table.
#
# The expected text is compared verbatim with the output of
# `nft list chain`, which nests chains and rules with tab characters; the
# heredocs below therefore use literal tabs and must not be re-indented.
#
# assumes the cluster network configured for the test is 10.42.0.0/16 — TODO
# confirm against FLANNEL_NET.
# NOTE(review): the "forward" chain is expected on the *input* hook; confirm
# this mirrors what flannel is meant to program.
check_nftables() {
    # Declare and assign separately so a failing lookup is not masked by `local`.
    local worker_podcidr leader_podcidr
    worker_podcidr=$(get_pod_cidr local-worker)
    leader_podcidr=$(get_pod_cidr local-leader)
    read -r -d '' POSTROUTING_RULES_WORKER << EOM
table ip flannel-ipv4 {
	chain postrtg {
		type nat hook postrouting priority srcnat; policy accept;
		meta mark 0x00004000 return
		ip saddr ${worker_podcidr} ip daddr 10.42.0.0/16 return
		ip saddr 10.42.0.0/16 ip daddr ${worker_podcidr} return
		ip saddr != ${worker_podcidr} ip daddr 10.42.0.0/16 return
		ip saddr 10.42.0.0/16 ip daddr != 224.0.0.0/4 masquerade fully-random
		ip saddr != 10.42.0.0/16 ip daddr 10.42.0.0/16 masquerade fully-random
	}
}
EOM
    read -r -d '' POSTROUTING_RULES_LEADER << EOM
table ip flannel-ipv4 {
	chain postrtg {
		type nat hook postrouting priority srcnat; policy accept;
		meta mark 0x00004000 return
		ip saddr ${leader_podcidr} ip daddr 10.42.0.0/16 return
		ip saddr 10.42.0.0/16 ip daddr ${leader_podcidr} return
		ip saddr != ${leader_podcidr} ip daddr 10.42.0.0/16 return
		ip saddr 10.42.0.0/16 ip daddr != 224.0.0.0/4 masquerade fully-random
		ip saddr != 10.42.0.0/16 ip daddr 10.42.0.0/16 masquerade fully-random
	}
}
EOM
    read -r -d '' FORWARD_RULES << EOM
table ip flannel-ipv4 {
	chain forward {
		type filter hook input priority filter; policy accept;
		ip saddr 10.42.0.0/16 accept
		ip daddr 10.42.0.0/16 accept
	}
}
EOM
    # check masquerade & forward rules
    assert_equals "$POSTROUTING_RULES_WORKER" \
        "$(docker exec --privileged local-worker /usr/sbin/nft list chain flannel-ipv4 postrtg)" "Node worker does not have expected postrouting rules"
    assert_equals "$POSTROUTING_RULES_LEADER" \
        "$(docker exec --privileged local-leader /usr/sbin/nft list chain flannel-ipv4 postrtg)" "Node leader does not have expected postrouting rules"
    assert_equals "$FORWARD_RULES" \
        "$(docker exec --privileged local-worker /usr/sbin/nft list chain flannel-ipv4 forward)" "Node worker does not have expected forward rules"
    assert_equals "$FORWARD_RULES" \
        "$(docker exec --privileged local-leader /usr/sbin/nft list chain flannel-ipv4 forward)" "Node leader does not have expected forward rules"
}

# Assert that flannel's nftables chains are gone from both nodes: listing the
# postrtg and forward chains of the flannel-ipv4 table must print nothing on
# stdout for local-worker and local-leader.
# Checks run in the order: postrtg on worker, postrtg on leader, forward on
# worker, forward on leader.
check_nftables_removed() {
    local nft_chain nft_label nft_node
    for nft_chain in postrtg forward; do
        # Human-readable chain name used in the failure message.
        case "${nft_chain}" in
            postrtg) nft_label="postrouting" ;;
            *) nft_label="forward" ;;
        esac
        for nft_node in worker leader; do
            assert_equals "" \
                "$(docker exec --privileged "local-${nft_node}" /usr/sbin/nft list chain flannel-ipv4 "${nft_chain}")" \
                "Node ${nft_node} has unexpected ${nft_label} rules"
        done
    done
}
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ require (
github.com/avast/retry-go/v4 v4.5.1
github.com/tencentcloud/tencentcloud-sdk-go/tencentcloud/common v1.0.872
github.com/tencentcloud/tencentcloud-sdk-go/tencentcloud/vpc v1.0.872
sigs.k8s.io/knftables v0.0.14
)

require (
Expand Down
4 changes: 4 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,8 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/lithammer/dedent v1.1.0 h1:VNzHMVCBNG1j0fh3OrsFRkVUwStdDArbgBWoPAffktY=
github.com/lithammer/dedent v1.1.0/go.mod h1:jrXYCQtgg0nJiN+StA2KgR7w6CiQNv9Fd/Z9BP0jIOc=
github.com/mailru/easyjson v0.0.0-20190614124828-94de47d64c63/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=
github.com/mailru/easyjson v0.0.0-20190626092158-b2ccc519800e/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=
github.com/mailru/easyjson v0.7.6 h1:8yTIVnZgCoiM1TgqoeTl+LfU5Jg6/xL3QhGQnimLYnA=
Expand Down Expand Up @@ -781,6 +783,8 @@ rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0=
rsc.io/sampler v1.3.0/go.mod h1:T1hPZKmBbMNahiBKFy5HrXp6adAjACjK9JXDnKaTXpA=
sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2 h1:iXTIw73aPyC+oRdyqqvVJuloN1p0AC/kzH07hu3NE+k=
sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0=
sigs.k8s.io/knftables v0.0.14 h1:VzKQoDMCGBOH8c85sGrWSXSPCS0XrIpEfOlcCLBXiC0=
sigs.k8s.io/knftables v0.0.14/go.mod h1:f/5ZLKYEUPUhVjUCg6l80ACdL7CIIyeL0DxfgojGRTk=
sigs.k8s.io/structured-merge-diff/v4 v4.2.3 h1:PRbqxJClWWYMNV1dhaG4NsibJbArud9kFxnAMREiWFE=
sigs.k8s.io/structured-merge-diff/v4 v4.2.3/go.mod h1:qjx8mGObPmV2aSZepjQjbmb2ihdVs8cGKBraizNC69E=
sigs.k8s.io/yaml v1.3.0 h1:a2VclLzOGrwOHDiV8EfBGhvjHvP46CtW5j6POvhYGGo=
Expand Down
2 changes: 1 addition & 1 deletion images/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ RUN export GOOS=$(xx-info os) &&\

FROM alpine:20231219
RUN apk update && apk upgrade
RUN apk add --no-cache iproute2 ca-certificates iptables strongswan iptables-legacy && update-ca-certificates
RUN apk add --no-cache iproute2 ca-certificates nftables iptables strongswan iptables-legacy && update-ca-certificates
RUN apk add wireguard-tools --no-cache --repository http://dl-cdn.alpinelinux.org/alpine/edge/community
COPY --from=build /build/dist/flanneld /opt/bin/flanneld
COPY dist/mk-docker-opts.sh /opt/bin/
Expand Down
27 changes: 22 additions & 5 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ import (
"github.com/flannel-io/flannel/pkg/subnet/kube"
"github.com/flannel-io/flannel/pkg/trafficmngr"
"github.com/flannel-io/flannel/pkg/trafficmngr/iptables"
"github.com/flannel-io/flannel/pkg/trafficmngr/nftables"
"github.com/flannel-io/flannel/pkg/version"
"golang.org/x/net/context"
log "k8s.io/klog/v2"
Expand Down Expand Up @@ -336,7 +337,15 @@ func main() {
}

// Create the TrafficManager and instantiate it based on whether we use iptables or nftables
trafficMngr := newTrafficManager()
trafficMngr := newTrafficManager(config.EnableNFTables)
err = trafficMngr.Init(ctx, &wg)
if err != nil {
log.Error(err)
cancel()
wg.Wait()
os.Exit(1)
}

flannelIPv4Net := ip.IP4Net{}
flannelIpv6Net := ip.IP6Net{}
if config.EnableIPv4 {
Expand Down Expand Up @@ -365,7 +374,8 @@ func main() {
prevIPv6Networks := ReadIP6CIDRsFromSubnetFile(opts.subnetFile, "FLANNEL_IPV6_NETWORK")
prevIPv6Subnet := ReadIP6CIDRFromSubnetFile(opts.subnetFile, "FLANNEL_IPV6_SUBNET")

err = trafficMngr.SetupAndEnsureMasqRules(flannelIPv4Net, prevSubnet,
err = trafficMngr.SetupAndEnsureMasqRules(ctx,
flannelIPv4Net, prevSubnet,
prevNetworks,
flannelIpv6Net, prevIPv6Subnet,
prevIPv6Networks,
Expand All @@ -383,7 +393,7 @@ func main() {
// In Docker 1.12 and earlier, the default FORWARD chain policy was ACCEPT.
// In Docker 1.13 and later, Docker sets the default policy of the FORWARD chain to DROP.
if opts.iptablesForwardRules {
trafficMngr.SetupAndEnsureForwardRules(
trafficMngr.SetupAndEnsureForwardRules(ctx,
flannelIPv4Net,
flannelIpv6Net,
opts.iptablesResyncSeconds)
Expand Down Expand Up @@ -569,6 +579,13 @@ func ReadIP6CIDRsFromSubnetFile(path string, CIDRKey string) []ip.IP6Net {
return prevCIDRs
}

func newTrafficManager() trafficmngr.TrafficManager {
return iptables.IPTablesManager{}
// newTrafficManager returns the TrafficManager implementation matching the
// requested mode: the nftables-backed manager when useNftables is true, the
// iptables-backed manager otherwise. The selected mode is logged. The returned
// manager is a zero value; the caller is expected to call Init on it.
func newTrafficManager(useNftables bool) trafficmngr.TrafficManager {
	if !useNftables {
		log.Info("Starting flannel in iptables mode")
		return &iptables.IPTablesManager{}
	}

	log.Info("Starting flannel in nftables mode")
	return &nftables.NFTablesManager{}
}
29 changes: 15 additions & 14 deletions pkg/subnet/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,20 +27,21 @@ import (
)

type Config struct {
EnableIPv4 bool
EnableIPv6 bool
Network ip.IP4Net
IPv6Network ip.IP6Net
Networks []ip.IP4Net
IPv6Networks []ip.IP6Net
SubnetMin ip.IP4
SubnetMax ip.IP4
IPv6SubnetMin *ip.IP6
IPv6SubnetMax *ip.IP6
SubnetLen uint
IPv6SubnetLen uint
BackendType string `json:"-"`
Backend json.RawMessage `json:",omitempty"`
EnableIPv4 bool
EnableIPv6 bool
EnableNFTables bool
Network ip.IP4Net
IPv6Network ip.IP6Net
Networks []ip.IP4Net
IPv6Networks []ip.IP6Net
SubnetMin ip.IP4
SubnetMax ip.IP4
IPv6SubnetMin *ip.IP6
IPv6SubnetMax *ip.IP6
SubnetLen uint
IPv6SubnetLen uint
BackendType string `json:"-"`
Backend json.RawMessage `json:",omitempty"`
}

func parseBackendType(be json.RawMessage) (string, error) {
Expand Down
Loading

0 comments on commit 0e8ae59

Please sign in to comment.