Skip to content

Commit

Permalink
Fix return path of NodePort traffic.
Browse files Browse the repository at this point in the history
Add iptables and routing rules that

- connmark traffic that arrives at the host over eth0
- restore the mark when the traffic leaves a pod veth
- force marked traffic to use the main routing table so that it
  exits via eth0.

Configure eth0 RPF check for "loose" filtering to prevent
NodePort traffic from being blocked due to incorrect reverse
path lookup in the kernel.  (The kernel is unable to undo the
NAT as part of its RPF check so it calculates the incorrect
reverse route.)

Add diagnostics for env var configuration and sysctls.

Fixes #75
  • Loading branch information
fasaxc committed Aug 2, 2018
1 parent 8a454e3 commit 2cce7de
Show file tree
Hide file tree
Showing 4 changed files with 411 additions and 45 deletions.
18 changes: 16 additions & 2 deletions ipamd/introspect.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
log "github.com/cihub/seelog"
"github.com/prometheus/client_golang/prometheus/promhttp"

"github.com/aws/amazon-vpc-cni-k8s/pkg/networkutils"
"github.com/aws/amazon-vpc-cni-k8s/pkg/utils"
)

Expand Down Expand Up @@ -68,8 +69,9 @@ func (c *IPAMContext) SetupHTTP() {

func (c *IPAMContext) setupServer() *http.Server {
serverFunctions := map[string]func(w http.ResponseWriter, r *http.Request){
"/v1/enis": eniV1RequestHandler(c),
"/v1/pods": podV1RequestHandler(c),
"/v1/enis": eniV1RequestHandler(c),
"/v1/pods": podV1RequestHandler(c),
"/v1/env-settings": envV1RequestHandler(c),
}
paths := make([]string, 0, len(serverFunctions))
for path := range serverFunctions {
Expand Down Expand Up @@ -132,6 +134,18 @@ func podV1RequestHandler(ipam *IPAMContext) func(http.ResponseWriter, *http.Requ
}
}

func envV1RequestHandler(ipam *IPAMContext) func(http.ResponseWriter, *http.Request) {
return func(w http.ResponseWriter, r *http.Request) {
responseJSON, err := json.Marshal(networkutils.GetConfigForDebug())
if err != nil {
log.Error("Failed to marshal env var data: %v", err)
http.Error(w, http.StatusText(http.StatusInternalServerError), http.StatusInternalServerError)
return
}
w.Write(responseJSON)
}
}

func metricsHandler(ipam *IPAMContext) func(http.ResponseWriter, *http.Request) {
return func(w http.ResponseWriter, r *http.Request) {
promhttp.Handler()
Expand Down
254 changes: 217 additions & 37 deletions pkg/networkutils/network.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@
package networkutils

import (
"fmt"
"io"
"math"
"net"
"os"
"strconv"
Expand Down Expand Up @@ -47,8 +50,26 @@ const (

// This environment is used to specify whether an external NAT gateway will be used to provide SNAT of
// secondary ENI IP addresses. If set to "true", the SNAT iptables rule and off-VPC ip rule will not
// be installed and will be removed if they are already installed.
// be installed and will be removed if they are already installed. Defaults to false.
envExternalSNAT = "AWS_VPC_K8S_CNI_EXTERNALSNAT"

// envNodePortSupport is the name of environment variable that configures whether we implement support for
// NodePorts on the primary ENI. This requires that we add additional iptables rules and loosen the kernel's
// RPF check as described below. Defaults to true.
envNodePortSupport = "AWS_VPC_CNI_NODE_PORT_SUPPORT"

// envConnmark is the name of the environment variable that overrides the default connection mark, used to
// mark traffic coming from the primary ENI so that return traffic can be forced out of the same interface.
// Without using a mark, NodePort DNAT and our source-based routing do not work together if the target pod
// behind the node port is not on the main ENI. In that case, the un-DNAT is done after the source-based
// routing, resulting in the packet being sent out of the pod's ENI, when the NodePort traffic should be
// sent over the main ENI.
envConnmark = "AWS_VPC_K8S_CNI_CONNMARK"

// defaultConnmark is the default value for the connmark described above. Note: the mark space is a little crowded,
// - kube-proxy uses 0x0000c000
// - Calico uses 0xffff0000.
defaultConnmark = 0x80
)

// NetworkAPIs defines the host level and the eni level network related operations
Expand All @@ -60,14 +81,45 @@ type NetworkAPIs interface {
}

type linuxNetwork struct {
netLink netlinkwrapper.NetLink
ns nswrapper.NS
useExternalSNAT bool
nodePortSupportEnabled bool
connmark uint32

netLink netlinkwrapper.NetLink
ns nswrapper.NS
newIptables func() (iptablesIface, error)
mainENIMark uint32
openFile func(name string, flag int, perm os.FileMode) (stringWriteCloser, error)
}

type iptablesIface interface {
Exists(table, chain string, rulespec ...string) (bool, error)
Append(table, chain string, rulespec ...string) error
Delete(table, chain string, rulespec ...string) error
}

// New creates a linuxNetwork object
func New() NetworkAPIs {
return &linuxNetwork{netLink: netlinkwrapper.NewNetLink(),
ns: nswrapper.NewNS()}
return &linuxNetwork{
useExternalSNAT: useExternalSNAT(),
nodePortSupportEnabled: nodePortSupportEnabled(),
mainENIMark: getConnmark(),

netLink: netlinkwrapper.NewNetLink(),
ns: nswrapper.NewNS(),
newIptables: func() (iptablesIface, error) {
ipt, err := iptables.New()
return ipt, err
},
openFile: func(name string, flag int, perm os.FileMode) (stringWriteCloser, error) {
return os.OpenFile(name, flag, perm)
},
}
}

type stringWriteCloser interface {
io.Closer
WriteString(s string) (int, error)
}

func isDuplicateRuleAdd(err error) bool {
Expand All @@ -76,85 +128,213 @@ func isDuplicateRuleAdd(err error) bool {

// SetupHostNetwork performs node level network configuration
// TODO : implement ip rule not to 10.0.0.0/16(vpc'subnet) table main priority 1024
func (os *linuxNetwork) SetupHostNetwork(vpcCIDR *net.IPNet, primaryAddr *net.IP) error {

externalSNAT := useExternalSNAT()
hostRule := os.netLink.NewRule()
func (n *linuxNetwork) SetupHostNetwork(vpcCIDR *net.IPNet, primaryAddr *net.IP) error {
log.Info("Setting up host network")
hostRule := n.netLink.NewRule()
hostRule.Dst = vpcCIDR
hostRule.Table = mainRoutingTable
hostRule.Priority = hostRulePriority
hostRule.Invert = true

// If this is a restart, cleanup previous rule first
err := os.netLink.RuleDel(hostRule)
err := n.netLink.RuleDel(hostRule)
if err != nil && !containsNoSuchRule(err) {
log.Errorf("Failed to cleanup old host IP rule: %v", err)
return errors.Wrapf(err, "host network setup: failed to delete old host rule")
}

// Only include the rule if SNAT is not being handled by an external NAT gateway and needs to be
// handled on-node.
if !externalSNAT {
err = os.netLink.RuleAdd(hostRule)
if !n.useExternalSNAT {
err = n.netLink.RuleAdd(hostRule)
if err != nil {
log.Errorf("Failed to add host IP rule: %v", err)
return errors.Wrapf(err, "host network setup: failed to add host rule")
}
}

ipt, err := iptables.New()
if n.nodePortSupportEnabled {
// If node port support is enabled, configure the kernel's reverse path filter check on eth0 for "loose"
// filtering. This is required because
// - NodePorts are exposed on eth0
// - The kernel's RPF check happens after incoming packets to NodePorts are DNATted to the pod IP.
// - For pods assigned to secondary ENIs, the routing table includes source-based routing. When the kernel does
// the RPF check, it looks up the route using the pod IP as the source.
// - Thus, it finds the source-based route that leaves via the secondary ENI.
// - In "strict" mode, the RPF check fails because the return path uses a different interface to the incoming
// packet. In "loose" mode, the check passes because some route was found.
const eth0RPFilter = "/proc/sys/net/ipv4/conf/eth0/rp_filter"
const rpFilterLoose = "2"
err := n.setProcSys(eth0RPFilter, rpFilterLoose)
if err != nil {
return errors.Wrapf(err, "failed to configure eth0 RPF check")
}
}

if err != nil {
return errors.Wrap(err, "host network setup: failed to create iptables")
// If node port support is enabled, add a rule that will force force marked traffic out of the main ENI. We then
// add iptables rules below that will mark traffic that needs this special treatment. In particular NodePort
// traffic always comes in via the main ENI but response traffic would go out of the pod's assigned ENI if we
// didn't handle it specially. This is because the routing decision is done before the NodePort's DNAT is
// reversed so, to the routing table, it looks like the traffic is pod traffic instead of NodePort traffic.
mainENIRule := n.netLink.NewRule()
mainENIRule.Mark = int(n.mainENIMark)
mainENIRule.Mask = int(n.mainENIMark)
mainENIRule.Table = mainRoutingTable
mainENIRule.Priority = hostRulePriority
// If this is a restart, cleanup previous rule first
err = n.netLink.RuleDel(mainENIRule)
if err != nil && !containsNoSuchRule(err) {
log.Errorf("Failed to cleanup old main ENI rule: %v", err)
return errors.Wrapf(err, "host network setup: failed to delete old main ENI rule")
}

natCmd := []string{"!", "-d", vpcCIDR.String(), "-m", "comment", "--comment", "AWS, SNAT",
"-m", "addrtype", "!", "--dst-type", "LOCAL", "-j", "SNAT", "--to-source", primaryAddr.String()}
exists, err := ipt.Exists("nat", "POSTROUTING", natCmd...)
if n.nodePortSupportEnabled {
err = n.netLink.RuleAdd(mainENIRule)
if err != nil {
log.Errorf("Failed to add host main ENI rule: %v", err)
return errors.Wrapf(err, "host network setup: failed to add main ENI rule")
}
}

ipt, err := n.newIptables()

if err != nil {
return errors.Wrapf(err, "host network setup: failed to add POSTROUTING rule for primary address %s", primaryAddr)
return errors.Wrap(err, "host network setup: failed to create iptables")
}

if !exists && !externalSNAT {
// We are handling SNAT on-node, so include the iptables SNAT POSTROUTING rule.
err = ipt.Append("nat", "POSTROUTING", natCmd...)

for _, rule := range []iptablesRule{
{
name: "connmark for primary ENI",
shouldExist: n.nodePortSupportEnabled,
table: "mangle",
chain: "PREROUTING",
rule: []string{
"-m", "comment", "--comment", "AWS, primary ENI",
"-i", "eth0",
"-m", "addrtype", "--dst-type", "LOCAL", "--limit-iface-in",
"-j", "CONNMARK", "--set-mark", fmt.Sprintf("%#x/%#x", n.mainENIMark, n.mainENIMark),
},
},
{
name: "connmark restore for primary ENI",
shouldExist: n.nodePortSupportEnabled,
table: "mangle",
chain: "PREROUTING",
rule: []string{
"-m", "comment", "--comment", "AWS, primary ENI",
"-i", "eni+", "-j", "CONNMARK", "--restore-mark", "--mask", fmt.Sprintf("%#x", n.mainENIMark),
},
},
{
name: fmt.Sprintf("rule for primary address %s", primaryAddr),
shouldExist: !n.useExternalSNAT,
table: "nat",
chain: "POSTROUTING",
rule: []string{
"!", "-d", vpcCIDR.String(),
"-m", "comment", "--comment", "AWS, SNAT",
"-m", "addrtype", "!", "--dst-type", "LOCAL",
"-j", "SNAT", "--to-source", primaryAddr.String()},
},
} {
exists, err := ipt.Exists(rule.table, rule.chain, rule.rule...)
if err != nil {
return errors.Wrapf(err, "host network setup: failed to append POSTROUTING rule for primary address %s", primaryAddr)
return errors.Wrapf(err, "host network setup: failed to check existence of %v", rule)
}
} else if exists && externalSNAT {
// We are not handling SNAT on-node, so delete the existing iptables SNAT POSTROUTING rule.
err = ipt.Delete("nat", "POSTROUTING", natCmd...)

if err != nil {
return errors.Wrapf(err, "host network setup: failed to delete POSTROUTING rule for primary address %s", primaryAddr)
if !exists && rule.shouldExist {
err = ipt.Append(rule.table, rule.chain, rule.rule...)
if err != nil {
return errors.Wrapf(err, "host network setup: failed to add %v", rule)
}
} else if exists && !rule.shouldExist {
err = ipt.Delete(rule.table, rule.chain, rule.rule...)
if err != nil {
return errors.Wrapf(err, "host network setup: failed to delete %v", rule)
}
}
}

return nil
}

func (n *linuxNetwork) setProcSys(key, value string) error {
f, err := n.openFile(key, os.O_WRONLY, 0644)
if err != nil {
return err
}
defer f.Close()
_, err = f.WriteString(value)
if err != nil {
return err
}
return nil
}

type iptablesRule struct {
name string
shouldExist bool
table, chain string
rule []string
}

func (r iptablesRule) String() string {
return fmt.Sprintf("%s/%s rule %s", r.table, r.chain, r.name)
}

func containsNoSuchRule(err error) bool {
if errno, ok := err.(syscall.Errno); ok {
return errno == syscall.ENOENT
}
return false
}

// GetConfigForDebug returns the active values of the configuration env vars (for debugging purposes).
func GetConfigForDebug() map[string]interface{} {
return map[string]interface{}{
envExternalSNAT: useExternalSNAT(),
envNodePortSupport: nodePortSupportEnabled(),
envConnmark: getConnmark(),
}
}

// useExternalSNAT returns whether SNAT of secondary ENI IPs should be handled with an external
// NAT gateway rather than on node. Failure to parse the setting will result in a log and the
// setting will be disabled.
func useExternalSNAT() bool {
if externalSNATStr := os.Getenv(envExternalSNAT); externalSNATStr != "" {
externalSNAT, err := strconv.ParseBool(externalSNATStr)
return getBoolEnvVar(envExternalSNAT, false)
}

func nodePortSupportEnabled() bool {
return getBoolEnvVar(envNodePortSupport, true)
}

func getBoolEnvVar(name string, defaultValue bool) bool {
if strValue := os.Getenv(name); strValue != "" {
parsedValue, err := strconv.ParseBool(strValue)
if err != nil {
log.Error("Failed to parse "+envExternalSNAT, err.Error())
return false
log.Error("Failed to parse "+name+"; using default: "+fmt.Sprint(defaultValue), err.Error())
return defaultValue
}
return externalSNAT
return parsedValue
}
return false
return defaultValue
}

func getConnmark() uint32 {
if connmark := os.Getenv(envConnmark); connmark != "" {
mark, err := strconv.ParseInt(connmark, 0, 64)
if err != nil {
log.Error("Failed to parse "+envConnmark+"; will use ", defaultConnmark, err.Error())
return defaultConnmark
}
if mark > math.MaxUint32 || mark <= 0 {
log.Error(""+envConnmark+" out of range; will use ", defaultConnmark)
return defaultConnmark
}
return uint32(mark)
}
return defaultConnmark
}

// LinkByMac returns linux netlink based on interface MAC
Expand All @@ -177,8 +357,8 @@ func LinkByMac(mac string, netLink netlinkwrapper.NetLink) (netlink.Link, error)
}

// SetupENINetwork adds default route to route table (eni-<eni_table>)
func (os *linuxNetwork) SetupENINetwork(eniIP string, eniMAC string, eniTable int, eniSubnetCIDR string) error {
return setupENINetwork(eniIP, eniMAC, eniTable, eniSubnetCIDR, os.netLink)
func (n *linuxNetwork) SetupENINetwork(eniIP string, eniMAC string, eniTable int, eniSubnetCIDR string) error {
return setupENINetwork(eniIP, eniMAC, eniTable, eniSubnetCIDR, n.netLink)
}

func setupENINetwork(eniIP string, eniMAC string, eniTable int, eniSubnetCIDR string, netLink netlinkwrapper.NetLink) error {
Expand Down
Loading

0 comments on commit 2cce7de

Please sign in to comment.