Skip to content

Commit

Permalink
Merge tag 'nf-24-09-26' of git://git.kernel.org/pub/scm/linux/kernel/…
Browse files Browse the repository at this point in the history
…git/netfilter/nf

Pablo Neira Ayuso says:

====================
Netfilter fixes for net

v2: with kdoc fixes per Paolo Abeni.

The following patchset contains Netfilter fixes for net:

Patches #1 and #2 handle an esoteric scenario: given two tasks sending UDP
packets to one another, two packets of the same flow, one in each direction,
are handled by different CPUs. This results in two conntrack objects in NEW
state, where the reply packet loses the race. Then, patch #3 adds a test case
for this scenario. Series from Florian Westphal.

1) NAT engine can falsely detect a port collision if it happens to pick
   up a reply packet as NEW rather than ESTABLISHED. Add extra code to
   detect this and suppress port reallocation in this case.

2) To complete the clash resolution in the reply direction, extend the conntrack
   logic to detect a conntrack that clashes, in the reply direction, with an
   existing entry.

3) Adds a test case.

Then, an assorted list of fixes follow:

4) Add a selftest for tproxy, from Antonio Ojea.

5) Guard ctnetlink_*_size() functions under
   #if defined(CONFIG_NETFILTER_NETLINK_GLUE_CT) || defined(CONFIG_NF_CONNTRACK_EVENTS)
   From Andy Shevchenko.

6) Use -m socket --transparent in iptables tproxy documentation.
   From XIE Zhibang.

7) Call kfree_rcu() when releasing flowtable hooks to address race with
   netlink dump path, from Phil Sutter.

8) Fix compilation warning in nf_reject with CONFIG_BRIDGE_NETFILTER=n.
   From Simon Horman.

9) Guard ctnetlink_label_size() under CONFIG_NF_CONNTRACK_EVENTS which
   is its only user, to address a compilation warning. From Simon Horman.

10) Use rcu-protected list iteration over basechain hooks from netlink
    dump path.

11) Fix memcg accounting for nf_tables: the use of GFP_KERNEL_ACCOUNT is not complete.

12) Remove old nfqueue conntrack clash resolution. Instead of trying to
    use the same destination address consistently, which requires double DNAT,
    use the existing clash resolution, which allows clashing packets to
    go through with a different destination. Antonio Ojea originally
    reported an issue from the postrouting chain; I proposed a fix:
    https://lore.kernel.org/netfilter-devel/ZuwSwAqKgCB2a51-@calendula/T/
    which he reported did not work for him.

13) Adds a selftest for patch 12.

14) Fixes ipvs.sh selftest.

netfilter pull request 24-09-26

* tag 'nf-24-09-26' of git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf:
  selftests: netfilter: Avoid hanging ipvs.sh
  kselftest: add test for nfqueue induced conntrack race
  netfilter: nfnetlink_queue: remove old clash resolution logic
  netfilter: nf_tables: missing objects with no memcg accounting
  netfilter: nf_tables: use rcu chain hook list iterator from netlink dump path
  netfilter: ctnetlink: compile ctnetlink_label_size with CONFIG_NF_CONNTRACK_EVENTS
  netfilter: nf_reject: Fix build warning when CONFIG_BRIDGE_NETFILTER=n
  netfilter: nf_tables: Keep deleted flowtable hooks until after RCU
  docs: tproxy: ignore non-transparent sockets in iptables
  netfilter: ctnetlink: Guard possible unused functions
  selftests: netfilter: nft_tproxy.sh: add tcp tests
  selftests: netfilter: add reverse-clash resolution test case
  netfilter: conntrack: add clash resolution for reverse collisions
  netfilter: nf_nat: don't try nat source port reallocation for reverse dir clash
====================

Link: https://patch.msgid.link/20240926110717.102194-1-pablo@netfilter.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
  • Loading branch information
Paolo Abeni committed Sep 26, 2024
2 parents 72ef075 + fc78630 commit aef3a58
Show file tree
Hide file tree
Showing 22 changed files with 1,091 additions and 132 deletions.
2 changes: 1 addition & 1 deletion Documentation/networking/tproxy.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ The idea is that you identify packets with destination address matching a local
socket on your box, set the packet mark to a certain value::

# iptables -t mangle -N DIVERT
# iptables -t mangle -A PREROUTING -p tcp -m socket -j DIVERT
# iptables -t mangle -A PREROUTING -p tcp -m socket --transparent -j DIVERT
# iptables -t mangle -A DIVERT -j MARK --set-mark 1
# iptables -t mangle -A DIVERT -j ACCEPT

Expand Down
4 changes: 0 additions & 4 deletions include/linux/netfilter.h
Original file line number Diff line number Diff line change
Expand Up @@ -376,15 +376,11 @@ int nf_route(struct net *net, struct dst_entry **dst, struct flowi *fl,
struct nf_conn;
enum nf_nat_manip_type;
struct nlattr;
enum ip_conntrack_dir;

struct nf_nat_hook {
int (*parse_nat_setup)(struct nf_conn *ct, enum nf_nat_manip_type manip,
const struct nlattr *attr);
void (*decode_session)(struct sk_buff *skb, struct flowi *fl);
unsigned int (*manip_pkt)(struct sk_buff *skb, struct nf_conn *ct,
enum nf_nat_manip_type mtype,
enum ip_conntrack_dir dir);
void (*remove_nat_bysrc)(struct nf_conn *ct);
};

Expand Down
10 changes: 4 additions & 6 deletions net/ipv4/netfilter/nf_reject_ipv4.c
Original file line number Diff line number Diff line change
Expand Up @@ -239,9 +239,8 @@ static int nf_reject_fill_skb_dst(struct sk_buff *skb_in)
void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb,
int hook)
{
struct sk_buff *nskb;
struct iphdr *niph;
const struct tcphdr *oth;
struct sk_buff *nskb;
struct tcphdr _oth;

oth = nf_reject_ip_tcphdr_get(oldskb, &_oth, hook);
Expand All @@ -266,14 +265,12 @@ void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb,
nskb->mark = IP4_REPLY_MARK(net, oldskb->mark);

skb_reserve(nskb, LL_MAX_HEADER);
niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP,
ip4_dst_hoplimit(skb_dst(nskb)));
nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP,
ip4_dst_hoplimit(skb_dst(nskb)));
nf_reject_ip_tcphdr_put(nskb, oldskb, oth);
if (ip_route_me_harder(net, sk, nskb, RTN_UNSPEC))
goto free_nskb;

niph = ip_hdr(nskb);

/* "Never happens" */
if (nskb->len > dst_mtu(skb_dst(nskb)))
goto free_nskb;
Expand All @@ -290,6 +287,7 @@ void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb,
*/
if (nf_bridge_info_exists(oldskb)) {
struct ethhdr *oeth = eth_hdr(oldskb);
struct iphdr *niph = ip_hdr(nskb);
struct net_device *br_indev;

br_indev = nf_bridge_get_physindev(oldskb, net);
Expand Down
5 changes: 2 additions & 3 deletions net/ipv6/netfilter/nf_reject_ipv6.c
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,6 @@ void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb,
const struct tcphdr *otcph;
unsigned int otcplen, hh_len;
const struct ipv6hdr *oip6h = ipv6_hdr(oldskb);
struct ipv6hdr *ip6h;
struct dst_entry *dst = NULL;
struct flowi6 fl6;

Expand Down Expand Up @@ -329,8 +328,7 @@ void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb,
nskb->mark = fl6.flowi6_mark;

skb_reserve(nskb, hh_len + dst->header_len);
ip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_TCP,
ip6_dst_hoplimit(dst));
nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_TCP, ip6_dst_hoplimit(dst));
nf_reject_ip6_tcphdr_put(nskb, oldskb, otcph, otcplen);

nf_ct_attach(nskb, oldskb);
Expand All @@ -345,6 +343,7 @@ void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb,
*/
if (nf_bridge_info_exists(oldskb)) {
struct ethhdr *oeth = eth_hdr(oldskb);
struct ipv6hdr *ip6h = ipv6_hdr(nskb);
struct net_device *br_indev;

br_indev = nf_bridge_get_physindev(oldskb, net);
Expand Down
141 changes: 51 additions & 90 deletions net/netfilter/nf_conntrack_core.c
Original file line number Diff line number Diff line change
Expand Up @@ -988,6 +988,56 @@ static void __nf_conntrack_insert_prepare(struct nf_conn *ct)
tstamp->start = ktime_get_real_ns();
}

/**
 * nf_ct_match_reverse - detect the same flow tracked in opposite directions
 * @ct1: conntrack already present in the hash table
 * @ct2: merge candidate to compare against @ct1
 *
 * Two conntracks describe one flow seen from opposite ends when the
 * original tuple of one equals the reply tuple of the other, and vice
 * versa:
 *   ct1: a:b -> c:d
 *   ct2: c:d -> a:b
 * In that situation @ct2 should never have been allocated, because its
 * skb ought to have been matched as ESTABLISHED; it exists only because
 * @ct1 had not yet been committed to the hash table when that skb arrived.
 *
 * The per-direction zone ids must agree after swapping directions as well.
 * The net namespace is deliberately not compared: entries living in
 * different namespaces cannot clash in the first place.
 *
 * Return: true if @ct1 and @ct2 are identical once origin and reply are
 * swapped, false otherwise.
 */
static bool
nf_ct_match_reverse(const struct nf_conn *ct1, const struct nf_conn *ct2)
{
	/* Each direction of ct1 must mirror the opposite direction of ct2. */
	if (!nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
			       &ct2->tuplehash[IP_CT_DIR_REPLY].tuple) ||
	    !nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple,
			       &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple))
		return false;

	/* Zone id of ct1's original direction vs. ct2's reply direction. */
	if (nf_ct_zone_id(nf_ct_zone(ct1), IP_CT_DIR_ORIGINAL) !=
	    nf_ct_zone_id(nf_ct_zone(ct2), IP_CT_DIR_REPLY))
		return false;

	/* ... and the remaining pair: ct1's reply vs. ct2's original. */
	return nf_ct_zone_id(nf_ct_zone(ct1), IP_CT_DIR_REPLY) ==
	       nf_ct_zone_id(nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL);
}

/* Decide whether @loser_ct may be merged into @ct: either nf_ct_match()
 * accepts the pair, or both are the same flow with origin and reply
 * swapped (see nf_ct_match_reverse()). Returns nonzero if mergeable.
 */
static int nf_ct_can_merge(const struct nf_conn *ct,
			   const struct nf_conn *loser_ct)
{
	if (nf_ct_match(ct, loser_ct))
		return 1;

	return nf_ct_match_reverse(ct, loser_ct);
}

/* caller must hold locks to prevent concurrent changes */
static int __nf_ct_resolve_clash(struct sk_buff *skb,
struct nf_conntrack_tuple_hash *h)
Expand All @@ -999,11 +1049,7 @@ static int __nf_ct_resolve_clash(struct sk_buff *skb,

loser_ct = nf_ct_get(skb, &ctinfo);

if (nf_ct_is_dying(ct))
return NF_DROP;

if (((ct->status & IPS_NAT_DONE_MASK) == 0) ||
nf_ct_match(ct, loser_ct)) {
if (nf_ct_can_merge(ct, loser_ct)) {
struct net *net = nf_ct_net(ct);

nf_conntrack_get(&ct->ct_general);
Expand Down Expand Up @@ -2151,80 +2197,6 @@ static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb)
nf_conntrack_get(skb_nfct(nskb));
}

static int __nf_conntrack_update(struct net *net, struct sk_buff *skb,
struct nf_conn *ct,
enum ip_conntrack_info ctinfo)
{
const struct nf_nat_hook *nat_hook;
struct nf_conntrack_tuple_hash *h;
struct nf_conntrack_tuple tuple;
unsigned int status;
int dataoff;
u16 l3num;
u8 l4num;

l3num = nf_ct_l3num(ct);

dataoff = get_l4proto(skb, skb_network_offset(skb), l3num, &l4num);
if (dataoff <= 0)
return NF_DROP;

if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num,
l4num, net, &tuple))
return NF_DROP;

if (ct->status & IPS_SRC_NAT) {
memcpy(tuple.src.u3.all,
ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.all,
sizeof(tuple.src.u3.all));
tuple.src.u.all =
ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all;
}

if (ct->status & IPS_DST_NAT) {
memcpy(tuple.dst.u3.all,
ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.all,
sizeof(tuple.dst.u3.all));
tuple.dst.u.all =
ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.all;
}

h = nf_conntrack_find_get(net, nf_ct_zone(ct), &tuple);
if (!h)
return NF_ACCEPT;

/* Store status bits of the conntrack that is clashing to re-do NAT
* mangling according to what it has been done already to this packet.
*/
status = ct->status;

nf_ct_put(ct);
ct = nf_ct_tuplehash_to_ctrack(h);
nf_ct_set(skb, ct, ctinfo);

nat_hook = rcu_dereference(nf_nat_hook);
if (!nat_hook)
return NF_ACCEPT;

if (status & IPS_SRC_NAT) {
unsigned int verdict = nat_hook->manip_pkt(skb, ct,
NF_NAT_MANIP_SRC,
IP_CT_DIR_ORIGINAL);
if (verdict != NF_ACCEPT)
return verdict;
}

if (status & IPS_DST_NAT) {
unsigned int verdict = nat_hook->manip_pkt(skb, ct,
NF_NAT_MANIP_DST,
IP_CT_DIR_ORIGINAL);
if (verdict != NF_ACCEPT)
return verdict;
}

return NF_ACCEPT;
}

/* This packet is coming from userspace via nf_queue, complete the packet
* processing after the helper invocation in nf_confirm().
*/
Expand Down Expand Up @@ -2288,17 +2260,6 @@ static int nf_conntrack_update(struct net *net, struct sk_buff *skb)
if (!ct)
return NF_ACCEPT;

if (!nf_ct_is_confirmed(ct)) {
int ret = __nf_conntrack_update(net, skb, ct, ctinfo);

if (ret != NF_ACCEPT)
return ret;

ct = nf_ct_get(skb, &ctinfo);
if (!ct)
return NF_ACCEPT;
}

return nf_confirm_cthelper(skb, ct, ctinfo);
}

Expand Down
9 changes: 3 additions & 6 deletions net/netfilter/nf_conntrack_netlink.c
Original file line number Diff line number Diff line change
Expand Up @@ -382,7 +382,7 @@ static int ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct)
#define ctnetlink_dump_secctx(a, b) (0)
#endif

#ifdef CONFIG_NF_CONNTRACK_LABELS
#ifdef CONFIG_NF_CONNTRACK_EVENTS
static inline int ctnetlink_label_size(const struct nf_conn *ct)
{
struct nf_conn_labels *labels = nf_ct_labels_find(ct);
Expand All @@ -391,6 +391,7 @@ static inline int ctnetlink_label_size(const struct nf_conn *ct)
return 0;
return nla_total_size(sizeof(labels->bits));
}
#endif

static int
ctnetlink_dump_labels(struct sk_buff *skb, const struct nf_conn *ct)
Expand All @@ -411,10 +412,6 @@ ctnetlink_dump_labels(struct sk_buff *skb, const struct nf_conn *ct)

return 0;
}
#else
#define ctnetlink_dump_labels(a, b) (0)
#define ctnetlink_label_size(a) (0)
#endif

#define master_tuple(ct) &(ct->master->tuplehash[IP_CT_DIR_ORIGINAL].tuple)

Expand Down Expand Up @@ -652,7 +649,6 @@ static size_t ctnetlink_proto_size(const struct nf_conn *ct)

return len + len4;
}
#endif

static inline size_t ctnetlink_acct_size(const struct nf_conn *ct)
{
Expand Down Expand Up @@ -690,6 +686,7 @@ static inline size_t ctnetlink_timestamp_size(const struct nf_conn *ct)
return 0;
#endif
}
#endif

#ifdef CONFIG_NF_CONNTRACK_EVENTS
static size_t ctnetlink_nlmsg_size(const struct nf_conn *ct)
Expand Down
Loading

0 comments on commit aef3a58

Please sign in to comment.