mptcp: correctly ensure to not overfill subflows
commit 988ec13 ("mptcp: Make sure that we don't overfill subflows")
identified the problem that, due to TSQ, some unsent segments might
remain in the write_queue for a while, causing the scheduler to overfill
a subflow.

The original fix tried to calculate the available space in bytes, but
this is flawed because, in the end, TCP counts only in segments when
deciding whether to actually send the queued skb. This can result in a
large number of small (< MSS) segments, which is bad for performance
(both bandwidth and CPU load).

Instead, count in segments, but take the unsent skbs in the write_queue
into account.

Fixes: 988ec13 ("mptcp: Make sure that we don't overfill subflows")
Signed-off-by: Tim Froidcoeur <tim.froidcoeur@tessares.net>
Signed-off-by: Matthieu Baerts <matthieu.baerts@tessares.net>
(cherry picked from commit 15e8689)
Signed-off-by: Matthieu Baerts <matthieu.baerts@tessares.net>
TimFroidcoeur authored and matttbe committed Jun 30, 2021
1 parent 85aa7a9 commit 697185f
Showing 3 changed files with 49 additions and 36 deletions.
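
For illustration only, here is a minimal standalone C sketch (hypothetical numbers, not part of the patch) of the problem described in the commit message: byte-based accounting can leave a sub-MSS remainder that ends up as a tiny segment, while segment-based accounting cannot.

/* hypothetical numbers; plain userspace C, unrelated to kernel internals */
#include <stdio.h>

int main(void)
{
	unsigned int mss = 1428;		/* assumed MSS */
	unsigned int cwnd = 10;			/* congestion window, in segments */
	unsigned int in_flight = 6;		/* segments already in flight */
	unsigned int queued_bytes = 5000;	/* unsent bytes held back by TSQ */

	/* old approach: free space counted in bytes */
	unsigned int space = (cwnd - in_flight) * mss;			/* 5712 */
	unsigned int byte_room = space - queued_bytes;			/* 712, < mss */

	/* new approach: everything counted in whole segments */
	unsigned int queued_segs = (queued_bytes + mss - 1) / mss;	/* 4 */
	unsigned int seg_room = cwnd - (in_flight + queued_segs);	/* 0 */

	printf("byte accounting schedules a %u-byte (sub-MSS) segment\n", byte_room);
	printf("segment accounting schedules %u further segments\n", seg_room);
	return 0;
}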
1 change: 1 addition & 0 deletions include/net/tcp.h
@@ -353,6 +353,7 @@ int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib);
 void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb);
 int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
		      gfp_t gfp_mask);
+u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now);
 unsigned int tcp_mss_split_point(const struct sock *sk,
				  const struct sk_buff *skb,
				  unsigned int mss_now,
2 changes: 1 addition & 1 deletion net/ipv4/tcp_output.c
@@ -1755,7 +1755,7 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
 /* Return the number of segments we want in the skb we are transmitting.
  * See if congestion control module wants to decide; otherwise, autosize.
  */
-static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
+u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
 {
	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
	u32 min_tso, tso_segs;
82 changes: 47 additions & 35 deletions net/mptcp/mptcp_sched.c
@@ -1,5 +1,6 @@
 /* MPTCP Scheduler module selector. Highly inspired by tcp_cong.c */
 
+#include <linux/bug.h>
 #include <linux/module.h>
 #include <net/mptcp.h>
 #include <trace/events/tcp.h>
@@ -37,12 +38,38 @@ bool mptcp_is_def_unavailable(struct sock *sk)
 }
 EXPORT_SYMBOL_GPL(mptcp_is_def_unavailable);
 
+/* estimate number of segments currently in flight + unsent in
+ * the subflow socket.
+ */
+static int mptcp_subflow_queued(struct sock *sk, u32 max_tso_segs)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	unsigned int queued;
+
+	/* estimate the max number of segments in the write queue
+	 * this is an overestimation, avoiding to iterate over the queue
+	 * to make a better estimation.
+	 * Having only one skb in the queue however might trigger tso deferral,
+	 * delaying the sending of a tso segment in the hope that skb_entail
+	 * will append more data to the skb soon.
+	 * Therefore, in the case only one skb is in the queue, we choose to
+	 * potentially underestimate, risking to schedule one skb too many onto
+	 * the subflow rather than not enough.
+	 */
+	if (sk->sk_write_queue.qlen > 1)
+		queued = sk->sk_write_queue.qlen * max_tso_segs;
+	else
+		queued = sk->sk_write_queue.qlen;
+
+	return queued + tcp_packets_in_flight(tp);
+}
+
 static bool mptcp_is_temp_unavailable(struct sock *sk,
				       const struct sk_buff *skb,
				       bool zero_wnd_test)
 {
	const struct tcp_sock *tp = tcp_sk(sk);
-	unsigned int mss_now, space, in_flight;
+	unsigned int mss_now;
 
	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) {
		/* If SACK is disabled, and we got a loss, TCP does not exit
@@ -66,19 +93,11 @@ static bool mptcp_is_temp_unavailable(struct sock *sk,
 		return true;
 	}
 
-	in_flight = tcp_packets_in_flight(tp);
-	/* Not even a single spot in the cwnd */
-	if (in_flight >= tp->snd_cwnd)
-		return true;
-
	mss_now = tcp_current_mss(sk);
 
-	/* Now, check if what is queued in the subflow's send-queue
-	 * already fills the cwnd.
-	 */
-	space = (tp->snd_cwnd - in_flight) * mss_now;
-
-	if (tp->write_seq - tp->snd_nxt >= space)
+	/* Not even a single spot in the cwnd */
+	if (mptcp_subflow_queued(sk, tcp_tso_segs(sk, tcp_current_mss(sk)))
+	    >= tp->snd_cwnd)
		return true;
 
	if (zero_wnd_test && !before(tp->write_seq, tcp_wnd_end(tp)))
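
The estimate added above in mptcp_subflow_queued() is deliberately asymmetric: with several skbs queued, each one is assumed to expand into a full TSO burst (an overestimate), while a single queued skb counts as just one segment (a possible underestimate, so that TSO deferral does not starve the subflow). A small userspace sketch with hypothetical inputs:

#include <stdio.h>

/* mirrors the kernel helper above, with plain integers instead of socket state */
static unsigned int subflow_queued(unsigned int qlen, unsigned int max_tso_segs,
				   unsigned int in_flight)
{
	unsigned int queued;

	if (qlen > 1)			/* several skbs: assume full TSO bursts */
		queued = qlen * max_tso_segs;
	else				/* 0 or 1 skb: count at most one segment */
		queued = qlen;

	return queued + in_flight;
}

int main(void)
{
	/* hypothetical: 8 segments per TSO burst, 5 segments in flight */
	printf("%u\n", subflow_queued(2, 8, 5));	/* 21 (overestimate) */
	printf("%u\n", subflow_queued(1, 8, 5));	/* 6 (may underestimate) */
	return 0;
}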
@@ -398,11 +417,10 @@ static struct sk_buff *mptcp_next_segment(struct sock *meta_sk,
					      unsigned int *limit)
 {
	struct sk_buff *skb = __mptcp_next_segment(meta_sk, reinject);
-	unsigned int mss_now, in_flight_space;
-	int remaining_in_flight_space;
-	u32 max_len, max_segs, window;
+	unsigned int mss_now;
+	u32 max_len, gso_max_segs, max_segs, max_tso_segs, window;
	struct tcp_sock *subtp;
-	u16 gso_max_segs;
+	int queued;
 
	/* As we set it, we have to reset it as well. */
	*limit = 0;
@@ -440,35 +458,29 @@ static struct sk_buff *mptcp_next_segment(struct sock *meta_sk,
 	if (skb->len <= mss_now)
		return skb;
 
-	/* The following is similar to tcp_mss_split_point, but
-	 * we do not care about nagle, because we will anyways
-	 * use TCP_NAGLE_PUSH, which overrides this.
+	max_tso_segs = tcp_tso_segs(*subsk, tcp_current_mss(*subsk));
+	queued = mptcp_subflow_queued(*subsk, max_tso_segs);
+
+	/* this condition should already have been established in
+	 * mptcp_is_temp_unavailable when selecting available flows
	 */
+	WARN_ONCE(subtp->snd_cwnd <= queued, "Selected subflow no cwnd room");
+
	gso_max_segs = (*subsk)->sk_gso_max_segs;
	if (!gso_max_segs) /* No gso supported on the subflow's NIC */
		gso_max_segs = 1;
-	max_segs = min_t(unsigned int, tcp_cwnd_test(subtp, skb), gso_max_segs);
 
+	max_segs = min_t(unsigned int, subtp->snd_cwnd - queued, gso_max_segs);
	if (!max_segs)
		return NULL;
 
-	/* max_len is what would fit in the cwnd (respecting the 2GSO-limit of
-	 * tcp_cwnd_test), but ignoring whatever was already queued.
+	/* if there is room for a segment, schedule up to a complete TSO
+	 * segment to avoid TSO splitting. Even if it is more than allowed by
+	 * the congestion window.
	 */
-	max_len = min(mss_now * max_segs, skb->len);
-
-	in_flight_space = (subtp->snd_cwnd - tcp_packets_in_flight(subtp)) * mss_now;
-	remaining_in_flight_space = (int)in_flight_space - (subtp->write_seq - subtp->snd_nxt);
+	max_segs = max_t(unsigned int, max_tso_segs, max_segs);
 
-	if (remaining_in_flight_space <= 0)
-		WARN_ONCE(1, "in_flight %u cwnd %u wseq %u snxt %u mss_now %u cache %u",
-			  tcp_packets_in_flight(subtp), subtp->snd_cwnd,
-			  subtp->write_seq, subtp->snd_nxt, mss_now, subtp->mss_cache);
-	else
-		/* max_len now fits exactly in the write-queue, taking into
-		 * account what was already queued.
-		 */
-		max_len = min_t(u32, max_len, remaining_in_flight_space);
+	max_len = min(mss_now * max_segs, skb->len);
 
	window = tcp_wnd_end(subtp) - subtp->write_seq;
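
Worked through with hypothetical values, the new max_segs/max_len logic above behaves as follows: even if only one slot is left in the congestion window, a complete TSO burst is scheduled so that the skb is not split. A sketch of that arithmetic (assumed numbers, not from the patch):

#include <stdio.h>

int main(void)
{
	unsigned int mss_now = 1428, skb_len = 64000;	/* hypothetical */
	unsigned int snd_cwnd = 10, queued = 9;		/* one free slot left */
	unsigned int gso_max_segs = 64, max_tso_segs = 8;

	/* room left in the cwnd, capped by what one GSO skb may carry */
	unsigned int max_segs = snd_cwnd - queued;	/* 1 */
	if (max_segs > gso_max_segs)
		max_segs = gso_max_segs;

	/* there is room, so allow a complete TSO burst to avoid splitting,
	 * even though that overshoots the congestion window
	 */
	if (max_segs < max_tso_segs)
		max_segs = max_tso_segs;		/* 8 */

	unsigned int max_len = mss_now * max_segs;	/* 11424 */
	if (max_len > skb_len)
		max_len = skb_len;

	printf("schedule up to %u bytes (%u segments)\n", max_len, max_segs);
	return 0;
}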

Expand Down