// SPDX-License-Identifier: GPL-2.0-only
/* Highlights:
* 1. The major difference between this bpf program and tcp_cubic.c
* is that this bpf program relies on `cong_control` rather than
* `cong_avoid` in the struct tcp_congestion_ops.
* 2. Logic such as tcp_cwnd_reduction, tcp_cong_avoid, and
* tcp_update_pacing_rate is bypassed when `cong_control` is
* defined, so that logic is moved into `cong_control` here.
* 3. WARNING: This bpf program is NOT the same as tcp_cubic.c.
* The main purpose is to show use cases of the arguments in
* `cong_control`. For simplicity's sake, it reuses TCP CUBIC's
* kernel functions.
*/
#include "bpf_tracing_net.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#define USEC_PER_SEC 1000000UL
#define TCP_PACING_SS_RATIO (200)
#define TCP_PACING_CA_RATIO (120)
#define TCP_REORDERING (12)
#define min(a, b) ((a) < (b) ? (a) : (b))
#define max(a, b) ((a) > (b) ? (a) : (b))
#define after(seq2, seq1) before(seq1, seq2)
extern void cubictcp_init(struct sock *sk) __ksym;
extern void cubictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) __ksym;
extern __u32 cubictcp_recalc_ssthresh(struct sock *sk) __ksym;
extern void cubictcp_state(struct sock *sk, __u8 new_state) __ksym;
extern __u32 tcp_reno_undo_cwnd(struct sock *sk) __ksym;
extern void cubictcp_acked(struct sock *sk, const struct ack_sample *sample) __ksym;
extern void cubictcp_cong_avoid(struct sock *sk, __u32 ack, __u32 acked) __ksym;
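/* Wraparound-safe TCP sequence-number comparison (signed 32-bit delta),
* mirroring the kernel's before()/after() helpers.
*/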
static bool before(__u32 seq1, __u32 seq2)
{
return (__s32)(seq1-seq2) < 0;
}
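/* Local stand-in for the kernel's div64_u64(); a plain 64-bit division
* is sufficient here.
*/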
static __u64 div64_u64(__u64 dividend, __u64 divisor)
{
return dividend / divisor;
}
static void tcp_update_pacing_rate(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
__u64 rate;
/* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
rate = (__u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3);
/* current rate is (cwnd * mss) / srtt
* In Slow Start [1], set sk_pacing_rate to 200 % the current rate.
* In Congestion Avoidance phase, set it to 120 % the current rate.
*
* [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh)
* If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching
* end of slow start and should slow down.
*/
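/* Scaling note: srtt_us is the smoothed RTT in usecs left-shifted by 3 and
* the SS/CA ratios are percentages, so the "/ 100" and "<< 3" factors above
* cancel out of the final bytes-per-second result. Illustrative numbers
* (not from the original source): mss_cache = 1448, snd_cwnd = 10 and
* srtt_us = 160000 (a 20 ms srtt << 3) give, in slow start,
* 1448 * 80000 * 200 * 10 / 160000 = 1448000 bytes/sec, i.e. 200% of
* cwnd * mss / srtt = 724000 bytes/sec.
*/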
if (tp->snd_cwnd < tp->snd_ssthresh / 2)
rate *= TCP_PACING_SS_RATIO;
else
rate *= TCP_PACING_CA_RATIO;
rate *= max(tp->snd_cwnd, tp->packets_out);
if (tp->srtt_us)
rate = div64_u64(rate, (__u64)tp->srtt_us);
sk->sk_pacing_rate = min(rate, sk->sk_max_pacing_rate);
}
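/* Mirror of the kernel's tcp_cwnd_reduction(), i.e. Proportional Rate
* Reduction (RFC 6937): while in CWR or Recovery, roughly
* snd_ssthresh / prior_cwnd packets are sent per newly delivered packet,
* so cwnd converges toward ssthresh instead of collapsing in one step.
* Once in-flight drops to ssthresh or below, sending is bounded by what
* was newly delivered (plus at most one extra segment).
*/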
static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked,
int newly_lost, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
int sndcnt = 0;
__u32 pkts_in_flight = tp->packets_out - (tp->sacked_out + tp->lost_out) + tp->retrans_out;
int delta = tp->snd_ssthresh - pkts_in_flight;
if (newly_acked_sacked <= 0 || !tp->prior_cwnd)
return;
__u32 prr_delivered = tp->prr_delivered + newly_acked_sacked;
if (delta < 0) {
__u64 dividend =
(__u64)tp->snd_ssthresh * prr_delivered + tp->prior_cwnd - 1;
sndcnt = (__u32)div64_u64(dividend, (__u64)tp->prior_cwnd) - tp->prr_out;
} else {
sndcnt = max(prr_delivered - tp->prr_out, newly_acked_sacked);
if (flag & FLAG_SND_UNA_ADVANCED && !newly_lost)
sndcnt++;
sndcnt = min(delta, sndcnt);
}
/* Force a fast retransmit upon entering fast recovery */
sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1));
tp->snd_cwnd = pkts_in_flight + sndcnt;
}
/* Decide whether to run the increase function of congestion control. */
static bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
{
if (tcp_sk(sk)->reordering > TCP_REORDERING)
return flag & FLAG_FORWARD_PROGRESS;
return flag & FLAG_DATA_ACKED;
}
SEC("struct_ops")
void BPF_PROG(bpf_cubic_init, struct sock *sk)
{
cubictcp_init(sk);
}
SEC("struct_ops")
void BPF_PROG(bpf_cubic_cwnd_event, struct sock *sk, enum tcp_ca_event event)
{
cubictcp_cwnd_event(sk, event);
}
SEC("struct_ops")
void BPF_PROG(bpf_cubic_cong_control, struct sock *sk, __u32 ack, int flag,
const struct rate_sample *rs)
{
struct tcp_sock *tp = tcp_sk(sk);
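/* In CWR (e.g. after an ECN signal) or loss Recovery, apply the PRR-style
* reduction instead of growing cwnd; otherwise, if this ACK made forward
* progress, fall through to CUBIC's regular increase path below.
*/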
if (((1<<TCP_CA_CWR) | (1<<TCP_CA_Recovery)) &
(1 << inet_csk(sk)->icsk_ca_state)) {
/* Reduce cwnd if state mandates */
tcp_cwnd_reduction(sk, rs->acked_sacked, rs->losses, flag);
if (!before(tp->snd_una, tp->high_seq)) {
/* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */
if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH &&
inet_csk(sk)->icsk_ca_state == TCP_CA_CWR) {
tp->snd_cwnd = tp->snd_ssthresh;
tp->snd_cwnd_stamp = tcp_jiffies32;
}
}
} else if (tcp_may_raise_cwnd(sk, flag)) {
/* Advance cwnd if state allows */
cubictcp_cong_avoid(sk, ack, rs->acked_sacked);
tp->snd_cwnd_stamp = tcp_jiffies32;
}
tcp_update_pacing_rate(sk);
}
SEC("struct_ops")
__u32 BPF_PROG(bpf_cubic_recalc_ssthresh, struct sock *sk)
{
return cubictcp_recalc_ssthresh(sk);
}
SEC("struct_ops")
void BPF_PROG(bpf_cubic_state, struct sock *sk, __u8 new_state)
{
cubictcp_state(sk, new_state);
}
SEC("struct_ops")
void BPF_PROG(bpf_cubic_acked, struct sock *sk, const struct ack_sample *sample)
{
cubictcp_acked(sk, sample);
}
SEC("struct_ops")
__u32 BPF_PROG(bpf_cubic_undo_cwnd, struct sock *sk)
{
return tcp_reno_undo_cwnd(sk);
}
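/* The SEC(".struct_ops") map below registers these callbacks under the CC
* name "bpf_cc_cubic". A minimal userspace loader sketch (illustrative;
* the object file name is an assumption, not part of this program):
*
*	struct bpf_object *obj = bpf_object__open_file("bpf_cc_cubic.bpf.o", NULL);
*	struct bpf_map *map;
*	struct bpf_link *link;
*
*	bpf_object__load(obj);
*	map = bpf_object__find_map_by_name(obj, "cc_cubic");
*	link = bpf_map__attach_struct_ops(map);
*
* After attaching, a socket can select it with
* setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, "bpf_cc_cubic", 12).
*/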
SEC(".struct_ops")
struct tcp_congestion_ops cc_cubic = {
.init = (void *)bpf_cubic_init,
.ssthresh = (void *)bpf_cubic_recalc_ssthresh,
.cong_control = (void *)bpf_cubic_cong_control,
.set_state = (void *)bpf_cubic_state,
.undo_cwnd = (void *)bpf_cubic_undo_cwnd,
.cwnd_event = (void *)bpf_cubic_cwnd_event,
.pkts_acked = (void *)bpf_cubic_acked,
.name = "bpf_cc_cubic",
};
char _license[] SEC("license") = "GPL";