File: nf_conntrack_ecache.c

package info (click to toggle)
linux 6.16.3-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 1,724,576 kB
  • sloc: ansic: 26,558,545; asm: 271,315; sh: 143,998; python: 72,469; makefile: 57,126; perl: 36,821; xml: 19,553; cpp: 5,820; yacc: 4,915; lex: 2,955; awk: 1,667; sed: 28; ruby: 25
file content (381 lines) | stat: -rw-r--r-- 9,304 bytes parent folder | download | duplicates (13)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
// SPDX-License-Identifier: GPL-2.0-only
/* Event cache for netfilter. */

/*
 * (C) 2005 Harald Welte <laforge@gnumonks.org>
 * (C) 2005 Patrick McHardy <kaber@trash.net>
 * (C) 2005-2006 Netfilter Core Team <coreteam@netfilter.org>
 * (C) 2005 USAGI/WIDE Project <http://www.linux-ipv6.org>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/types.h>
#include <linux/netfilter.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <linux/stddef.h>
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/slab.h>
#include <linux/export.h>

#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_extend.h>

/* Serialises installation and removal of the per-netns event notifier. */
static DEFINE_MUTEX(nf_ct_ecache_mutex);

/* Nulls marker used when initialising the dying list head and the worker's
 * private eviction list head.
 */
#define DYING_NULLS_VAL			((1 << 30) + 1)
/* Time budget from which ecache_work_evict_list() derives its stop deadline. */
#define ECACHE_MAX_JIFFIES		msecs_to_jiffies(10)
/* Delay before the worker is rescheduled after a congested delivery attempt. */
#define ECACHE_RETRY_JIFFIES		msecs_to_jiffies(10)

/* Outcome of one eviction pass, consumed by ecache_work() to pick a delay. */
enum retry_state {
	/* delivering a destroy event failed; retry after a delay */
	STATE_CONGESTED,
	/* pass ended early on the time check; rerun immediately */
	STATE_RESTART,
	/* the dying list was walked to its end */
	STATE_DONE,
};

struct nf_conntrack_net_ecache *nf_conn_pernet_ecache(const struct net *net)
{
	struct nf_conntrack_net *cnet = nf_ct_pernet(net);

	return &cnet->ecache;
}
#if IS_MODULE(CONFIG_NF_CT_NETLINK)
EXPORT_SYMBOL_GPL(nf_conn_pernet_ecache);
#endif

/* Try to (re)deliver IPCT_DESTROY for every conntrack on the dying list.
 *
 * Entries whose event was delivered are unlinked from the dying list while
 * holding dying_lock, parked on a private list, and released with
 * nf_ct_put() once the lock has been dropped.
 *
 * Returns:
 *  STATE_CONGESTED - nf_conntrack_event() reported failure; retry later.
 *  STATE_RESTART   - the ECACHE_MAX_JIFFIES budget was used up; run again.
 *  STATE_DONE      - the dying list was walked to its end.
 */
static enum retry_state ecache_work_evict_list(struct nf_conntrack_net *cnet)
{
	unsigned long stop = jiffies + ECACHE_MAX_JIFFIES;
	struct hlist_nulls_head evicted_list;
	enum retry_state ret = STATE_DONE;
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	unsigned int sent;

	INIT_HLIST_NULLS_HEAD(&evicted_list, DYING_NULLS_VAL);

next:
	sent = 0;
	spin_lock_bh(&cnet->ecache.dying_lock);

	hlist_nulls_for_each_entry_safe(h, n, &cnet->ecache.dying_list, hnnode) {
		struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);

		/* The worker owns all entries, ct remains valid until nf_ct_put
		 * in the loop below.
		 */
		if (nf_conntrack_event(IPCT_DESTROY, ct)) {
			ret = STATE_CONGESTED;
			break;
		}

		/* Delivered: leave the dying list (linked via the ORIGINAL
		 * node) and park on the private list via the REPLY node.
		 */
		hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
		hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode, &evicted_list);

		/* Stop once the deadline has passed.  time_after(a, b) is true
		 * when a is later than b, so jiffies must be the first
		 * argument; the reversed order would end the pass after the
		 * very first entry.
		 */
		if (time_after(jiffies, stop)) {
			ret = STATE_RESTART;
			break;
		}

		/* Periodically drop the lock and give the scheduler a chance;
		 * the walk then restarts from the list head.
		 */
		if (sent++ > 16) {
			spin_unlock_bh(&cnet->ecache.dying_lock);
			cond_resched();
			goto next;
		}
	}

	spin_unlock_bh(&cnet->ecache.dying_lock);

	/* Release everything that was moved to the private list. */
	hlist_nulls_for_each_entry_safe(h, n, &evicted_list, hnnode) {
		struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);

		hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);
		nf_ct_put(ct);

		cond_resched();
	}

	return ret;
}

/* Delayed-work handler: run one eviction pass over the dying list and
 * re-arm the work item according to the outcome.
 */
static void ecache_work(struct work_struct *work)
{
	struct nf_conntrack_net *cnet;

	cnet = container_of(work, struct nf_conntrack_net, ecache.dwork.work);

	switch (ecache_work_evict_list(cnet)) {
	case STATE_CONGESTED:
		/* delivery failed, back off before the next attempt */
		schedule_delayed_work(&cnet->ecache.dwork, ECACHE_RETRY_JIFFIES);
		break;
	case STATE_RESTART:
		/* pass was cut short, continue right away */
		schedule_delayed_work(&cnet->ecache.dwork, 0);
		break;
	case STATE_DONE:
		/* nothing left to do, do not re-arm */
		break;
	}
}

/* Hand @events plus previously @missed events to the registered notifier
 * and update the missed-event mask in @e according to the result.
 *
 * Returns 0 when nothing matches e->ctmask, when no notifier is registered,
 * or when delivery succeeded with no missed events pending; otherwise the
 * notifier's return value.
 */
static int __nf_conntrack_eventmask_report(struct nf_conntrack_ecache *e,
					   const u32 events,
					   const u32 missed,
					   const struct nf_ct_event *item)
{
	struct net *net = nf_ct_net(item->ct);
	const u32 pending = events | missed;
	struct nf_ct_event_notifier *notify;
	u32 cur, next;
	int ret;

	if (!(pending & e->ctmask))
		return 0;

	rcu_read_lock();

	notify = rcu_dereference(net->ct.nf_conntrack_event_cb);
	if (!notify)
		goto out_unlock;

	ret = notify->ct_event(pending, item);
	rcu_read_unlock();

	if (likely(ret >= 0 && missed == 0))
		return 0;

	/* Lockless update of e->missed: on failure remember the events that
	 * could not be delivered, on success clear the ones just resent.
	 */
	for (;;) {
		cur = READ_ONCE(e->missed);
		next = ret < 0 ? (cur | events) : (cur & ~missed);

		if (cmpxchg(&e->missed, cur, next) == cur)
			break;
	}

	return ret;

out_unlock:
	rcu_read_unlock();
	return 0;
}

/* Update the stored timestamp to the current real time, but only when a
 * non-zero timestamp has been recorded before.
 */
static void nf_ct_ecache_tstamp_refresh(struct nf_conntrack_ecache *e)
{
#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
	if (!local64_read(&e->timestamp))
		return;

	local64_set(&e->timestamp, ktime_get_real_ns());
#endif
}

/* Report @events for @ct to the event notifier.
 *
 * @portid and @report are passed along in the event item; a portid already
 * stored in the ecache extension takes precedence over @portid.
 *
 * Returns 0 if @ct is unconfirmed or carries no ecache extension, otherwise
 * the result of __nf_conntrack_eventmask_report().
 */
int nf_conntrack_eventmask_report(unsigned int events, struct nf_conn *ct,
				  u32 portid, int report)
{
	struct nf_conntrack_ecache *e;
	struct nf_ct_event item;
	unsigned int missed;
	int ret;

	if (!nf_ct_is_confirmed(ct))
		return 0;

	e = nf_ct_ecache_find(ct);
	if (!e)
		return 0;

	memset(&item, 0, sizeof(item));

	item.ct = ct;
	if (e->portid)
		item.portid = e->portid;
	else
		item.portid = portid;
	item.report = report;

	/* Is this a resend of a destroy event? If so, skip missed events. */
	if (e->portid)
		missed = 0;
	else
		missed = e->missed;

	nf_ct_ecache_tstamp_refresh(e);

	ret = __nf_conntrack_eventmask_report(e, events, missed, &item);
	if (likely(ret >= 0 || !(events & (1 << IPCT_DESTROY))))
		return ret;

	/* A destroy event triggered by a process could not be delivered:
	 * remember the PORTID so the retransmission can include it.
	 */
	if (e->portid == 0 && portid != 0)
		e->portid = portid;

	return ret;
}
EXPORT_SYMBOL_GPL(nf_conntrack_eventmask_report);

/* Deliver the events cached for @ct and clear the cache entry.
 *
 * Must be called with softirqs disabled on the local CPU.  Does nothing for
 * unconfirmed or dying conntracks, or when @ct has no ecache extension.
 */
void nf_ct_deliver_cached_events(struct nf_conn *ct)
{
	struct nf_conntrack_ecache *e;
	struct nf_ct_event item;
	unsigned int events;

	if (!nf_ct_is_confirmed(ct))
		return;
	if (nf_ct_is_dying(ct))
		return;

	e = nf_ct_ecache_find(ct);
	if (!e)
		return;

	/* Atomically fetch and reset the cached event bits. */
	events = xchg(&e->cache, 0);

	item.report = 0;
	item.portid = 0;
	item.ct = ct;

	/* e->missed is sampled without a lock, so missed events may
	 * occasionally be sent twice.  That is harmless and rare.
	 */
	__nf_conntrack_eventmask_report(e, events, e->missed, &item);
}
EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events);

/* Report expectation @event for @exp to the registered notifier, provided
 * the master conntrack has an ecache extension whose expmask selects it.
 * @portid and @report are forwarded in the event item.
 */
void nf_ct_expect_event_report(enum ip_conntrack_expect_events event,
			       struct nf_conntrack_expect *exp,
			       u32 portid, int report)

{
	struct nf_ct_event_notifier *notify;
	struct nf_conntrack_ecache *e = NULL;
	struct net *net = nf_ct_exp_net(exp);

	rcu_read_lock();

	notify = rcu_dereference(net->ct.nf_conntrack_event_cb);
	if (notify)
		e = nf_ct_ecache_find(exp->master);

	if (e && (e->expmask & (1 << event))) {
		struct nf_exp_event item = {
			.exp	= exp,
			.portid	= portid,
			.report = report
		};

		notify->exp_event(1 << event, &item);
	}

	rcu_read_unlock();
}

/* Install @new as the conntrack event notifier of @net.  Warns once if a
 * notifier is already installed; the pointer is overwritten regardless.
 */
void nf_conntrack_register_notifier(struct net *net,
				    const struct nf_ct_event_notifier *new)
{
	mutex_lock(&nf_ct_ecache_mutex);

	WARN_ON_ONCE(rcu_dereference_protected(net->ct.nf_conntrack_event_cb,
					       lockdep_is_held(&nf_ct_ecache_mutex)));
	rcu_assign_pointer(net->ct.nf_conntrack_event_cb, new);

	mutex_unlock(&nf_ct_ecache_mutex);
}
EXPORT_SYMBOL_GPL(nf_conntrack_register_notifier);

/* Remove the conntrack event notifier of @net by resetting the per-netns
 * callback pointer to NULL under nf_ct_ecache_mutex.
 */
void nf_conntrack_unregister_notifier(struct net *net)
{
	mutex_lock(&nf_ct_ecache_mutex);
	RCU_INIT_POINTER(net->ct.nf_conntrack_event_cb, NULL);
	mutex_unlock(&nf_ct_ecache_mutex);
	/* No grace period is waited for here:
	 * synchronize_rcu() is called after netns pre_exit
	 */
}
EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier);

/* Adjust the redelivery worker of @net after a destroy event attempt.
 *
 * NFCT_ECACHE_DESTROY_FAIL: arm the worker to run in HZ jiffies unless it
 * is already pending, and flag it as pending.
 * NFCT_ECACHE_DESTROY_SENT: kick the worker immediately while the dying
 * list still has entries, otherwise clear the pending flag.
 */
void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state)
{
	struct nf_conntrack_net *cnet = nf_ct_pernet(net);

	switch (state) {
	case NFCT_ECACHE_DESTROY_FAIL:
		if (delayed_work_pending(&cnet->ecache.dwork))
			break;

		schedule_delayed_work(&cnet->ecache.dwork, HZ);
		net->ct.ecache_dwork_pending = true;
		break;
	case NFCT_ECACHE_DESTROY_SENT:
		if (hlist_nulls_empty(&cnet->ecache.dying_list))
			net->ct.ecache_dwork_pending = false;
		else
			mod_delayed_work(system_wq, &cnet->ecache.dwork, 0);
		break;
	default:
		break;
	}
}

/* Initialise the timestamp of a fresh ecache extension: current real time
 * if @ct carries the timestamp extension, zero otherwise.
 */
static void nf_ct_ecache_tstamp_new(const struct nf_conn *ct, struct nf_conntrack_ecache *e)
{
#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
	u64 ts;

	if (nf_ct_ext_exist(ct, NF_CT_EXT_TSTAMP))
		ts = ktime_get_real_ns();
	else
		ts = 0;

	local64_set(&e->timestamp, ts);
#endif
}

/* Attach an event-cache extension to @ct if the netns policy asks for one.
 *
 * @ctmask / @expmask: event masks requested by the caller; when both are
 * zero and an extension is allocated anyway, all events are enabled.
 * @gfp: allocation flags passed to nf_ct_ext_add().
 *
 * Returns true when no extension is needed or it was added successfully,
 * false when adding the extension failed.
 */
bool nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp)
{
	struct nf_conntrack_ecache *e;
	struct net *net = nf_ct_net(ct);

	switch (net->ct.sysctl_events) {
	case 0:
		/* Events disabled by sysctl: only an explicit mask (set via
		 * template / ruleset) still gets an extension.
		 */
		if (!ctmask && !expmask)
			return true;
		break;
	case 2:
		/* Autodetect: without an event listener, skip the extension. */
		if (!READ_ONCE(nf_ctnetlink_has_listener))
			return true;
		fallthrough;
	case 1:
		/* Always allocate; default to every event if none was given. */
		if (!ctmask && !expmask) {
			ctmask = ~0;
			expmask = ~0;
		}
		break;
	default:
		WARN_ON_ONCE(1);
		return true;
	}

	e = nf_ct_ext_add(ct, NF_CT_EXT_ECACHE, gfp);
	if (!e)
		return false;

	nf_ct_ecache_tstamp_new(ct, e);
	e->ctmask  = ctmask;
	e->expmask = expmask;

	return true;
}
EXPORT_SYMBOL_GPL(nf_ct_ecache_ext_add);

/* Initial events mode copied into every netns at pernet init; mode 2 is the
 * "autodetect" case handled in nf_ct_ecache_ext_add().
 */
#define NF_CT_EVENTS_DEFAULT 2
static int nf_ct_events __read_mostly = NF_CT_EVENTS_DEFAULT;

void nf_conntrack_ecache_pernet_init(struct net *net)
{
	struct nf_conntrack_net *cnet = nf_ct_pernet(net);

	net->ct.sysctl_events = nf_ct_events;

	INIT_DELAYED_WORK(&cnet->ecache.dwork, ecache_work);
	INIT_HLIST_NULLS_HEAD(&cnet->ecache.dying_list, DYING_NULLS_VAL);
	spin_lock_init(&cnet->ecache.dying_lock);

	BUILD_BUG_ON(__IPCT_MAX >= 16);	/* e->ctmask is u16 */
}

void nf_conntrack_ecache_pernet_fini(struct net *net)
{
	struct nf_conntrack_net *cnet = nf_ct_pernet(net);

	cancel_delayed_work_sync(&cnet->ecache.dwork);
}