1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649
|
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
#include <linux/mm.h>
#include <linux/llist.h>
#include <linux/bpf.h>
#include <linux/irq_work.h>
#include <linux/bpf_mem_alloc.h>
#include <linux/memcontrol.h>
#include <asm/local.h>
/* Any context (including NMI) BPF specific memory allocator.
*
* Tracing BPF programs can attach to kprobe and fentry. Hence they
* run in unknown context where calling plain kmalloc() might not be safe.
*
* Front-end kmalloc() with per-cpu per-bucket cache of free elements.
* Refill this cache asynchronously from irq_work.
*
* CPU_0 buckets
* 16 32 64 96 128 196 256 512 1024 2048 4096
* ...
* CPU_N buckets
* 16 32 64 96 128 196 256 512 1024 2048 4096
*
* The buckets are prefilled at the start.
* BPF programs always run with migration disabled.
* It's safe to allocate from cache of the current cpu with irqs disabled.
* Free-ing is always done into bucket of the current cpu as well.
* irq_work trims extra free elements from buckets with kfree
* and refills them with kmalloc, so global kmalloc logic takes care
* of freeing objects allocated by one cpu and freed on another.
*
* Every allocated objected is padded with extra 8 bytes that contains
* struct llist_node.
*/
#define LLIST_NODE_SZ sizeof(struct llist_node)
/* similar to kmalloc, but sizeof == 8 bucket is gone */
static u8 size_index[24] __ro_after_init = {
3, /* 8 */
3, /* 16 */
4, /* 24 */
4, /* 32 */
5, /* 40 */
5, /* 48 */
5, /* 56 */
5, /* 64 */
1, /* 72 */
1, /* 80 */
1, /* 88 */
1, /* 96 */
6, /* 104 */
6, /* 112 */
6, /* 120 */
6, /* 128 */
2, /* 136 */
2, /* 144 */
2, /* 152 */
2, /* 160 */
2, /* 168 */
2, /* 176 */
2, /* 184 */
2 /* 192 */
};
static int bpf_mem_cache_idx(size_t size)
{
if (!size || size > 4096)
return -1;
if (size <= 192)
return size_index[(size - 1) / 8] - 1;
return fls(size - 1) - 2;
}
#define NUM_CACHES 11
struct bpf_mem_cache {
/* per-cpu list of free objects of size 'unit_size'.
* All accesses are done with interrupts disabled and 'active' counter
* protection with __llist_add() and __llist_del_first().
*/
struct llist_head free_llist;
local_t active;
/* Operations on the free_list from unit_alloc/unit_free/bpf_mem_refill
* are sequenced by per-cpu 'active' counter. But unit_free() cannot
* fail. When 'active' is busy the unit_free() will add an object to
* free_llist_extra.
*/
struct llist_head free_llist_extra;
struct irq_work refill_work;
struct obj_cgroup *objcg;
int unit_size;
/* count of objects in free_llist */
int free_cnt;
int low_watermark, high_watermark, batch;
int percpu_size;
struct rcu_head rcu;
struct llist_head free_by_rcu;
struct llist_head waiting_for_gp;
atomic_t call_rcu_in_progress;
};
struct bpf_mem_caches {
struct bpf_mem_cache cache[NUM_CACHES];
};
static struct llist_node notrace *__llist_del_first(struct llist_head *head)
{
struct llist_node *entry, *next;
entry = head->first;
if (!entry)
return NULL;
next = entry->next;
head->first = next;
return entry;
}
static void *__alloc(struct bpf_mem_cache *c, int node)
{
/* Allocate, but don't deplete atomic reserves that typical
* GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc
* will allocate from the current numa node which is what we
* want here.
*/
gfp_t flags = GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT;
if (c->percpu_size) {
void **obj = kmalloc_node(c->percpu_size, flags, node);
void *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags);
if (!obj || !pptr) {
free_percpu(pptr);
kfree(obj);
return NULL;
}
obj[1] = pptr;
return obj;
}
return kmalloc_node(c->unit_size, flags | __GFP_ZERO, node);
}
static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c)
{
#ifdef CONFIG_MEMCG_KMEM
if (c->objcg)
return get_mem_cgroup_from_objcg(c->objcg);
#endif
#ifdef CONFIG_MEMCG
return root_mem_cgroup;
#else
return NULL;
#endif
}
/* Mostly runs from irq_work except __init phase. */
static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)
{
struct mem_cgroup *memcg = NULL, *old_memcg;
unsigned long flags;
void *obj;
int i;
memcg = get_memcg(c);
old_memcg = set_active_memcg(memcg);
for (i = 0; i < cnt; i++) {
obj = __alloc(c, node);
if (!obj)
break;
if (IS_ENABLED(CONFIG_PREEMPT_RT))
/* In RT irq_work runs in per-cpu kthread, so disable
* interrupts to avoid preemption and interrupts and
* reduce the chance of bpf prog executing on this cpu
* when active counter is busy.
*/
local_irq_save(flags);
/* alloc_bulk runs from irq_work which will not preempt a bpf
* program that does unit_alloc/unit_free since IRQs are
* disabled there. There is no race to increment 'active'
* counter. It protects free_llist from corruption in case NMI
* bpf prog preempted this loop.
*/
WARN_ON_ONCE(local_inc_return(&c->active) != 1);
__llist_add(obj, &c->free_llist);
c->free_cnt++;
local_dec(&c->active);
if (IS_ENABLED(CONFIG_PREEMPT_RT))
local_irq_restore(flags);
}
set_active_memcg(old_memcg);
mem_cgroup_put(memcg);
}
static void free_one(struct bpf_mem_cache *c, void *obj)
{
if (c->percpu_size) {
free_percpu(((void **)obj)[1]);
kfree(obj);
return;
}
kfree(obj);
}
static void __free_rcu(struct rcu_head *head)
{
struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
struct llist_node *llnode = llist_del_all(&c->waiting_for_gp);
struct llist_node *pos, *t;
llist_for_each_safe(pos, t, llnode)
free_one(c, pos);
atomic_set(&c->call_rcu_in_progress, 0);
}
static void __free_rcu_tasks_trace(struct rcu_head *head)
{
struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
call_rcu(&c->rcu, __free_rcu);
}
static void enque_to_free(struct bpf_mem_cache *c, void *obj)
{
struct llist_node *llnode = obj;
/* bpf_mem_cache is a per-cpu object. Freeing happens in irq_work.
* Nothing races to add to free_by_rcu list.
*/
__llist_add(llnode, &c->free_by_rcu);
}
static void do_call_rcu(struct bpf_mem_cache *c)
{
struct llist_node *llnode, *t;
if (atomic_xchg(&c->call_rcu_in_progress, 1))
return;
WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));
llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu))
/* There is no concurrent __llist_add(waiting_for_gp) access.
* It doesn't race with llist_del_all either.
* But there could be two concurrent llist_del_all(waiting_for_gp):
* from __free_rcu() and from drain_mem_cache().
*/
__llist_add(llnode, &c->waiting_for_gp);
/* Use call_rcu_tasks_trace() to wait for sleepable progs to finish.
* Then use call_rcu() to wait for normal progs to finish
* and finally do free_one() on each element.
*/
call_rcu_tasks_trace(&c->rcu, __free_rcu_tasks_trace);
}
static void free_bulk(struct bpf_mem_cache *c)
{
struct llist_node *llnode, *t;
unsigned long flags;
int cnt;
do {
if (IS_ENABLED(CONFIG_PREEMPT_RT))
local_irq_save(flags);
WARN_ON_ONCE(local_inc_return(&c->active) != 1);
llnode = __llist_del_first(&c->free_llist);
if (llnode)
cnt = --c->free_cnt;
else
cnt = 0;
local_dec(&c->active);
if (IS_ENABLED(CONFIG_PREEMPT_RT))
local_irq_restore(flags);
if (llnode)
enque_to_free(c, llnode);
} while (cnt > (c->high_watermark + c->low_watermark) / 2);
/* and drain free_llist_extra */
llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra))
enque_to_free(c, llnode);
do_call_rcu(c);
}
static void bpf_mem_refill(struct irq_work *work)
{
struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, refill_work);
int cnt;
/* Racy access to free_cnt. It doesn't need to be 100% accurate */
cnt = c->free_cnt;
if (cnt < c->low_watermark)
/* irq_work runs on this cpu and kmalloc will allocate
* from the current numa node which is what we want here.
*/
alloc_bulk(c, c->batch, NUMA_NO_NODE);
else if (cnt > c->high_watermark)
free_bulk(c);
}
static void notrace irq_work_raise(struct bpf_mem_cache *c)
{
irq_work_queue(&c->refill_work);
}
/* For typical bpf map case that uses bpf_mem_cache_alloc and single bucket
* the freelist cache will be elem_size * 64 (or less) on each cpu.
*
* For bpf programs that don't have statically known allocation sizes and
* assuming (low_mark + high_mark) / 2 as an average number of elements per
* bucket and all buckets are used the total amount of memory in freelists
* on each cpu will be:
* 64*16 + 64*32 + 64*64 + 64*96 + 64*128 + 64*196 + 64*256 + 32*512 + 16*1024 + 8*2048 + 4*4096
* == ~ 116 Kbyte using below heuristic.
* Initialized, but unused bpf allocator (not bpf map specific one) will
* consume ~ 11 Kbyte per cpu.
* Typical case will be between 11K and 116K closer to 11K.
* bpf progs can and should share bpf_mem_cache when possible.
*/
static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
{
init_irq_work(&c->refill_work, bpf_mem_refill);
if (c->unit_size <= 256) {
c->low_watermark = 32;
c->high_watermark = 96;
} else {
/* When page_size == 4k, order-0 cache will have low_mark == 2
* and high_mark == 6 with batch alloc of 3 individual pages at
* a time.
* 8k allocs and above low == 1, high == 3, batch == 1.
*/
c->low_watermark = max(32 * 256 / c->unit_size, 1);
c->high_watermark = max(96 * 256 / c->unit_size, 3);
}
c->batch = max((c->high_watermark - c->low_watermark) / 4 * 3, 1);
/* To avoid consuming memory assume that 1st run of bpf
* prog won't be doing more than 4 map_update_elem from
* irq disabled region
*/
alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu));
}
/* When size != 0 bpf_mem_cache for each cpu.
* This is typical bpf hash map use case when all elements have equal size.
*
* When size == 0 allocate 11 bpf_mem_cache-s for each cpu, then rely on
* kmalloc/kfree. Max allocation size is 4096 in this case.
* This is bpf_dynptr and bpf_kptr use case.
*/
int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
{
static u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
struct bpf_mem_caches *cc, __percpu *pcc;
struct bpf_mem_cache *c, __percpu *pc;
struct obj_cgroup *objcg = NULL;
int cpu, i, unit_size, percpu_size = 0;
if (size) {
pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL);
if (!pc)
return -ENOMEM;
if (percpu)
/* room for llist_node and per-cpu pointer */
percpu_size = LLIST_NODE_SZ + sizeof(void *);
else
size += LLIST_NODE_SZ; /* room for llist_node */
unit_size = size;
#ifdef CONFIG_MEMCG_KMEM
objcg = get_obj_cgroup_from_current();
#endif
for_each_possible_cpu(cpu) {
c = per_cpu_ptr(pc, cpu);
c->unit_size = unit_size;
c->objcg = objcg;
c->percpu_size = percpu_size;
prefill_mem_cache(c, cpu);
}
ma->cache = pc;
return 0;
}
/* size == 0 && percpu is an invalid combination */
if (WARN_ON_ONCE(percpu))
return -EINVAL;
pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL);
if (!pcc)
return -ENOMEM;
#ifdef CONFIG_MEMCG_KMEM
objcg = get_obj_cgroup_from_current();
#endif
for_each_possible_cpu(cpu) {
cc = per_cpu_ptr(pcc, cpu);
for (i = 0; i < NUM_CACHES; i++) {
c = &cc->cache[i];
c->unit_size = sizes[i];
c->objcg = objcg;
prefill_mem_cache(c, cpu);
}
}
ma->caches = pcc;
return 0;
}
static void drain_mem_cache(struct bpf_mem_cache *c)
{
struct llist_node *llnode, *t;
/* No progs are using this bpf_mem_cache, but htab_map_free() called
* bpf_mem_cache_free() for all remaining elements and they can be in
* free_by_rcu or in waiting_for_gp lists, so drain those lists now.
*
* Except for waiting_for_gp list, there are no concurrent operations
* on these lists, so it is safe to use __llist_del_all().
*/
llist_for_each_safe(llnode, t, __llist_del_all(&c->free_by_rcu))
free_one(c, llnode);
llist_for_each_safe(llnode, t, llist_del_all(&c->waiting_for_gp))
free_one(c, llnode);
llist_for_each_safe(llnode, t, __llist_del_all(&c->free_llist))
free_one(c, llnode);
llist_for_each_safe(llnode, t, __llist_del_all(&c->free_llist_extra))
free_one(c, llnode);
}
static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
{
free_percpu(ma->cache);
free_percpu(ma->caches);
ma->cache = NULL;
ma->caches = NULL;
}
static void free_mem_alloc(struct bpf_mem_alloc *ma)
{
/* waiting_for_gp lists was drained, but __free_rcu might
* still execute. Wait for it now before we freeing percpu caches.
*/
rcu_barrier_tasks_trace();
rcu_barrier();
free_mem_alloc_no_barrier(ma);
}
static void free_mem_alloc_deferred(struct work_struct *work)
{
struct bpf_mem_alloc *ma = container_of(work, struct bpf_mem_alloc, work);
free_mem_alloc(ma);
kfree(ma);
}
static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress)
{
struct bpf_mem_alloc *copy;
if (!rcu_in_progress) {
/* Fast path. No callbacks are pending, hence no need to do
* rcu_barrier-s.
*/
free_mem_alloc_no_barrier(ma);
return;
}
copy = kmalloc(sizeof(*ma), GFP_KERNEL);
if (!copy) {
/* Slow path with inline barrier-s */
free_mem_alloc(ma);
return;
}
/* Defer barriers into worker to let the rest of map memory to be freed */
copy->cache = ma->cache;
ma->cache = NULL;
copy->caches = ma->caches;
ma->caches = NULL;
INIT_WORK(©->work, free_mem_alloc_deferred);
queue_work(system_unbound_wq, ©->work);
}
void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
{
struct bpf_mem_caches *cc;
struct bpf_mem_cache *c;
int cpu, i, rcu_in_progress;
if (ma->cache) {
rcu_in_progress = 0;
for_each_possible_cpu(cpu) {
c = per_cpu_ptr(ma->cache, cpu);
/*
* refill_work may be unfinished for PREEMPT_RT kernel
* in which irq work is invoked in a per-CPU RT thread.
* It is also possible for kernel with
* arch_irq_work_has_interrupt() being false and irq
* work is invoked in timer interrupt. So waiting for
* the completion of irq work to ease the handling of
* concurrency.
*/
irq_work_sync(&c->refill_work);
drain_mem_cache(c);
rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
}
/* objcg is the same across cpus */
if (c->objcg)
obj_cgroup_put(c->objcg);
destroy_mem_alloc(ma, rcu_in_progress);
}
if (ma->caches) {
rcu_in_progress = 0;
for_each_possible_cpu(cpu) {
cc = per_cpu_ptr(ma->caches, cpu);
for (i = 0; i < NUM_CACHES; i++) {
c = &cc->cache[i];
irq_work_sync(&c->refill_work);
drain_mem_cache(c);
rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
}
}
if (c->objcg)
obj_cgroup_put(c->objcg);
destroy_mem_alloc(ma, rcu_in_progress);
}
}
/* notrace is necessary here and in other functions to make sure
* bpf programs cannot attach to them and cause llist corruptions.
*/
static void notrace *unit_alloc(struct bpf_mem_cache *c)
{
struct llist_node *llnode = NULL;
unsigned long flags;
int cnt = 0;
/* Disable irqs to prevent the following race for majority of prog types:
* prog_A
* bpf_mem_alloc
* preemption or irq -> prog_B
* bpf_mem_alloc
*
* but prog_B could be a perf_event NMI prog.
* Use per-cpu 'active' counter to order free_list access between
* unit_alloc/unit_free/bpf_mem_refill.
*/
local_irq_save(flags);
if (local_inc_return(&c->active) == 1) {
llnode = __llist_del_first(&c->free_llist);
if (llnode)
cnt = --c->free_cnt;
}
local_dec(&c->active);
local_irq_restore(flags);
WARN_ON(cnt < 0);
if (cnt < c->low_watermark)
irq_work_raise(c);
return llnode;
}
/* Though 'ptr' object could have been allocated on a different cpu
* add it to the free_llist of the current cpu.
* Let kfree() logic deal with it when it's later called from irq_work.
*/
static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
{
struct llist_node *llnode = ptr - LLIST_NODE_SZ;
unsigned long flags;
int cnt = 0;
BUILD_BUG_ON(LLIST_NODE_SZ > 8);
local_irq_save(flags);
if (local_inc_return(&c->active) == 1) {
__llist_add(llnode, &c->free_llist);
cnt = ++c->free_cnt;
} else {
/* unit_free() cannot fail. Therefore add an object to atomic
* llist. free_bulk() will drain it. Though free_llist_extra is
* a per-cpu list we have to use atomic llist_add here, since
* it also can be interrupted by bpf nmi prog that does another
* unit_free() into the same free_llist_extra.
*/
llist_add(llnode, &c->free_llist_extra);
}
local_dec(&c->active);
local_irq_restore(flags);
if (cnt > c->high_watermark)
/* free few objects from current cpu into global kmalloc pool */
irq_work_raise(c);
}
/* Called from BPF program or from sys_bpf syscall.
* In both cases migration is disabled.
*/
void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size)
{
int idx;
void *ret;
if (!size)
return ZERO_SIZE_PTR;
idx = bpf_mem_cache_idx(size + LLIST_NODE_SZ);
if (idx < 0)
return NULL;
ret = unit_alloc(this_cpu_ptr(ma->caches)->cache + idx);
return !ret ? NULL : ret + LLIST_NODE_SZ;
}
void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
{
int idx;
if (!ptr)
return;
idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ));
if (idx < 0)
return;
unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr);
}
void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma)
{
void *ret;
ret = unit_alloc(this_cpu_ptr(ma->cache));
return !ret ? NULL : ret + LLIST_NODE_SZ;
}
void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr)
{
if (!ptr)
return;
unit_free(this_cpu_ptr(ma->cache), ptr);
}
|