1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329
|
/*
* SPDX-FileCopyrightText: Copyright (c) 2016-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include "nv-kthread-q.h"
#include "nv-list-helpers.h"
#include <linux/kthread.h>
#include <linux/interrupt.h>
#include <linux/completion.h>
#include <linux/module.h>
#include <linux/mm.h>
#if defined(NV_LINUX_BUG_H_PRESENT)
#include <linux/bug.h>
#else
#include <asm/bug.h>
#endif
// Today's implementation is a little simpler and more limited than the
// API description allows for in nv-kthread-q.h. Details include:
//
// 1. Each nv_kthread_q instance is a first-in, first-out queue.
//
// 2. Each nv_kthread_q instance is serviced by exactly one kthread.
//
// You can create any number of queues, each of which gets its own
// named kernel thread (kthread). You can then insert arbitrary functions
// into the queue, and those functions will be run in the context of the
// queue's kthread.
#ifndef WARN
// Only *really* old kernels (2.6.9) end up here. Just use a simple printk
// to implement this, because such kernels won't be supported much longer.
#define WARN(condition, format...) ({ \
int __ret_warn_on = !!(condition); \
if (unlikely(__ret_warn_on)) \
printk(KERN_ERR format); \
unlikely(__ret_warn_on); \
})
#endif
#define NVQ_WARN(fmt, ...) \
do { \
if (in_interrupt()) { \
WARN(1, "nv_kthread_q: [in interrupt]: " fmt, \
##__VA_ARGS__); \
} \
else { \
WARN(1, "nv_kthread_q: task: %s: " fmt, \
current->comm, \
##__VA_ARGS__); \
} \
} while (0)
static int _main_loop(void *args)
{
nv_kthread_q_t *q = (nv_kthread_q_t *)args;
nv_kthread_q_item_t *q_item = NULL;
unsigned long flags;
while (1) {
// Normally this thread is never interrupted. However,
// down_interruptible (instead of down) is called here,
// in order to avoid being classified as a potentially
// hung task, by the kernel watchdog.
while (down_interruptible(&q->q_sem))
NVQ_WARN("Interrupted during semaphore wait\n");
if (atomic_read(&q->main_loop_should_exit))
break;
spin_lock_irqsave(&q->q_lock, flags);
// The q_sem semaphore prevents us from getting here unless there is
// at least one item in the list, so an empty list indicates a bug.
if (unlikely(list_empty(&q->q_list_head))) {
spin_unlock_irqrestore(&q->q_lock, flags);
NVQ_WARN("_main_loop: Empty queue: q: 0x%p\n", q);
continue;
}
// Consume one item from the queue
q_item = list_first_entry(&q->q_list_head,
nv_kthread_q_item_t,
q_list_node);
list_del_init(&q_item->q_list_node);
spin_unlock_irqrestore(&q->q_lock, flags);
// Run the item
q_item->function_to_run(q_item->function_args);
// Make debugging a little simpler by clearing this between runs:
q_item = NULL;
}
while (!kthread_should_stop())
schedule();
return 0;
}
void nv_kthread_q_stop(nv_kthread_q_t *q)
{
// check if queue has been properly initialized
if (unlikely(!q->q_kthread))
return;
nv_kthread_q_flush(q);
// If this assertion fires, then a caller likely either broke the API rules,
// by adding items after calling nv_kthread_q_stop, or possibly messed up
// with inadequate flushing of self-rescheduling q_items.
if (unlikely(!list_empty(&q->q_list_head)))
NVQ_WARN("list not empty after flushing\n");
if (likely(!atomic_read(&q->main_loop_should_exit))) {
atomic_set(&q->main_loop_should_exit, 1);
// Wake up the kthread so that it can see that it needs to stop:
up(&q->q_sem);
kthread_stop(q->q_kthread);
q->q_kthread = NULL;
}
}
// When CONFIG_VMAP_STACK is defined, the kernel thread stack allocator used by
// kthread_create_on_node relies on a 2 entry, per-core cache to minimize
// vmalloc invocations. The cache is NUMA-unaware, so when there is a hit, the
// stack location ends up being a function of the core assigned to the current
// thread, instead of being a function of the specified NUMA node. The cache was
// added to the kernel in commit ac496bf48d97f2503eaa353996a4dd5e4383eaf0
// ("fork: Optimize task creation by caching two thread stacks per CPU if
// CONFIG_VMAP_STACK=y")
//
// To work around the problematic cache, we create up to three kernel threads
// -If the first thread's stack is resident on the preferred node, return this
// thread.
// -Otherwise, create a second thread. If its stack is resident on the
// preferred node, stop the first thread and return this one.
// -Otherwise, create a third thread. The stack allocator does not find a
// cached stack, and so falls back to vmalloc, which takes the NUMA hint into
// consideration. The first two threads are then stopped.
//
// When CONFIG_VMAP_STACK is not defined, the first kernel thread is returned.
//
// This function is never invoked when there is no NUMA preference (preferred
// node is NUMA_NO_NODE).
static struct task_struct *thread_create_on_node(int (*threadfn)(void *data),
nv_kthread_q_t *q,
int preferred_node,
const char *q_name)
{
unsigned i, j;
static const unsigned attempts = 3;
struct task_struct *thread[3];
for (i = 0;; i++) {
struct page *stack;
thread[i] = kthread_create_on_node(threadfn, q, preferred_node, q_name);
if (unlikely(IS_ERR(thread[i]))) {
// Instead of failing, pick the previous thread, even if its
// stack is not allocated on the preferred node.
if (i > 0)
i--;
break;
}
// vmalloc is not used to allocate the stack, so simply return the
// thread, even if its stack may not be allocated on the preferred node
if (!is_vmalloc_addr(thread[i]->stack))
break;
// Ran out of attempts - return thread even if its stack may not be
// allocated on the preferred node
if ((i == (attempts - 1)))
break;
// Get the NUMA node where the first page of the stack is resident. If
// it is the preferred node, select this thread.
stack = vmalloc_to_page(thread[i]->stack);
if (page_to_nid(stack) == preferred_node)
break;
}
for (j = i; j > 0; j--)
kthread_stop(thread[j - 1]);
return thread[i];
}
int nv_kthread_q_init_on_node(nv_kthread_q_t *q, const char *q_name, int preferred_node)
{
memset(q, 0, sizeof(*q));
INIT_LIST_HEAD(&q->q_list_head);
spin_lock_init(&q->q_lock);
sema_init(&q->q_sem, 0);
if (preferred_node == NV_KTHREAD_NO_NODE) {
q->q_kthread = kthread_create(_main_loop, q, q_name);
}
else {
q->q_kthread = thread_create_on_node(_main_loop, q, preferred_node, q_name);
}
if (IS_ERR(q->q_kthread)) {
int err = PTR_ERR(q->q_kthread);
// Clear q_kthread before returning so that nv_kthread_q_stop() can be
// safely called on it making error handling easier.
q->q_kthread = NULL;
return err;
}
wake_up_process(q->q_kthread);
return 0;
}
// Returns true (non-zero) if the item was actually scheduled, and false if the
// item was already pending in a queue.
static int _raw_q_schedule(nv_kthread_q_t *q, nv_kthread_q_item_t *q_item)
{
unsigned long flags;
int ret = 1;
spin_lock_irqsave(&q->q_lock, flags);
if (likely(list_empty(&q_item->q_list_node)))
list_add_tail(&q_item->q_list_node, &q->q_list_head);
else
ret = 0;
spin_unlock_irqrestore(&q->q_lock, flags);
if (likely(ret))
up(&q->q_sem);
return ret;
}
void nv_kthread_q_item_init(nv_kthread_q_item_t *q_item,
nv_q_func_t function_to_run,
void *function_args)
{
INIT_LIST_HEAD(&q_item->q_list_node);
q_item->function_to_run = function_to_run;
q_item->function_args = function_args;
}
// Returns true (non-zero) if the q_item got scheduled, false otherwise.
int nv_kthread_q_schedule_q_item(nv_kthread_q_t *q,
nv_kthread_q_item_t *q_item)
{
if (unlikely(atomic_read(&q->main_loop_should_exit))) {
NVQ_WARN("Not allowed: nv_kthread_q_schedule_q_item was "
"called with a non-alive q: 0x%p\n", q);
return 0;
}
return _raw_q_schedule(q, q_item);
}
static void _q_flush_function(void *args)
{
struct completion *completion = (struct completion *)args;
complete(completion);
}
static void _raw_q_flush(nv_kthread_q_t *q)
{
nv_kthread_q_item_t q_item;
DECLARE_COMPLETION_ONSTACK(completion);
nv_kthread_q_item_init(&q_item, _q_flush_function, &completion);
_raw_q_schedule(q, &q_item);
// Wait for the flush item to run. Once it has run, then all of the
// previously queued items in front of it will have run, so that means
// the flush is complete.
wait_for_completion(&completion);
}
void nv_kthread_q_flush(nv_kthread_q_t *q)
{
if (unlikely(atomic_read(&q->main_loop_should_exit))) {
NVQ_WARN("Not allowed: nv_kthread_q_flush was called after "
"nv_kthread_q_stop. q: 0x%p\n", q);
return;
}
// This 2x flush is not a typing mistake. The queue really does have to be
// flushed twice, in order to take care of the case of a q_item that
// reschedules itself.
_raw_q_flush(q);
_raw_q_flush(q);
}
|