1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311
|
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */
#include <linux/ns_common.h>
#include <linux/nstree.h>
#include <linux/proc_ns.h>
#include <linux/user_namespace.h>
#include <linux/vfsdebug.h>
#ifdef CONFIG_DEBUG_VFS
/*
 * Debug-only consistency check between a namespace's type and its ops.
 *
 * For every namespace type handled below, emit a VFS_WARN_ON_ONCE() when
 * @ops is not the proc_ns_operations table belonging to @ns->ns_type.
 * Types without a matching case are not checked.
 */
static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops)
{
	switch (ns->ns_type) {
	/* The mount namespace case is the only one without a config guard. */
	case CLONE_NEWNS:
		VFS_WARN_ON_ONCE(ops != &mntns_operations);
		return;
#ifdef CONFIG_CGROUPS
	case CLONE_NEWCGROUP:
		VFS_WARN_ON_ONCE(ops != &cgroupns_operations);
		return;
#endif
#ifdef CONFIG_IPC_NS
	case CLONE_NEWIPC:
		VFS_WARN_ON_ONCE(ops != &ipcns_operations);
		return;
#endif
#ifdef CONFIG_NET_NS
	case CLONE_NEWNET:
		VFS_WARN_ON_ONCE(ops != &netns_operations);
		return;
#endif
#ifdef CONFIG_PID_NS
	case CLONE_NEWPID:
		VFS_WARN_ON_ONCE(ops != &pidns_operations);
		return;
#endif
#ifdef CONFIG_TIME_NS
	case CLONE_NEWTIME:
		VFS_WARN_ON_ONCE(ops != &timens_operations);
		return;
#endif
#ifdef CONFIG_USER_NS
	case CLONE_NEWUSER:
		VFS_WARN_ON_ONCE(ops != &userns_operations);
		return;
#endif
#ifdef CONFIG_UTS_NS
	case CLONE_NEWUTS:
		VFS_WARN_ON_ONCE(ops != &utsns_operations);
		return;
#endif
	default:
		/* Unknown or compiled-out namespace type: nothing to verify. */
		return;
	}
}
#endif
/*
 * Initialize the common part of a namespace.
 *
 * @ns:      namespace to set up
 * @ns_type: value stored in @ns->ns_type
 * @ops:     operations table stored in @ns->ops
 * @inum:    inode number to use; when 0, one is requested from
 *           proc_alloc_inum()
 *
 * The passive reference count starts at 1, the namespace id at 0, and all
 * tree nodes and the owner root are initialized.
 *
 * Return: 0 on success, otherwise the error returned by proc_alloc_inum().
 */
int __ns_common_init(struct ns_common *ns, u32 ns_type,
		     const struct proc_ns_operations *ops, int inum)
{
	refcount_set(&ns->__ns_ref, 1);
	ns->ns_type = ns_type;
	ns->ns_id = 0;
	ns->ops = ops;
	ns->stashed = NULL;

	ns_tree_node_init(&ns->ns_tree_node);
	ns_tree_node_init(&ns->ns_unified_node);
	ns_tree_node_init(&ns->ns_owner_node);
	ns_tree_root_init(&ns->ns_owner_root);

#ifdef CONFIG_DEBUG_VFS
	ns_debug(ns, ops);
#endif

	if (!inum) {
		int err = proc_alloc_inum(&ns->inum);

		if (err)
			return err;
	} else {
		ns->inum = inum;
	}

	/*
	 * The active ("tree") reference count starts out at 0. It is bumped
	 * once the namespace enters active use (installed in nsproxy) and
	 * dropped again when all active uses are gone. Initial namespaces
	 * are always active and therefore start at 1.
	 */
	atomic_set(&ns->__ns_ref_active, is_ns_init_inum(ns) ? 1 : 0);

	return 0;
}
/*
 * Release the inode number held in @ns->inum by handing it to
 * proc_free_inum().
 *
 * NOTE(review): __ns_common_init() can store a caller-supplied inum instead
 * of one obtained from proc_alloc_inum(); this helper passes either kind to
 * proc_free_inum() unconditionally -- confirm that is intended.
 */
void __ns_common_free(struct ns_common *ns)
{
proc_free_inum(ns->inum);
}
/*
 * Look up the owning user namespace of @ns.
 *
 * Return: the owner as a struct ns_common, or NULL when @ns has no ops,
 * when the ->owner() callback reports no owner, or when the owner is
 * init_user_ns.
 */
struct ns_common *__must_check ns_owner(struct ns_common *ns)
{
	struct user_namespace *user_ns;

	if (unlikely(!ns->ops))
		return NULL;

	VFS_WARN_ON_ONCE(!ns->ops->owner);
	user_ns = ns->ops->owner(ns);

	/* Only init_user_ns itself is expected to have no owner. */
	VFS_WARN_ON_ONCE(!user_ns && ns != to_ns_common(&init_user_ns));

	/* init_user_ns is always active, so it is skipped like "no owner". */
	if (!user_ns || user_ns == &init_user_ns)
		return NULL;

	return to_ns_common(user_ns);
}
/*
* The active reference count works by having each namespace that gets
* created take a single active reference on its owning user namespace.
* That single reference is only released once the child namespace's
* active count itself goes down.
*
* A regular namespace tree might look as follows:
* Legend:
* + : adding active reference
* - : dropping active reference
* x : always active (initial namespace)
*
*
* net_ns pid_ns
* \ /
* + +
* user_ns1 (2)
* |
* ipc_ns | uts_ns
* \ | /
* + + +
* user_ns2 (3)
* |
* cgroup_ns | mnt_ns
* \ | /
* x x x
* init_user_ns (1)
*
* If both net_ns and pid_ns put their last active reference on
* themselves it will cascade to user_ns1 dropping its own active
* reference and dropping one active reference on user_ns2:
*
* net_ns pid_ns
* \ /
* - -
* user_ns1 (0)
* |
* ipc_ns | uts_ns
* \ | /
* + - +
* user_ns2 (2)
* |
* cgroup_ns | mnt_ns
* \ | /
* x x x
* init_user_ns (1)
*
* The iteration stops once we reach a namespace that still has active
* references.
*/
/*
 * Drop one active reference on @ns.
 *
 * When this was the last active reference, the reference that @ns held on
 * its owning user namespace is dropped as well, and so on up the ownership
 * chain until a namespace keeps a non-zero active count or ns_owner()
 * returns NULL.
 */
void __ns_ref_active_put(struct ns_common *ns)
{
	/* Initial namespaces are always active. */
	if (is_ns_init_id(ns))
		return;

	/* Someone else still holds an active reference: nothing to cascade. */
	if (!atomic_dec_and_test(&ns->__ns_ref_active)) {
		VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0);
		return;
	}

	VFS_WARN_ON_ONCE(is_ns_init_id(ns));
	VFS_WARN_ON_ONCE(!__ns_ref_read(ns));

	/* Walk up the owners, releasing one active reference per level. */
	for (ns = ns_owner(ns); ns; ns = ns_owner(ns)) {
		VFS_WARN_ON_ONCE(is_ns_init_id(ns));

		if (atomic_dec_and_test(&ns->__ns_ref_active))
			continue;

		/* This owner is still active, so the cascade ends here. */
		VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0);
		return;
	}
}
/*
* The active reference count works by having each namespace that gets
* created take a single active reference on its owning user namespace.
* That single reference is only released once the child namespace's
* active count itself goes down. This makes it possible to efficiently
* resurrect a namespace tree:
*
* A regular namespace tree might look as follows:
* Legend:
* + : adding active reference
* - : dropping active reference
* x : always active (initial namespace)
*
*
* net_ns pid_ns
* \ /
* + +
* user_ns1 (2)
* |
* ipc_ns | uts_ns
* \ | /
* + + +
* user_ns2 (3)
* |
* cgroup_ns | mnt_ns
* \ | /
* x x x
* init_user_ns (1)
*
* If both net_ns and pid_ns put their last active reference on
* themselves it will cascade to user_ns1 dropping its own active
* reference and dropping one active reference on user_ns2:
*
* net_ns pid_ns
* \ /
* - -
* user_ns1 (0)
* |
* ipc_ns | uts_ns
* \ | /
* + - +
* user_ns2 (2)
* |
* cgroup_ns | mnt_ns
* \ | /
* x x x
* init_user_ns (1)
*
* Assume the whole tree is dead but all namespaces are still active:
*
* net_ns pid_ns
* \ /
* - -
* user_ns1 (0)
* |
* ipc_ns | uts_ns
* \ | /
* - - -
* user_ns2 (0)
* |
* cgroup_ns | mnt_ns
* \ | /
* x x x
* init_user_ns (1)
*
* Now assume the net_ns gets resurrected (e.g., via the SIOCGSKNS ioctl()):
*
* net_ns pid_ns
* \ /
* + -
* user_ns1 (1)
* |
* ipc_ns | uts_ns
* \ | /
* - + -
* user_ns2 (1)
* |
* cgroup_ns | mnt_ns
* \ | /
* x x x
* init_user_ns (1)
*
* If net_ns had a zero reference count and we bumped it we also need to
* take another reference on its owning user namespace. Similarly, if
* pid_ns had a zero reference count it also needs to take another
* reference on its owning user namespace. So both net_ns and pid_ns
* will each have their own reference on the owning user namespace.
*
* If the owning user namespace user_ns1 had a zero reference count then
* it also needs to take another reference on its owning user namespace
* and so on.
*/
void __ns_ref_active_get(struct ns_common *ns)
{
int prev;
/* Initial namespaces are always active. */
if (is_ns_init_id(ns))
return;
/* If we didn't resurrect the namespace we're done. */
prev = atomic_fetch_add(1, &ns->__ns_ref_active);
VFS_WARN_ON_ONCE(prev < 0);
if (likely(prev))
return;
/*
* We did resurrect it. Walk the ownership hierarchy upwards
* until we found an owning user namespace that is active.
*/
for (;;) {
ns = ns_owner(ns);
if (!ns)
return;
VFS_WARN_ON_ONCE(is_ns_init_id(ns));
prev = atomic_fetch_add(1, &ns->__ns_ref_active);
VFS_WARN_ON_ONCE(prev < 0);
if (likely(prev))
return;
}
}
|