// Copyright 2022 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package cgroupfs
import (
"bytes"
"fmt"
"strings"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/usermem"
)
const (
	// pidMaxLimit is the maximum number of pids allowed on a 64-bit system.
	// The practical limit is much lower. See Linux, include/linux/threads.h.
	pidMaxLimit = 4 * 1024 * 1024

	// pidLimitUnlimited is a sentinel above pidMaxLimit representing "no
	// limit" (written back to userspace as "max").
	pidLimitUnlimited = pidMaxLimit + 1
)
// pidsController tracks how many pids are used by tasks in a cgroup. This is
// used to limit the number of tasks per cgroup. The limit is enforced only when
// new tasks are created via Fork/Clone. Task migrations and limit changes can
// cause the current number of pids to exceed the limit.
//
// A task can charge a PIDs cgroup in two ways:
//
// 1. A task created prior to the PIDs controller being enabled, or created
// through kernel.CreateProcess (i.e. not from userspace) directly adds
// committed charges via the Enter method.
//
// 2. A task created through Task.Clone (i.e. userspace fork/clone) first adds a
// pending charge through the Charge method. This is a temporary reservation
// which ensures the cgroup has enough space to allow the task to start. Once
// the task startup succeeds, it calls Enter and consumes the reservation.
//
// +stateify savable
type pidsController struct {
	controllerCommon

	// isRoot indicates if this is the root cgroup in its hierarchy. Immutable
	// since cgroupfs doesn't allow cross directory renames.
	isRoot bool

	// mu protects the fields below.
	mu pidsControllerMutex `state:"nosave"`

	// pendingTotal and pendingPool track the charge for processes starting
	// up. During startup, we check if PIDs are available by charging the
	// cgroup. However, the process actually joins the cgroup at a later point
	// via Enter. We keep a count of the charges we allocated via Charge, and
	// use this pool to account for already accounted charges from Enter.
	//
	// We also track which task owns the pending charge so we can cancel the
	// charge if a task creation fails after the Charge call.
	//
	// pendingTotal and pendingPool are both protected by mu.
	pendingTotal int64
	pendingPool  map[*kernel.Task]int64

	// committed represents charges for tasks that have already started and
	// called Enter. Protected by mu.
	committed int64

	// max is the PID limit for this cgroup. Protected by mu.
	max int64
}

// Assert that pidsController satisfies the controller interface at compile
// time.
var _ controller = (*pidsController)(nil)
// newRootPIDsController creates the root node for a PIDs cgroup. Child
// directories should be created through Clone.
func newRootPIDsController(fs *filesystem) *pidsController {
	root := &pidsController{
		isRoot:      true,
		max:         pidLimitUnlimited,
		pendingPool: make(map[*kernel.Task]int64),
	}
	root.controllerCommon.init(kernel.CgroupControllerPIDs, fs)
	return root
}
// Clone implements controller.Clone.
//
// The child starts as a non-root cgroup with no limit and an empty pending
// pool; charges are never inherited from the parent.
func (c *pidsController) Clone() controller {
	c.mu.Lock()
	defer c.mu.Unlock()
	child := &pidsController{
		isRoot:      false,
		max:         pidLimitUnlimited,
		pendingPool: make(map[*kernel.Task]int64),
	}
	child.controllerCommon.cloneFromParent(c)
	return child
}
// AddControlFiles implements controller.AddControlFiles.
//
// All cgroups expose "pids.current"; only non-root cgroups additionally expose
// the writable "pids.max".
func (c *pidsController) AddControlFiles(ctx context.Context, creds *auth.Credentials, _ *cgroupInode, contents map[string]kernfs.Inode) {
	contents["pids.current"] = c.fs.newControllerFile(ctx, creds, &pidsCurrentData{c: c}, true)
	if c.isRoot {
		// "This is not available in the root cgroup for obvious reasons" --
		// Linux, Documentation/cgroup-v1/pids.txt.
		return
	}
	contents["pids.max"] = c.fs.newControllerWritableFile(ctx, creds, &pidsMaxData{c: c}, true)
}
// Enter implements controller.Enter.
//
// Enter attempts to commit a charge from the pending pool. If at least one
// charge is pending for t, one pending charge is converted to a committed
// charge, and the net change in total charges is zero. If no charge is pending,
// a new charge is added directly to the committed pool.
func (c *pidsController) Enter(t *kernel.Task) {
	c.mu.Lock()
	defer c.mu.Unlock()

	if pending, ok := c.pendingPool[t]; ok {
		if pending == 1 {
			delete(c.pendingPool, t)
		} else {
			c.pendingPool[t] = pending - 1
		}
		c.pendingTotal--
		if c.pendingTotal < 0 {
			// Bug fix: this panic reports the pending charge, so print
			// pendingTotal; previously it misleadingly printed committed.
			panic(fmt.Sprintf("cgroupfs: pids controller has negative pending charge: %v\n", c.pendingTotal))
		}
	}
	// Either we're converting a pending charge from above, or generating a new
	// committed charge directly here. Either way, we don't enforce the limit on
	// Enter.
	c.committed++
}
// Leave implements controller.Leave.
//
// Leave releases one committed charge. An underflow indicates unbalanced
// Enter/Leave accounting and is a programming error, so it panics.
func (c *pidsController) Leave(t *kernel.Task) {
	c.mu.Lock()
	defer c.mu.Unlock()

	if c.committed <= 0 {
		panic(fmt.Sprintf("cgroupfs: pids controller committed charge underflow on Leave for task %+v", t))
	}
	c.committed--
}
// PrepareMigrate implements controller.PrepareMigrate.
//
// A task with an outstanding pending charge hasn't finished starting up, so
// its migration is refused with a transient error.
func (c *pidsController) PrepareMigrate(t *kernel.Task, src controller) error {
	srcC := src.(*pidsController)
	srcC.mu.Lock()
	defer srcC.mu.Unlock()

	if _, stillPending := srcC.pendingPool[t]; !stillPending {
		return nil
	}
	// Migrating task isn't fully initialized, return transient failure.
	return linuxerr.EAGAIN
}
// CommitMigrate implements controller.CommitMigrate.
//
// Migrations can cause a cgroup to exceed its limit. CommitMigrate can only be
// called for tasks with committed charges, PrepareMigrate will deny migrations
// prior to Enter.
func (c *pidsController) CommitMigrate(t *kernel.Task, src controller) {
	srcC := src.(*pidsController)

	// Note: The charge is allowed to exceed max on migration. The charge may
	// not exceed max when incurred due to a fork/clone, which will call
	// pidsController.Charge().
	c.mu.Lock()
	c.committed++
	c.mu.Unlock()

	// Transfer the charge out of the source cgroup.
	srcC.mu.Lock()
	defer srcC.mu.Unlock()
	if srcC.committed <= 0 {
		panic(fmt.Sprintf("cgroupfs: pids controller committed charge underflow on CommitMigrate for task %+v on the source cgroup", t))
	}
	srcC.committed--
}
// AbortMigrate implements controller.AbortMigrate.
//
// No-op: PrepareMigrate only inspects the source's pending pool under its lock
// (released via defer), so there is no state to roll back on abort.
func (c *pidsController) AbortMigrate(t *kernel.Task, src controller) {}
// Charge implements controller.Charge. This manipulates the pending pool.
// Charges are committed from the pending pool by Enter. The caller is
// responsible for ensuring negative charges correspond to previous positive
// charges. Negative charges that cause an underflow result in a panic.
func (c *pidsController) Charge(t *kernel.Task, d *kernfs.Dentry, res kernel.CgroupResourceType, value int64) error {
	if res != kernel.CgroupResourcePID {
		panic(fmt.Sprintf("cgroupfs: pids controller invalid resource type %v", res))
	}

	c.mu.Lock()
	defer c.mu.Unlock()

	if value >= 0 {
		// Positive charge: deny if it would push the total over the limit.
		if c.committed+c.pendingTotal+value > c.max {
			log.Debugf("cgroupfs: pids controller charge denied due to limit: path: %q, requested: %d, current: %d (pending: %v, committed: %v), max: %v",
				d.FSLocalPath(), value, c.committed+c.pendingTotal, c.pendingTotal, c.committed, c.max)
			return linuxerr.EAGAIN
		}
	} else {
		// Negative charge: validate all underflow invariants before mutating.
		if c.pendingTotal+value < 0 {
			panic(fmt.Sprintf("cgroupfs: pids controller pending pool would be negative if charge was allowed: current pool: %d, proposed charge: %d, path: %q, task: %p", c.pendingTotal, value, d.FSLocalPath(), t))
		}
		pending, ok := c.pendingPool[t]
		if !ok {
			panic(fmt.Sprintf("cgroupfs: pids controller attempted to remove pending charge for Task %p, but task didn't have pending charges, path: %q", t, d.FSLocalPath()))
		}
		if pending+value < 0 {
			panic(fmt.Sprintf("cgroupfs: pids controller attempted to remove pending charge for Task %p, but task didn't have enough pending charges; current charges: %d, proposed charge: %d, path: %q", t, pending, value, d.FSLocalPath()))
		}
	}

	c.pendingPool[t] += value
	c.pendingTotal += value
	return nil
}
// pidsCurrentData backs the read-only "pids.current" control file, reporting
// the cgroup's total charge (pending + committed).
//
// +stateify savable
type pidsCurrentData struct {
	// c is the controller whose charges this file reports.
	c *pidsController
}
// Generate implements vfs.DynamicBytesSource.Generate.
//
// Emits the cgroup's total charge (pending + committed) as a decimal line.
func (d *pidsCurrentData) Generate(ctx context.Context, buf *bytes.Buffer) error {
	d.c.mu.Lock()
	total := d.c.committed + d.c.pendingTotal
	d.c.mu.Unlock()
	fmt.Fprintf(buf, "%d\n", total)
	return nil
}
// pidsMaxData backs the writable "pids.max" control file, which reads and sets
// the cgroup's pid limit.
//
// +stateify savable
type pidsMaxData struct {
	// c is the controller whose limit this file reads and writes.
	c *pidsController
}
// Generate implements vfs.DynamicBytesSource.Generate.
//
// Prints "max" when the limit is the unlimited sentinel, otherwise the decimal
// limit.
func (d *pidsMaxData) Generate(ctx context.Context, buf *bytes.Buffer) error {
	d.c.mu.Lock()
	limit := d.c.max
	d.c.mu.Unlock()

	if limit > pidMaxLimit {
		buf.WriteString("max\n")
		return nil
	}
	fmt.Fprintf(buf, "%d\n", limit)
	return nil
}
// Write implements vfs.WritableDynamicBytesSource.Write.
//
// The offset is ignored; writes always replace the whole limit value.
func (d *pidsMaxData) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) {
	return d.WriteBackground(ctx, src)
}
// WriteBackground implements writableControllerFileImpl.WriteBackground.
//
// Accepts either the literal "max" (meaning unlimited) or a decimal integer in
// [0, pidMaxLimit]; anything else fails with EINVAL.
func (d *pidsMaxData) WriteBackground(ctx context.Context, src usermem.IOSequence) (int64, error) {
	buf := copyScratchBufferFromContext(ctx, hostarch.PageSize)
	ncpy, err := src.CopyIn(ctx, buf)
	if err != nil {
		return 0, err
	}
	// Bug fix: only the ncpy bytes actually copied in are valid input. The
	// remainder of the scratch buffer may hold stale data, so comparing
	// against the whole buffer could break the "max" match.
	if strings.TrimSpace(string(buf[:ncpy])) == "max" {
		d.c.mu.Lock()
		defer d.c.mu.Unlock()
		d.c.max = pidLimitUnlimited
		return int64(ncpy), nil
	}

	val, n, err := parseInt64FromString(ctx, src)
	if err != nil {
		return 0, linuxerr.EINVAL
	}
	if val < 0 || val > pidMaxLimit {
		return 0, linuxerr.EINVAL
	}
	d.c.mu.Lock()
	defer d.c.mu.Unlock()
	d.c.max = val
	return int64(n), nil
}