1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835
|
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package kernel
import (
gocontext "context"
"runtime/trace"
"sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/atomicbitops"
"gvisor.dev/gvisor/pkg/bpf"
"gvisor.dev/gvisor/pkg/errors/linuxerr"
"gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/metric"
"gvisor.dev/gvisor/pkg/sentry/inet"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sentry/usage"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/waiter"
)
// Task represents a thread of execution in the untrusted app. It
// includes registers and any thread-specific state that you would
// normally expect.
//
// Each task is associated with a goroutine, called the task goroutine, that
// executes code (application code, system calls, etc.) on behalf of that task.
// See Task.run (task_run.go).
//
// All fields that are "owned by the task goroutine" can only be mutated by the
// task goroutine while it is running. The task goroutine does not require
// synchronization to read these fields, although it still requires
// synchronization as described for those fields to mutate them.
//
// All fields that are "exclusive to the task goroutine" can only be accessed
// by the task goroutine while it is running. The task goroutine does not
// require synchronization to read or write these fields.
//
// +stateify savable
type Task struct {
taskNode
// goid is the task goroutine's ID. goid is owned by the task goroutine,
// but since it's used to detect cases where non-task goroutines
// incorrectly access state owned by, or exclusive to, the task goroutine,
// goid is always accessed using atomic memory operations.
goid atomicbitops.Int64 `state:"nosave"`
// runState is what the task goroutine is executing if it is not stopped.
// If runState is nil, the task goroutine should exit or has exited.
// runState is exclusive to the task goroutine.
runState taskRunState
// taskWorkCount represents the current size of the task work queue. It is
// used to avoid acquiring taskWorkMu when the queue is empty.
taskWorkCount atomicbitops.Int32
// taskWorkMu protects taskWork.
taskWorkMu sync.Mutex `state:"nosave"`
// taskWork is a queue of work to be executed before resuming user execution.
// It is similar to the task_work mechanism in Linux.
//
// taskWork is exclusive to the task goroutine.
taskWork []TaskWorker
// haveSyscallReturn is true if image.Arch().Return() represents a value
// returned by a syscall (or set by ptrace after a syscall).
//
// haveSyscallReturn is exclusive to the task goroutine.
haveSyscallReturn bool
// interruptChan is notified whenever the task goroutine is interrupted
// (usually by a pending signal). interruptChan is effectively a condition
// variable that can be used in select statements.
//
// interruptChan is not saved; because saving interrupts all tasks,
// interruptChan is always notified after restore (see Task.run).
interruptChan chan struct{} `state:"nosave"`
// gosched contains the current scheduling state of the task goroutine.
//
// gosched is protected by goschedSeq. gosched is owned by the task
// goroutine.
goschedSeq sync.SeqCount `state:"nosave"`
gosched TaskGoroutineSchedInfo
// yieldCount is the number of times the task goroutine has called
// Task.InterruptibleSleepStart, Task.UninterruptibleSleepStart, or
// Task.Yield(), voluntarily ceasing execution.
//
// yieldCount is accessed using atomic memory operations. yieldCount is
// owned by the task goroutine.
yieldCount atomicbitops.Uint64
// pendingSignals is the set of pending signals that may be handled only by
// this task.
//
// pendingSignals is protected by (taskNode.)tg.signalHandlers.mu
// (hereafter "the signal mutex"); see comment on
// ThreadGroup.signalHandlers.
pendingSignals pendingSignals
// signalMask is the set of signals whose delivery is currently blocked.
//
// signalMask is accessed using atomic memory operations, and is protected
// by the signal mutex (such that reading signalMask is safe if either the
// signal mutex is locked or if atomic memory operations are used, while
// writing signalMask requires both). signalMask is owned by the task
// goroutine.
signalMask atomicbitops.Uint64
// If the task goroutine is currently executing Task.sigtimedwait,
// realSignalMask is the previous value of signalMask, which has temporarily
// been replaced by Task.sigtimedwait. Otherwise, realSignalMask is 0.
//
// realSignalMask is exclusive to the task goroutine.
realSignalMask linux.SignalSet
// If haveSavedSignalMask is true, savedSignalMask is the signal mask that
// should be applied after the task has either delivered one signal to a
// user handler or is about to resume execution in the untrusted
// application.
//
// Both haveSavedSignalMask and savedSignalMask are exclusive to the task
// goroutine.
haveSavedSignalMask bool
savedSignalMask linux.SignalSet
// signalStack is the alternate signal stack used by signal handlers for
// which the SA_ONSTACK flag is set.
//
// signalStack is exclusive to the task goroutine.
signalStack linux.SignalStack
// signalQueue is a set of registered waiters for signal-related events.
//
// signalQueue is protected by the signalMutex. Note that the task does
// not implement all queue methods, specifically the readiness checks.
// The task only broadcast a notification on signal delivery.
signalQueue waiter.Queue
// If groupStopPending is true, the task should participate in a group
// stop in the interrupt path.
//
// groupStopPending is analogous to JOBCTL_STOP_PENDING in Linux.
//
// groupStopPending is protected by the signal mutex.
groupStopPending bool
// If groupStopAcknowledged is true, the task has already acknowledged that
// it is entering the most recent group stop that has been initiated on its
// thread group.
//
// groupStopAcknowledged is analogous to !JOBCTL_STOP_CONSUME in Linux.
//
// groupStopAcknowledged is protected by the signal mutex.
groupStopAcknowledged bool
// If trapStopPending is true, the task goroutine should enter a
// PTRACE_INTERRUPT-induced stop from the interrupt path.
//
// trapStopPending is analogous to JOBCTL_TRAP_STOP in Linux, except that
// Linux also sets JOBCTL_TRAP_STOP when a ptraced task detects
// JOBCTL_STOP_PENDING.
//
// trapStopPending is protected by the signal mutex.
trapStopPending bool
// If trapNotifyPending is true, this task is PTRACE_SEIZEd, and a group
// stop has begun or ended since the last time the task entered a
// ptrace-stop from the group-stop path.
//
// trapNotifyPending is analogous to JOBCTL_TRAP_NOTIFY in Linux.
//
// trapNotifyPending is protected by the signal mutex.
trapNotifyPending bool
// If stop is not nil, it is the internally-initiated condition that
// currently prevents the task goroutine from running.
//
// stop is protected by the signal mutex.
stop TaskStop
// stopCount is the number of active external stops (calls to
// Task.BeginExternalStop that have not been paired with a call to
// Task.EndExternalStop), plus 1 if stop is not nil. Hence stopCount is
// non-zero if the task goroutine should stop.
//
// Mutating stopCount requires both locking the signal mutex and using
// atomic memory operations. Reading stopCount requires either locking the
// signal mutex or using atomic memory operations. This allows Task.doStop
// to require only a single atomic read in the common case where stopCount
// is 0.
//
// stopCount is not saved, because external stops cannot be retained across
// a save/restore cycle. (Suppose a sentryctl command issues an external
// stop; after a save/restore cycle, the restored sentry has no knowledge
// of the pre-save sentryctl command, and the stopped task would remain
// stopped forever.)
stopCount atomicbitops.Int32 `state:"nosave"`
// endStopCond is signaled when stopCount transitions to 0. The combination
// of stopCount and endStopCond effectively form a sync.WaitGroup, but
// WaitGroup provides no way to read its counter value.
//
// Invariant: endStopCond.L is the signal mutex. (This is not racy because
// sync.Cond.Wait is the only user of sync.Cond.L; only the task goroutine
// calls sync.Cond.Wait; and only the task goroutine can change the
// identity of the signal mutex, in Task.finishExec.)
endStopCond sync.Cond `state:"nosave"`
// exitStatus is the task's exit status.
//
// exitStatus is protected by the signal mutex.
exitStatus linux.WaitStatus
// syscallRestartBlock represents a custom restart function to run in
// restart_syscall(2) to resume an interrupted syscall.
//
// syscallRestartBlock is exclusive to the task goroutine.
syscallRestartBlock SyscallRestartBlock
// p provides the mechanism by which the task runs code in userspace. The p
// interface object is immutable.
p platform.Context `state:"nosave"`
// k is the Kernel that this task belongs to. The k pointer is immutable.
k *Kernel
// containerID has no equivalent in Linux; it's used by runsc to track all
// tasks that belong to a given containers since cgroups aren't implemented.
// It's inherited by the children, is immutable, and may be empty.
//
// NOTE: cgroups can be used to track this when implemented.
containerID string
// mu protects some of the following fields.
mu taskMutex `state:"nosave"`
// image holds task data provided by the ELF loader.
//
// image is protected by mu, and is owned by the task goroutine.
image TaskImage
// fsContext is the task's filesystem context.
//
// fsContext is protected by mu, and is owned by the task goroutine.
fsContext *FSContext
// fdTable is the task's file descriptor table.
//
// fdTable is protected by mu, and is owned by the task goroutine.
fdTable *FDTable
// If vforkParent is not nil, it is the task that created this task with
// vfork() or clone(CLONE_VFORK), and should have its vforkStop ended when
// this TaskImage is released.
//
// vforkParent is protected by the TaskSet mutex.
vforkParent *Task
// exitState is the task's progress through the exit path.
//
// exitState is protected by the TaskSet mutex. exitState is owned by the
// task goroutine.
exitState TaskExitState
// exitTracerNotified is true if the exit path has either signaled the
// task's tracer to indicate the exit, or determined that no such signal is
// needed. exitTracerNotified can only be true if exitState is
// TaskExitZombie or TaskExitDead.
//
// exitTracerNotified is protected by the TaskSet mutex.
exitTracerNotified bool
// exitTracerAcked is true if exitTracerNotified is true and either the
// task's tracer has acknowledged the exit notification, or the exit path
// has determined that no such notification is needed.
//
// exitTracerAcked is protected by the TaskSet mutex.
exitTracerAcked bool
// exitParentNotified is true if the exit path has either signaled the
// task's parent to indicate the exit, or determined that no such signal is
// needed. exitParentNotified can only be true if exitState is
// TaskExitZombie or TaskExitDead.
//
// exitParentNotified is protected by the TaskSet mutex.
exitParentNotified bool
// exitParentAcked is true if exitParentNotified is true and either the
// task's parent has acknowledged the exit notification, or the exit path
// has determined that no such acknowledgment is needed.
//
// exitParentAcked is protected by the TaskSet mutex.
exitParentAcked bool
// goroutineStopped is a WaitGroup whose counter value is 1 when the task
// goroutine is running and 0 when the task goroutine is stopped or has
// exited.
goroutineStopped sync.WaitGroup `state:"nosave"`
// ptraceTracer is the task that is ptrace-attached to this one. If
// ptraceTracer is nil, this task is not being traced. Note that due to
// atomic.Value limitations (atomic.Value.Store(nil) panics), a nil
// ptraceTracer is always represented as a typed nil (i.e. (*Task)(nil)).
//
// ptraceTracer is protected by the TaskSet mutex, and accessed with atomic
// operations. This allows paths that wouldn't otherwise lock the TaskSet
// mutex, notably the syscall path, to check if ptraceTracer is nil without
// additional synchronization.
ptraceTracer atomic.Value `state:".(*Task)"`
// ptraceTracees is the set of tasks that this task is ptrace-attached to.
//
// ptraceTracees is protected by the TaskSet mutex.
ptraceTracees map[*Task]struct{}
// ptraceSeized is true if ptraceTracer attached to this task with
// PTRACE_SEIZE.
//
// ptraceSeized is protected by the TaskSet mutex.
ptraceSeized bool
// ptraceOpts contains ptrace options explicitly set by the tracer. If
// ptraceTracer is nil, ptraceOpts is expected to be the zero value.
//
// ptraceOpts is protected by the TaskSet mutex.
ptraceOpts ptraceOptions
// ptraceSyscallMode controls ptrace behavior around syscall entry and
// exit.
//
// ptraceSyscallMode is protected by the TaskSet mutex.
ptraceSyscallMode ptraceSyscallMode
// If ptraceSinglestep is true, the next time the task executes application
// code, single-stepping should be enabled. ptraceSinglestep is stored
// independently of the architecture-specific trap flag because tracer
// detaching (which can happen concurrently with the tracee's execution if
// the tracer exits) must disable single-stepping, and the task's
// architectural state is implicitly exclusive to the task goroutine (no
// synchronization occurs before passing registers to SwitchToApp).
//
// ptraceSinglestep is analogous to Linux's TIF_SINGLESTEP.
//
// ptraceSinglestep is protected by the TaskSet mutex.
ptraceSinglestep bool
// If t is ptrace-stopped, ptraceCode is a ptrace-defined value set at the
// time that t entered the ptrace stop, reset to 0 when the tracer
// acknowledges the stop with a wait*() syscall. Otherwise, it is the
// signal number passed to the ptrace operation that ended the last ptrace
// stop on this task. In the latter case, the effect of ptraceCode depends
// on the nature of the ptrace stop; signal-delivery-stop uses it to
// conditionally override ptraceSiginfo, syscall-entry/exit-stops send the
// signal to the task after leaving the stop, and PTRACE_EVENT stops and
// traced group stops ignore it entirely.
//
// Linux contextually stores the equivalent of ptraceCode in
// task_struct::exit_code.
//
// ptraceCode is protected by the TaskSet mutex.
ptraceCode int32
// ptraceSiginfo is the value returned to the tracer by
// ptrace(PTRACE_GETSIGINFO) and modified by ptrace(PTRACE_SETSIGINFO).
// (Despite the name, PTRACE_PEEKSIGINFO is completely unrelated.)
// ptraceSiginfo is nil if the task is in a ptraced group-stop (this is
// required for PTRACE_GETSIGINFO to return EINVAL during such stops, which
// is in turn required to distinguish group stops from other ptrace stops,
// per subsection "Group-stop" in ptrace(2)).
//
// ptraceSiginfo is analogous to Linux's task_struct::last_siginfo.
//
// ptraceSiginfo is protected by the TaskSet mutex.
ptraceSiginfo *linux.SignalInfo
// ptraceEventMsg is the value set by PTRACE_EVENT stops and returned to
// the tracer by ptrace(PTRACE_GETEVENTMSG).
//
// ptraceEventMsg is protected by the TaskSet mutex.
ptraceEventMsg uint64
// ptraceYAMAExceptionAdded is true if a YAMA exception involving the task has
// been added before. This is used during task exit to decide whether we need
// to clean up YAMA exceptions.
//
// ptraceYAMAExceptionAdded is protected by the TaskSet mutex.
ptraceYAMAExceptionAdded bool
// The struct that holds the IO-related usage. The ioUsage pointer is
// immutable.
ioUsage *usage.IO
// logPrefix is a string containing the task's thread ID in the root PID
// namespace, and is prepended to log messages emitted by Task.Infof etc.
logPrefix atomic.Value `state:"nosave"`
// traceContext and traceTask are both used for tracing, and are
// updated along with the logPrefix in updateInfoLocked.
//
// These are exclusive to the task goroutine.
traceContext gocontext.Context `state:"nosave"`
traceTask *trace.Task `state:"nosave"`
// creds is the task's credentials.
//
// creds.Load() may be called without synchronization. creds.Store() is
// serialized by mu. creds is owned by the task goroutine. All
// auth.Credentials objects that creds may point to, or have pointed to
// in the past, must be treated as immutable.
creds auth.AtomicPtrCredentials
// utsns is the task's UTS namespace.
//
// utsns is protected by mu. utsns is owned by the task goroutine.
utsns *UTSNamespace
// ipcns is the task's IPC namespace.
//
// ipcns is protected by mu. ipcns is owned by the task goroutine.
ipcns *IPCNamespace
// abstractSockets tracks abstract sockets that are in use.
//
// abstractSockets is protected by mu.
abstractSockets *AbstractSocketNamespace
// mountNamespace is the task's mount namespace.
//
// It is protected by mu. It is owned by the task goroutine.
mountNamespace *vfs.MountNamespace
// parentDeathSignal is sent to this task's thread group when its parent exits.
//
// parentDeathSignal is protected by mu.
parentDeathSignal linux.Signal
// syscallFilters is all seccomp-bpf syscall filters applicable to the
// task, in the order in which they were installed. The type of the atomic
// is []bpf.Program. Writing needs to be protected by the signal mutex.
//
// syscallFilters is owned by the task goroutine.
syscallFilters atomic.Value `state:".([]bpf.Program)"`
// If cleartid is non-zero, treat it as a pointer to a ThreadID in the
// task's virtual address space; when the task exits, set the pointed-to
// ThreadID to 0, and wake any futex waiters.
//
// cleartid is exclusive to the task goroutine.
cleartid hostarch.Addr
// This is mostly a fake cpumask just for sched_set/getaffinity as we
// don't really control the affinity.
//
// Invariant: allowedCPUMask.Size() ==
// sched.CPUMaskSize(Kernel.applicationCores).
//
// allowedCPUMask is protected by mu.
allowedCPUMask sched.CPUSet
// cpu is the fake cpu number returned by getcpu(2). cpu is ignored
// entirely if Kernel.useHostCores is true.
cpu atomicbitops.Int32
// This is used to keep track of changes made to a process' priority/niceness.
// It is mostly used to provide some reasonable return value from
// getpriority(2) after a call to setpriority(2) has been made.
// We currently do not actually modify a process' scheduling priority.
// NOTE: This represents the userspace view of priority (nice).
// This means that the value should be in the range [-20, 19].
//
// niceness is protected by mu.
niceness int
// This is used to track the numa policy for the current thread. This can be
// modified through a set_mempolicy(2) syscall. Since we always report a
// single numa node, all policies are no-ops. We only track this information
// so that we can return reasonable values if the application calls
// get_mempolicy(2) after setting a non-default policy. Note that in the
// real syscall, nodemask can be longer than a single unsigned long, but we
// always report a single node so never need to save more than a single
// bit.
//
// numaPolicy and numaNodeMask are protected by mu.
numaPolicy linux.NumaPolicy
numaNodeMask uint64
// netns is the task's network namespace. netns is never nil.
netns inet.NamespaceAtomicPtr
// If rseqPreempted is true, before the next call to p.Switch(),
// interrupt rseq critical regions as defined by rseqAddr and
// tg.oldRSeqCritical and write the task goroutine's CPU number to
// rseqAddr/oldRSeqCPUAddr.
//
// We support two ABIs for restartable sequences:
//
// 1. The upstream interface added in v4.18,
// 2. An "old" interface never merged upstream. In the implementation,
// this is referred to as "old rseq".
//
// rseqPreempted is exclusive to the task goroutine.
rseqPreempted bool `state:"nosave"`
// rseqCPU is the last CPU number written to rseqAddr/oldRSeqCPUAddr.
//
// If rseq is unused, rseqCPU is -1 for convenient use in
// platform.Context.Switch.
//
// rseqCPU is exclusive to the task goroutine.
rseqCPU int32
// oldRSeqCPUAddr is a pointer to the userspace old rseq CPU variable.
//
// oldRSeqCPUAddr is exclusive to the task goroutine.
oldRSeqCPUAddr hostarch.Addr
// rseqAddr is a pointer to the userspace linux.RSeq structure.
//
// rseqAddr is exclusive to the task goroutine.
rseqAddr hostarch.Addr
// rseqSignature is the signature that the rseq abort IP must be signed
// with.
//
// rseqSignature is exclusive to the task goroutine.
rseqSignature uint32
// copyScratchBuffer is a buffer available to CopyIn/CopyOut
// implementations that require an intermediate buffer to copy data
// into/out of. It prevents these buffers from being allocated/zeroed in
// each syscall and eventually garbage collected.
//
// copyScratchBuffer is exclusive to the task goroutine.
copyScratchBuffer [copyScratchBufferLen]byte `state:"nosave"`
// blockingTimer is used for blocking timeouts. blockingTimerChan is the
// channel that is sent to when blockingTimer fires.
//
// blockingTimer is exclusive to the task goroutine.
blockingTimer *ktime.Timer `state:"nosave"`
blockingTimerChan <-chan struct{} `state:"nosave"`
// futexWaiter is used for futex(FUTEX_WAIT) syscalls.
//
// futexWaiter is exclusive to the task goroutine.
futexWaiter *futex.Waiter `state:"nosave"`
// robustList is a pointer to the head of the tasks's robust futex
// list.
robustList hostarch.Addr
// startTime is the real time at which the task started. It is set when
// a Task is created or invokes execve(2).
//
// startTime is protected by mu.
startTime ktime.Time
// kcov is the kcov instance providing code coverage owned by this task.
//
// kcov is exclusive to the task goroutine.
kcov *Kcov
// cgroups is the set of cgroups this task belongs to. This may be empty if
// no cgroup controllers are enabled. Protected by mu.
//
// +checklocks:mu
cgroups map[Cgroup]struct{}
// userCounters is a pointer to a set of user counters.
//
// The userCounters pointer is exclusive to the task goroutine, but the
// userCounters instance must be atomically accessed.
userCounters *userCounters
}
// Task related metrics
var (
// syscallCounter is a metric that tracks how many syscalls the sentry has
// executed.
syscallCounter = metric.MustCreateNewProfilingUint64Metric(
"/task/syscalls", false, "The number of syscalls the sentry has executed for the user.")
// faultCounter is a metric that tracks how many faults the sentry has had to
// handle.
faultCounter = metric.MustCreateNewProfilingUint64Metric(
"/task/faults", false, "The number of faults the sentry has handled.")
)
func (t *Task) savePtraceTracer() *Task {
return t.ptraceTracer.Load().(*Task)
}
func (t *Task) loadPtraceTracer(tracer *Task) {
t.ptraceTracer.Store(tracer)
}
func (t *Task) saveSyscallFilters() []bpf.Program {
if f := t.syscallFilters.Load(); f != nil {
return f.([]bpf.Program)
}
return nil
}
func (t *Task) loadSyscallFilters(filters []bpf.Program) {
t.syscallFilters.Store(filters)
}
// afterLoad is invoked by stateify.
func (t *Task) afterLoad() {
t.updateInfoLocked()
t.interruptChan = make(chan struct{}, 1)
t.gosched.State = TaskGoroutineNonexistent
if t.stop != nil {
t.stopCount = atomicbitops.FromInt32(1)
}
t.endStopCond.L = &t.tg.signalHandlers.mu
t.rseqPreempted = true
t.futexWaiter = futex.NewWaiter()
t.p = t.k.Platform.NewContext(t.AsyncContext())
}
// copyScratchBufferLen is the length of Task.copyScratchBuffer.
const copyScratchBufferLen = 144 // sizeof(struct stat)
// CopyScratchBuffer returns a scratch buffer to be used in CopyIn/CopyOut
// functions. It must only be used within those functions and can only be used
// by the task goroutine; it exists to improve performance and thus
// intentionally lacks any synchronization.
//
// Callers should pass a constant value as an argument if possible, which will
// allow the compiler to inline and optimize out the if statement below.
func (t *Task) CopyScratchBuffer(size int) []byte {
if size > copyScratchBufferLen {
return make([]byte, size)
}
return t.copyScratchBuffer[:size]
}
// FutexWaiter returns the Task's futex.Waiter.
func (t *Task) FutexWaiter() *futex.Waiter {
return t.futexWaiter
}
// Kernel returns the Kernel containing t.
func (t *Task) Kernel() *Kernel {
return t.k
}
// SetClearTID sets t's cleartid.
//
// Preconditions: The caller must be running on the task goroutine.
func (t *Task) SetClearTID(addr hostarch.Addr) {
t.cleartid = addr
}
// SetSyscallRestartBlock sets the restart block for use in
// restart_syscall(2). After registering a restart block, a syscall should
// return ERESTART_RESTARTBLOCK to request a restart using the block.
//
// Precondition: The caller must be running on the task goroutine.
func (t *Task) SetSyscallRestartBlock(r SyscallRestartBlock) {
t.syscallRestartBlock = r
}
// SyscallRestartBlock returns the currently registered restart block for use in
// restart_syscall(2). This function is *not* idempotent and may be called once
// per syscall. This function must not be called if a restart block has not been
// registered for the current syscall.
//
// Precondition: The caller must be running on the task goroutine.
func (t *Task) SyscallRestartBlock() SyscallRestartBlock {
r := t.syscallRestartBlock
// Explicitly set the restart block to nil so that a future syscall can't
// accidentally reuse it.
t.syscallRestartBlock = nil
return r
}
// IsChrooted returns true if the root directory of t's FSContext is not the
// root directory of t's MountNamespace.
//
// Preconditions: The caller must be running on the task goroutine, or t.mu
// must be locked.
func (t *Task) IsChrooted() bool {
realRoot := t.mountNamespace.Root()
root := t.fsContext.RootDirectory()
defer root.DecRef(t)
return root != realRoot
}
// TaskImage returns t's TaskImage.
//
// Precondition: The caller must be running on the task goroutine, or t.mu must
// be locked.
func (t *Task) TaskImage() *TaskImage {
return &t.image
}
// FSContext returns t's FSContext. FSContext does not take an additional
// reference on the returned FSContext.
//
// Precondition: The caller must be running on the task goroutine, or t.mu must
// be locked.
func (t *Task) FSContext() *FSContext {
return t.fsContext
}
// FDTable returns t's FDTable. FDMTable does not take an additional reference
// on the returned FDMap.
//
// Precondition: The caller must be running on the task goroutine, or t.mu must
// be locked.
func (t *Task) FDTable() *FDTable {
return t.fdTable
}
// GetFile is a convenience wrapper for t.FDTable().Get.
//
// Precondition: same as FDTable.Get.
func (t *Task) GetFile(fd int32) *vfs.FileDescription {
f, _ := t.fdTable.Get(fd)
return f
}
// NewFDs is a convenience wrapper for t.FDTable().NewFDs.
//
// This automatically passes the task as the context.
//
// Precondition: same as FDTable.
func (t *Task) NewFDs(fd int32, files []*vfs.FileDescription, flags FDFlags) ([]int32, error) {
return t.fdTable.NewFDs(t, fd, files, flags)
}
// NewFDFrom is a convenience wrapper for t.FDTable().NewFD.
//
// This automatically passes the task as the context.
//
// Precondition: same as FDTable.Get.
func (t *Task) NewFDFrom(fd int32, file *vfs.FileDescription, flags FDFlags) (int32, error) {
return t.fdTable.NewFD(t, fd, file, flags)
}
// NewFDAt is a convenience wrapper for t.FDTable().NewFDAt.
//
// This automatically passes the task as the context.
//
// Precondition: same as FDTable.
func (t *Task) NewFDAt(fd int32, file *vfs.FileDescription, flags FDFlags) error {
return t.fdTable.NewFDAt(t, fd, file, flags)
}
// WithMuLocked executes f with t.mu locked.
func (t *Task) WithMuLocked(f func(*Task)) {
t.mu.Lock()
f(t)
t.mu.Unlock()
}
// MountNamespace returns t's MountNamespace. A reference is taken on the
// returned mount namespace.
func (t *Task) MountNamespace() *vfs.MountNamespace {
t.mu.Lock()
defer t.mu.Unlock()
return t.mountNamespace
}
// AbstractSockets returns t's AbstractSocketNamespace.
func (t *Task) AbstractSockets() *AbstractSocketNamespace {
return t.abstractSockets
}
// ContainerID returns t's container ID.
func (t *Task) ContainerID() string {
return t.containerID
}
// OOMScoreAdj gets the task's thread group's OOM score adjustment.
func (t *Task) OOMScoreAdj() int32 {
return t.tg.oomScoreAdj.Load()
}
// SetOOMScoreAdj sets the task's thread group's OOM score adjustment. The
// value should be between -1000 and 1000 inclusive.
func (t *Task) SetOOMScoreAdj(adj int32) error {
if adj > 1000 || adj < -1000 {
return linuxerr.EINVAL
}
t.tg.oomScoreAdj.Store(adj)
return nil
}
// KUID returns t's kuid.
func (t *Task) KUID() uint32 {
return uint32(t.Credentials().EffectiveKUID)
}
// KGID returns t's kgid.
func (t *Task) KGID() uint32 {
return uint32(t.Credentials().EffectiveKGID)
}
// SetKcov sets the kcov instance associated with t.
func (t *Task) SetKcov(k *Kcov) {
t.kcov = k
}
// ResetKcov clears the kcov instance associated with t.
func (t *Task) ResetKcov() {
if t.kcov != nil {
t.kcov.OnTaskExit()
t.kcov = nil
}
}
|