File: systrap.go

package info (click to toggle)
golang-gvisor-gvisor 0.0~20240729.0-5
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 21,276 kB
  • sloc: asm: 3,361; ansic: 1,197; cpp: 348; makefile: 92; python: 89; sh: 83
file content (445 lines) | stat: -rw-r--r-- 13,492 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package systrap provides a seccomp-based implementation of the platform
// interface.
//
// In a nutshell, it works as follows:
//
// The creation of a new address space creates a new child process.
//
// The creation of a new stub thread creates a new system thread with a
// specified address space. To initialize this thread, the following action
// will be done:
//   - install a signal stack which is shared with the Sentry.
//   - install a signal handler for SYS, BUS, FPE, CHLD, TRAP, SEGV signals.
//     This signal handler is a key part of the systrap platform. Any stub event
//     which has to be handled in privileged mode (by the Sentry) triggers one of
//     these signals. The signal handler runs on a separate stack which is
//     shared with the Sentry. The sysmsg structure is used to synchronize the
//     Sentry and a stub thread.
//   - install seccomp filters to trap user system calls.
//   - send a fake SIGSEGV to stop the thread in the signal handler.
//
// A platformContext is just a collection of temporary variables. Calling Switch on a
// platformContext does the following:
//
//	Set up proper registers and an FPU state on a stub signal frame.
//	Wake up a stub thread by changing sysmsg->stage and calling FUTEX_WAKE.
//	Wait for new stub event by polling sysmsg->stage.
//
// Lock order:
//
//	subprocessPool.mu
//		subprocess.mu
//			platformContext.mu
//
// +checkalignedignore
package systrap

import (
	"fmt"
	"os"
	"runtime"
	"sync"

	"golang.org/x/sys/unix"
	"gvisor.dev/gvisor/pkg/abi/linux"
	pkgcontext "gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/fd"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/memutil"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
	"gvisor.dev/gvisor/pkg/sentry/platform"
	"gvisor.dev/gvisor/pkg/sentry/platform/interrupt"
	"gvisor.dev/gvisor/pkg/sentry/platform/systrap/sysmsg"
	"gvisor.dev/gvisor/pkg/sentry/platform/systrap/usertrap"
)

// Package-level state describing the stub layout and one-time initialization
// of the platform.
var (
	// stubStart is the link address for our stub, and determines the
	// maximum user address. This is valid only after a call to stubInit.
	//
	// We attempt to link the stub here, and adjust downward as needed.
	stubStart uintptr = stubInitAddress

	// NOTE(review): undocumented in this file; presumably the address of the
	// stub's init-process entry point — confirm against stubInit.
	stubInitProcess uintptr

	// Memory region to store thread specific stacks.
	stubSysmsgStack uintptr
	stubSysmsgStart uintptr
	stubSysmsgEnd   uintptr
	// Memory region to store the contextQueue.
	stubContextQueueRegion    uintptr
	stubContextQueueRegionLen uintptr
	// Memory region to store instances of sysmsg.ThreadContext.
	stubContextRegion    uintptr
	stubContextRegionLen uintptr
	// The memory blob with precompiled seccomp rules.
	stubSysmsgRules     uintptr
	stubSysmsgRulesLen  uintptr
	stubSyscallRules    uintptr
	stubSyscallRulesLen uintptr

	// NOTE(review): undocumented in this file; presumably the address and
	// size of the stub region holding the spinning thread queue — confirm
	// against stubInit.
	stubSpinningThreadQueueAddr uintptr
	stubSpinningThreadQueueSize uintptr

	// stubROMapEnd is the end address of the read-only stub region that
	// contains the code and precompiled seccomp rules.
	stubROMapEnd uintptr

	// stubEnd is the first byte past the end of the stub, as with
	// stubStart this is valid only after a call to stubInit.
	stubEnd uintptr

	// stubInitialized controls one-time stub initialization. New uses it to
	// run stubInit and to create the global pool's source subprocess once.
	stubInitialized sync.Once

	// latencyMonitoring controls one-time initialization of the fastpath
	// control goroutine. New uses it to start controlFastPath once.
	latencyMonitoring sync.Once

	// archState stores architecture-specific details used in the platform.
	// New calls archState.Init the first time it observes
	// maxSysmsgThreads == 0.
	archState sysmsg.ArchState
)

// platformContext is an implementation of the platform context.
//
// It contains a sync.Mutex and is therefore always handled by pointer; all
// of its methods use pointer receivers.
type platformContext struct {
	// signalInfo is the signal info, if and when a signal is received.
	// Switch inspects its Signo to classify the stop and returns a copy of
	// it to the caller when a signal has to be delivered.
	signalInfo linux.SignalInfo

	// interrupt forwards interrupt requests for this platformContext;
	// Interrupt calls NotifyInterrupt on it.
	interrupt interrupt.Forwarder

	// sharedContext is everything related to this platformContext that is resident in
	// shared memory with the stub thread.
	// sharedContext is only accessed on the Task goroutine, therefore it is not
	// mutex protected.
	// It may be nil; Release and PrepareSleep check for that.
	sharedContext *sharedContext

	// mu protects the following fields.
	mu sync.Mutex

	// If lastFaultSP is non-nil, the last platformContext switch was due to a fault
	// received while executing lastFaultSP. Only platformContext.Switch may set
	// lastFaultSP to a non-nil value.
	lastFaultSP *subprocess

	// lastFaultAddr is the last faulting address; this is only meaningful if
	// lastFaultSP is non-nil.
	lastFaultAddr hostarch.Addr

	// lastFaultIP is the address of the last faulting instruction;
	// this is also only meaningful if lastFaultSP is non-nil.
	lastFaultIP hostarch.Addr

	// needRestoreFPState indicates that the FPU state has been changed by
	// the Sentry and has to be updated on the stub thread. It is set to
	// true by NewContext and by FullStateChanged.
	needRestoreFPState bool

	// needToPullFullState indicates that the Sentry doesn't have a full
	// state of the thread. It is cleared by FullStateChanged and by a
	// successful PullFullState.
	needToPullFullState bool
}

// PullFullState implements platform.Context.PullFullState.
//
// Nothing is done unless the context is flagged as missing its full state.
// Otherwise as must be a *subprocess; the state is fetched through it and the
// flag is cleared only when that succeeds. Any error from the subprocess is
// returned unchanged.
func (c *platformContext) PullFullState(as platform.AddressSpace, ac *arch.Context64) error {
	if c.needToPullFullState {
		sp := as.(*subprocess)
		err := sp.PullFullState(c, ac)
		if err != nil {
			return err
		}
		c.needToPullFullState = false
	}
	return nil
}

// FullStateChanged implements platform.Context.FullStateChanged.
//
// It marks the FPU state as needing to be restored on the stub thread and
// records that the full state no longer has to be pulled.
func (c *platformContext) FullStateChanged() {
	c.needRestoreFPState, c.needToPullFullState = true, false
}

// Switch runs the provided platformContext in the given address space.
//
// The address space of mm must be a *subprocess. c is activated on that
// subprocess and control is handed to the application via switchToApp; the
// resulting stop is then classified as a system call, a fault or another
// signal. The cpu argument is not used by this implementation.
//
// Returns:
//   - (nil, hostarch.NoAccess, nil) if the application stopped on a system
//     call.
//   - (&si, hostarch.NoAccess, platform.ErrContextSignal) if it stopped on a
//     signal other than SIGSEGV; si is a copy of c.signalInfo.
//   - (&si, at, platform.ErrContextSignal) if it stopped on a SIGSEGV that
//     was not handled by CPUID emulation; at is the heuristically derived
//     access type.
//   - (nil, hostarch.NoAccess, err) if activateContext or switchToApp failed.
func (c *platformContext) Switch(ctx pkgcontext.Context, mm platform.MemoryManager, ac *arch.Context64, cpu int32) (*linux.SignalInfo, hostarch.AccessType, error) {
	as := mm.AddressSpace()
	s := as.(*subprocess)
	if err := s.activateContext(c); err != nil {
		return nil, hostarch.NoAccess, err
	}

restart:
	// Execution returns here whenever the stop was fully handled inside
	// this function (usertrap restart or CPUID emulation).
	isSyscall, needPatch, err := s.switchToApp(c, ac)
	if err != nil {
		return nil, hostarch.NoAccess, err
	}
	if needPatch {
		s.usertrap.PatchSyscall(ctx, ac, mm)
	}
	// A SIGILL that is not a syscall stop is first offered to usertrap: it
	// can turn out to be a syscall, request a restart, or fail (which is
	// only logged, after which the signal is processed normally below).
	if !isSyscall && linux.Signal(c.signalInfo.Signo) == linux.SIGILL {
		err := s.usertrap.HandleFault(ctx, ac, mm)
		if err == usertrap.ErrFaultSyscall {
			isSyscall = true
		} else if err == usertrap.ErrFaultRestart {
			goto restart
		} else if err != nil {
			ctx.Warningf("usertrap.HandleFault failed: %v", err)
		}
	}
	// The fault* variables stay at their zero values unless this stop is a
	// SIGSEGV; faultSP == nil therefore means "not a fault" below.
	var (
		faultSP   *subprocess
		faultAddr hostarch.Addr
		faultIP   hostarch.Addr
	)
	if !isSyscall && linux.Signal(c.signalInfo.Signo) == linux.SIGSEGV {
		faultSP = s
		faultAddr = hostarch.Addr(c.signalInfo.Addr())
		faultIP = hostarch.Addr(ac.IP())
	}

	// Update the platformContext to reflect the outcome of this context switch.
	// The previous values are saved first; they are needed both for the
	// faultedContexts bookkeeping and for the write-fault heuristic below.
	c.mu.Lock()
	lastFaultSP := c.lastFaultSP
	lastFaultAddr := c.lastFaultAddr
	lastFaultIP := c.lastFaultIP
	// At this point, c may not yet be in s.faultedContexts, so c.lastFaultSP won't
	// be updated by s.Unmap(). This is fine; we only need to synchronize with
	// calls to s.Unmap() that occur after the handling of this fault.
	c.lastFaultSP = faultSP
	c.lastFaultAddr = faultAddr
	c.lastFaultIP = faultIP
	c.mu.Unlock()

	// Update subprocesses to reflect the outcome of this context switch.
	// c.mu has been released above; each subprocess mutex is taken on its
	// own and only when the faulting subprocess actually changed.
	if lastFaultSP != faultSP {
		if lastFaultSP != nil {
			lastFaultSP.mu.Lock()
			delete(lastFaultSP.faultedContexts, c)
			lastFaultSP.mu.Unlock()
		}
		if faultSP != nil {
			faultSP.mu.Lock()
			faultSP.faultedContexts[c] = struct{}{}
			faultSP.mu.Unlock()
		}
	}

	if isSyscall {
		return nil, hostarch.NoAccess, nil
	}

	// Copy the signal info so the caller gets a pointer to a local value
	// rather than to the field inside c.
	si := c.signalInfo
	if faultSP == nil {
		// Non-fault signal.
		return &si, hostarch.NoAccess, platform.ErrContextSignal
	}

	// See if this can be handled as a CPUID exception.
	if linux.Signal(si.Signo) == linux.SIGSEGV && platform.TryCPUIDEmulate(ctx, mm, ac) {
		goto restart
	}

	// Got a page fault. Ideally, we'd get real fault type here, but ptrace
	// doesn't expose this information. Instead, we use a simple heuristic:
	//
	// It was an instruction fault iff the faulting addr == instruction
	// pointer.
	//
	// It was a write fault if the fault is immediately repeated.
	//
	// NOTE(review): the comment above refers to ptrace although this is the
	// systrap platform; possibly inherited wording — confirm.
	at := hostarch.Read
	if faultAddr == faultIP {
		at.Execute = true
	}
	if lastFaultSP == faultSP &&
		lastFaultAddr == faultAddr &&
		lastFaultIP == faultIP {
		at.Write = true
	}

	// Handle as a signal.
	return &si, at, platform.ErrContextSignal
}

// Interrupt interrupts the running guest application associated with this platformContext.
// It does so by calling NotifyInterrupt on the context's interrupt forwarder.
func (c *platformContext) Interrupt() {
	c.interrupt.NotifyInterrupt()
}

// Release releases all platform resources used by the platformContext.
//
// If a shared context is attached it is released and detached; calling
// Release on a context without one does nothing.
func (c *platformContext) Release() {
	sc := c.sharedContext
	if sc == nil {
		return
	}
	sc.release()
	c.sharedContext = nil
}

// PrepareSleep implements platform.Context.PrepareSleep.
//
// When a shared context is attached and not yet marked as sleeping, it is
// marked as sleeping and its subprocess's awake-context count is decremented.
// In every other case the call has no effect.
func (c *platformContext) PrepareSleep() {
	sc := c.sharedContext
	if sc == nil || sc.sleeping {
		return
	}
	sc.sleeping = true
	sc.subprocess.decAwakeContexts()
}

// Systrap represents a collection of seccomp subprocesses.
//
// It embeds three helper types from package platform
// (NoCPUPreemptionDetection, UseHostGlobalMemoryBarrier and
// DoesNotOwnPageTables).
// NOTE(review): the methods these provide are defined in package platform
// and are not visible in this file.
type Systrap struct {
	platform.NoCPUPreemptionDetection
	platform.UseHostGlobalMemoryBarrier
	platform.DoesNotOwnPageTables

	// memoryFile is used to create a stub sysmsg stack
	// which is shared with the Sentry.
	// It is set by New and passed to newSubprocess by NewAddressSpace.
	memoryFile *pgalloc.MemoryFile
}

// MinUserAddress implements platform.MinUserAddress.
// It returns the value of platform.SystemMMapMinAddr.
func (*Systrap) MinUserAddress() hostarch.Addr {
	return platform.SystemMMapMinAddr()
}

// New returns a new seccomp-based implementation of the platform interface.
//
// Every call creates a fresh memory file via createMemoryFile; an error from
// that step is the only error New returns. Package-level state is set up as
// a side effect:
//   - while maxSysmsgThreads is still zero, archState is initialized and the
//     thread limits are derived from GOMAXPROCS;
//   - the stub and the global pool's source subprocess are initialized
//     exactly once (guarded by stubInitialized), using the memory file of
//     the call that performs the initialization;
//   - the controlFastPath goroutine is started exactly once (guarded by
//     latencyMonitoring).
//
// New panics if the source subprocess cannot be created.
func New() (*Systrap, error) {
	if maxSysmsgThreads == 0 {
		// CPUID information has been initialized at this point.
		archState.Init()
		// GOMAXPROCS has been set at this point.
		maxSysmsgThreads = runtime.GOMAXPROCS(0)
		// Account for syscall thread.
		maxChildThreads = maxSysmsgThreads + 1
	}

	mf, err := createMemoryFile()
	if err != nil {
		return nil, err
	}

	stubInitialized.Do(func() {
		// Don't use sentry and stub fast paths if there is just one cpu.
		neverEnableFastPath = min(runtime.NumCPU(), runtime.GOMAXPROCS(0)) == 1

		// Initialize the stub.
		stubInit()

		// Create the source process for the global pool. This must be
		// done before initializing any other processes.
		source, err := newSubprocess(createStub, mf, false)
		if err != nil {
			// Should never happen.
			panic("unable to initialize systrap source: " + err.Error())
		}
		// The source subprocess is never released explicitly by a MM.
		source.DecRef(nil)

		globalPool.source = source

		initSysmsgThreadPriority()

		initSeccompNotify()
	})

	latencyMonitoring.Do(func() {
		go controlFastPath()
	})

	return &Systrap{memoryFile: mf}, nil
}

// SupportsAddressSpaceIO implements platform.Platform.SupportsAddressSpaceIO.
// It always returns false for this platform.
func (*Systrap) SupportsAddressSpaceIO() bool {
	return false
}

// CooperativelySchedulesAddressSpace implements platform.Platform.CooperativelySchedulesAddressSpace.
// It always returns false for this platform.
func (*Systrap) CooperativelySchedulesAddressSpace() bool {
	return false
}

// MapUnit implements platform.Platform.MapUnit.
// It always returns 0.
func (*Systrap) MapUnit() uint64 {
	// The host kernel manages page tables and arbitrary-sized mappings
	// have effectively the same cost.
	return 0
}

// MaxUserAddress returns the first address that may not be used by user
// applications. The value is maxStubUserAddress converted to hostarch.Addr.
func (*Systrap) MaxUserAddress() hostarch.Addr {
	return hostarch.Addr(maxStubUserAddress)
}

// NewAddressSpace returns a new subprocess.
//
// The subprocess is created by passing globalPool.source.createStub and the
// platform's memory file to newSubprocess. The argument is ignored and the
// returned channel is always nil.
//
// On failure an untyped nil AddressSpace is returned together with the
// error. Returning the concrete result of newSubprocess directly would wrap
// a nil pointer in a non-nil interface value, which callers comparing the
// AddressSpace against nil would misinterpret.
func (p *Systrap) NewAddressSpace(any) (platform.AddressSpace, <-chan struct{}, error) {
	as, err := newSubprocess(globalPool.source.createStub, p.memoryFile, true)
	if err != nil {
		return nil, nil, err
	}
	return as, nil, nil
}

// NewContext returns an interruptible platformContext.
//
// The new context starts out requiring an FPU state restore on the stub
// thread and not requiring a full state pull. The ctx argument is not used.
func (*Systrap) NewContext(ctx pkgcontext.Context) platform.Context {
	c := new(platformContext)
	c.needRestoreFPState = true
	c.needToPullFullState = false
	return c
}

// constructor is the stateless value that init registers with
// platform.Register under the name "systrap"; its methods create and describe
// the Systrap platform.
type constructor struct{}

// New creates a Systrap platform by calling the package-level New. The
// *fd.FD argument is ignored.
func (*constructor) New(_ *fd.FD) (platform.Platform, error) {
	return New()
}

// OpenDevice returns a nil FD and a nil error; the device path argument is
// ignored.
func (*constructor) OpenDevice(_ string) (*fd.FD, error) {
	return nil, nil
}

// Requirements implements platform.Constructor.Requirements().
//
// The platform asks for both RequiresCapSysPtrace and RequiresCurrentPIDNS.
func (*constructor) Requirements() platform.Requirements {
	// TODO(b/75837838): Also set a new PID namespace so that we limit
	// access to other host processes.
	var reqs platform.Requirements
	reqs.RequiresCapSysPtrace = true
	reqs.RequiresCurrentPIDNS = true
	return reqs
}

// init registers a constructor for this platform with package platform under
// the name "systrap".
func init() {
	platform.Register("systrap", &constructor{})
}

// createMemoryFile creates a memfd named "systrap-memory" and wraps it in a
// pgalloc.MemoryFile built with default options.
//
// Errors from either step are wrapped with %w, so callers can inspect the
// underlying cause with errors.Is or errors.As; the rendered messages are
// unchanged. If constructing the MemoryFile fails, the memfd is closed before
// returning.
func createMemoryFile() (*pgalloc.MemoryFile, error) {
	const memfileName = "systrap-memory"
	// Named memfd rather than fd so the imported fd package is not shadowed.
	memfd, err := memutil.CreateMemFD(memfileName, 0)
	if err != nil {
		return nil, fmt.Errorf("error creating memfd: %w", err)
	}
	memfile := os.NewFile(uintptr(memfd), memfileName)
	mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{})
	if err != nil {
		// Best-effort cleanup; the construction error is the one reported.
		memfile.Close()
		return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %w", err)
	}
	return mf, nil
}

// corruptedSharedMemoryErr builds a platform.ContextError reporting corrupted
// systrap shared memory. additional is appended to the error message, and the
// error number is set to EPERM.
func corruptedSharedMemoryErr(additional string) *platform.ContextError {
	ctxErr := new(platform.ContextError)
	ctxErr.Err = fmt.Errorf("systrap corrupted memory: %s", additional)
	ctxErr.Errno = unix.EPERM
	return ctxErr
}