File: engine_linux.go

package info (click to toggle)
singularity-container 4.1.5%2Bds4-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 43,876 kB
  • sloc: asm: 14,840; sh: 3,190; ansic: 1,751; awk: 414; makefile: 413; python: 99
file content (400 lines) | stat: -rw-r--r-- 13,127 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
// Copyright (c) 2019-2023, Sylabs Inc. All rights reserved.
// Copyright (c) Contributors to the Apptainer project, established as
//   Apptainer a Series of LF Projects LLC.
// This software is licensed under a 3-clause BSD license. Please consult the
// LICENSE.md file distributed with the sources of this project regarding your
// rights to use or distribute this software.

package fakeroot

import (
	"context"
	"fmt"
	"net"
	"os"
	"syscall"

	specs "github.com/opencontainers/runtime-spec/specs-go"
	"github.com/sylabs/singularity/v4/internal/pkg/buildcfg"
	fakerootutil "github.com/sylabs/singularity/v4/internal/pkg/fakeroot"
	"github.com/sylabs/singularity/v4/internal/pkg/plugin"
	"github.com/sylabs/singularity/v4/internal/pkg/runtime/engine"
	"github.com/sylabs/singularity/v4/internal/pkg/runtime/engine/config/oci/generate"
	"github.com/sylabs/singularity/v4/internal/pkg/runtime/engine/config/starter"
	fakerootConfig "github.com/sylabs/singularity/v4/internal/pkg/runtime/engine/fakeroot/config"
	"github.com/sylabs/singularity/v4/internal/pkg/security/seccomp"
	"github.com/sylabs/singularity/v4/internal/pkg/util/fs"
	fakerootcallback "github.com/sylabs/singularity/v4/pkg/plugin/callback/runtime/fakeroot"
	"github.com/sylabs/singularity/v4/pkg/runtime/engine/config"
	"github.com/sylabs/singularity/v4/pkg/sylog"
	"github.com/sylabs/singularity/v4/pkg/util/capabilities"
	"github.com/sylabs/singularity/v4/pkg/util/fs/proc"
	"github.com/sylabs/singularity/v4/pkg/util/singularityconf"
)

// EngineOperations is a Singularity fakeroot runtime engine that implements engine.Operations.
type EngineOperations struct {
	// CommonConfig holds the shared configuration handed to InitConfig.
	// The `json:"-"` tag keeps it out of the JSON encoding of this struct.
	CommonConfig *config.Common               `json:"-"`
	// EngineConfig holds the fakeroot-specific configuration. It is encoded
	// under the "engineConfig" JSON key and is what Config returns.
	EngineConfig *fakerootConfig.EngineConfig `json:"engineConfig"`
}

// InitConfig stores the parsed config.Common inside the engine.
//
// Since this method simply stores config.Common, it does not matter
// whether or not there are any elevated privileges during this call.
func (e *EngineOperations) InitConfig(cfg *config.Common) {
	// The pointer is stored as-is (no copy is made).
	e.CommonConfig = cfg
}

// Config returns the engine's *fakerootConfig.EngineConfig as a
// config.EngineConfig interface value. The pointer returned is the one
// held in the EngineConfig field, which init sets to an empty
// fakerootConfig.EngineConfig when the engine is registered.
//
// NOTE(review): the previous comment stated that this pointer gets stored
// in the engine.Engine.Common field; that is not visible from this file.
//
// Since this method only returns an already stored pointer, it does not
// matter whether or not there are any elevated privileges during this call.
func (e *EngineOperations) Config() config.EngineConfig {
	return e.EngineConfig
}

// PrepareConfig is called during stage1 to validate and prepare
// container configuration. It is responsible for singularity
// configuration file parsing, reading capabilities, configuring
// UID/GID mappings, etc.
//
// The steps performed, in order, are:
//   - in the setuid workflow, require singularity.conf to be owned by root
//     and to have 'allow setuid = yes'; otherwise locate the
//     newuidmap/newgidmap binaries for the unprivileged workflow,
//   - request user and mount namespaces, plus a PID namespace unless
//     EngineConfig.NoPIDNS is set,
//   - build the UID/GID mappings: the calling user's UID/GID is mapped to
//     ID 0, followed by the range returned by the ID range lookup (the
//     built-in one or a single plugin-provided callback),
//   - hand mappings, namespace flags, target UID/GID and the privileged
//     capability sets over to the starter configuration.
//
// It returns an error if any validation or lookup fails; underlying errors
// are wrapped so callers can inspect them with errors.Is / errors.As.
//
// No additional privileges can be gained as any of them are already
// dropped by the time PrepareConfig is called.
func (e *EngineOperations) PrepareConfig(starterConfig *starter.Config) error {
	g := generate.New(nil)

	configurationFile := buildcfg.SINGULARITY_CONF_FILE

	// check for ownership of singularity.conf
	if starterConfig.GetIsSUID() && !fs.IsOwner(configurationFile, 0) {
		return fmt.Errorf("%s must be owned by root", configurationFile)
	}

	fileConfig, err := singularityconf.Parse(configurationFile)
	if err != nil {
		return fmt.Errorf("unable to parse singularity.conf file: %w", err)
	}

	if starterConfig.GetIsSUID() {
		if !fileConfig.AllowSetuid {
			return fmt.Errorf("fakeroot requires to set 'allow setuid = yes' in %s", configurationFile)
		}
	} else {
		sylog.Verbosef("Fakeroot requested with unprivileged workflow, fallback to newuidmap/newgidmap")
		sylog.Debugf("Search for newuidmap binary")
		if err := starterConfig.SetNewUIDMapPath(); err != nil {
			return err
		}
		sylog.Debugf("Search for newgidmap binary")
		if err := starterConfig.SetNewGIDMapPath(); err != nil {
			return err
		}
	}

	g.AddOrReplaceLinuxNamespace(specs.UserNamespace, "")
	g.AddOrReplaceLinuxNamespace(specs.MountNamespace, "")

	// If we enter a PID NS in the --oci action -> oci run flow, then crun / runc will fail.
	if !e.EngineConfig.NoPIDNS {
		g.AddOrReplaceLinuxNamespace(specs.PIDNamespace, "")
	}

	uid := uint32(os.Getuid())
	gid := uint32(os.Getgid())

	// Default ID range lookup; a single plugin may override it below.
	getIDRange := fakerootutil.GetIDRange

	callbackType := (fakerootcallback.UserMapping)(nil)
	callbacks, err := plugin.LoadCallbacks(callbackType)
	if err != nil {
		return fmt.Errorf("while loading plugins callbacks '%T': %w", callbackType, err)
	}
	switch len(callbacks) {
	case 0:
		// No plugin registered: keep the default lookup.
	case 1:
		//nolint:forcetypeassert
		getIDRange = callbacks[0].(fakerootcallback.UserMapping)
	default:
		// More than one override would be ambiguous.
		return fmt.Errorf("multiple plugins have registered hook callback for fakeroot")
	}

	// First mapping: the calling user becomes ID 0 in the user namespace.
	g.AddLinuxUIDMapping(uid, 0, 1)
	idRange, err := getIDRange(fakerootutil.SubUIDFile, uid)
	if err != nil {
		return fmt.Errorf("could not use fakeroot: %w", err)
	}
	g.AddLinuxUIDMapping(idRange.HostID, idRange.ContainerID, idRange.Size)
	starterConfig.AddUIDMappings(g.Config.Linux.UIDMappings)

	g.AddLinuxGIDMapping(gid, 0, 1)
	// The subordinate GID range is looked up with the user's UID, exactly
	// like the subordinate UID range above.
	idRange, err = getIDRange(fakerootutil.SubGIDFile, uid)
	if err != nil {
		return fmt.Errorf("could not use fakeroot: %w", err)
	}
	g.AddLinuxGIDMapping(idRange.HostID, idRange.ContainerID, idRange.Size)
	starterConfig.AddGIDMappings(g.Config.Linux.GIDMappings)

	starterConfig.SetHybridWorkflow(true)
	starterConfig.SetAllowSetgroups(true)
	starterConfig.SetNoSetgroups(e.EngineConfig.NoSetgroups)

	starterConfig.SetTargetUID(0)
	starterConfig.SetTargetGID([]int{0})

	if g.Config.Linux != nil {
		starterConfig.SetNsFlagsFromSpec(g.Config.Linux.Namespaces)
	}

	// Populate the generator's capability sets for a privileged process,
	// then copy each set into the starter configuration.
	g.SetupPrivileged(true)

	starterConfig.SetCapabilities(capabilities.Permitted, g.Config.Process.Capabilities.Permitted)
	starterConfig.SetCapabilities(capabilities.Effective, g.Config.Process.Capabilities.Effective)
	starterConfig.SetCapabilities(capabilities.Inheritable, g.Config.Process.Capabilities.Inheritable)
	starterConfig.SetCapabilities(capabilities.Bounding, g.Config.Process.Capabilities.Bounding)
	starterConfig.SetCapabilities(capabilities.Ambient, g.Config.Process.Capabilities.Ambient)

	return nil
}

// CreateContainer does nothing for the fakeroot engine.
//
// All parameters are unnamed and ignored; the method always returns nil.
func (e *EngineOperations) CreateContainer(context.Context, int, net.Conn) error {
	return nil
}

// fakerootSeccompProfile builds the seccomp filter applied to fakeroot
// build environments. Every syscall is allowed by default; selected
// mknod and mknodat calls are answered with errno 0 instead, so that
// build bootstraps like yum keep working with fakeroot.
func fakerootSeccompProfile() *specs.LinuxSeccomp {
	// Errno value shared by every rule below: 0, i.e. the filtered
	// syscall reports success.
	success := uint(0)

	// modeHas builds a masked-equal condition on the argument at position
	// idx, using fileType as both operands of the comparison.
	modeHas := func(idx uint, fileType uint64) specs.LinuxSeccompArg {
		return specs.LinuxSeccompArg{
			Index:    idx,
			Value:    fileType,
			ValueTwo: fileType,
			Op:       specs.OpMaskedEqual,
		}
	}

	// devNonZero builds a not-equal-to-zero condition on the argument at
	// position idx (the device number).
	devNonZero := func(idx uint) specs.LinuxSeccompArg {
		return specs.LinuxSeccompArg{
			Index:    idx,
			Value:    0,
			ValueTwo: 0,
			Op:       specs.OpNotEqual,
		}
	}

	// noop builds a rule answering the named syscall with errno 0 when
	// the given argument conditions are met.
	noop := func(name string, conds ...specs.LinuxSeccompArg) specs.LinuxSyscall {
		return specs.LinuxSyscall{
			Names:    []string{name},
			Action:   specs.ActErrno,
			ErrnoRet: &success,
			Args:     conds,
		}
	}

	// mknod carries the mode in argument 1 and the device in argument 2;
	// mknodat shifts both by one because of its leading dirfd argument.
	// Block device rules only look at the mode, character device rules
	// additionally require a non-zero device number.
	return &specs.LinuxSeccomp{
		DefaultAction: specs.ActAllow,
		Syscalls: []specs.LinuxSyscall{
			noop("mknod", modeHas(1, syscall.S_IFBLK)),
			noop("mknod", modeHas(1, syscall.S_IFCHR), devNonZero(2)),
			noop("mknodat", modeHas(2, syscall.S_IFBLK)),
			noop("mknodat", modeHas(2, syscall.S_IFCHR), devNonZero(3)),
		},
	}
}

// StartProcess is called during stage2 after RPC server finished
// environment preparation. This is the container process itself.
// It will execute command in the fakeroot context.
//
// Without EngineConfig.BuildEnv the command in EngineConfig.Args is
// executed directly with EngineConfig.Envs. With BuildEnv set, the build
// environment is prepared first: the user home directory is bind mounted
// on /root, a proc filesystem is mounted on /proc, /sys/fs/selinux is
// remounted read-only when it is a mount point, and the fakeroot seccomp
// filter is loaded (best effort) before the command is executed.
//
// On success this method does not return, since the process image is
// replaced by syscall.Exec. An error is returned when the configuration is
// missing or incomplete, when a required mount or the mountinfo parsing
// fails, or when exec itself fails.
//
// This will be executed as a fake root user in a new user
// namespace (PrepareConfig will set both).
func (e *EngineOperations) StartProcess(_ net.Conn) error {
	const (
		mountInfo    = "/proc/self/mountinfo"
		selinuxMount = "/sys/fs/selinux"
	)

	if e.EngineConfig == nil {
		return fmt.Errorf("bad fakeroot engine configuration provided")
	}

	args := e.EngineConfig.Args
	if len(args) == 0 {
		return fmt.Errorf("no command to execute provided")
	}
	env := e.EngineConfig.Envs

	// simple command execution
	if !e.EngineConfig.BuildEnv {
		return syscall.Exec(args[0], args, env)
	}

	// prepare fakeroot build environment
	if e.EngineConfig.Home == "" {
		return fmt.Errorf("a user home directory is required to bind it on top of /root directory")
	}

	// simple trick to bind user home directory on top of /root
	err := syscall.Mount(e.EngineConfig.Home, "/root", "", syscall.MS_BIND|syscall.MS_REC, "")
	if err != nil {
		return fmt.Errorf("failed to mount %s to /root: %w", e.EngineConfig.Home, err)
	}
	err = syscall.Mount("proc", "/proc", "proc", syscall.MS_NOSUID|syscall.MS_NOEXEC|syscall.MS_NODEV, "")
	if err != nil {
		return fmt.Errorf("failed to mount proc filesystem: %w", err)
	}

	// fix potential issue with SELinux (https://github.com/sylabs/singularity/issues/4038)
	mounts, err := proc.GetMountPointMap(mountInfo)
	if err != nil {
		return fmt.Errorf("while parsing %s: %w", mountInfo, err)
	}
	for _, m := range mounts["/sys"] {
		// In Linux <5.9 this is required so that in the chroot, selinux is seen as ro, i.e.
		// disabled, and errors getting security labels do not occur.
		// In 5.9 the remount will now fail, but it is not needed due to changes in label handling.
		if m == selinuxMount {
			flags := uintptr(syscall.MS_BIND | syscall.MS_REMOUNT | syscall.MS_RDONLY)
			err = syscall.Mount("", selinuxMount, "", flags, "")
			if err != nil {
				// Deliberately non-fatal, see the kernel note above.
				sylog.Debugf("while remount %s read-only: %s", selinuxMount, err)
				sylog.Debugf("note %s remount failure is expected on kernel 5.9+", selinuxMount)
			}
			break
		}
	}

	if seccomp.Enabled() {
		// Loading the filter is best effort: a failure only degrades some
		// bootstraps, so warn and continue. The underlying error is logged
		// at debug level so that the failure can be diagnosed.
		if err := seccomp.LoadSeccompConfig(fakerootSeccompProfile(), false); err != nil {
			sylog.Warningf("Could not apply seccomp filter, some bootstrap may not work correctly")
			sylog.Debugf("Seccomp filter load error: %s", err)
		}
	} else {
		sylog.Warningf("Not compiled with seccomp, fakeroot may not work correctly, " +
			"if you get permission denied error during creation of pseudo devices, " +
			"you should install seccomp library and recompile Singularity")
	}
	return syscall.Exec(args[0], args, env)
}

// MonitorContainer is called from master once the container has
// been spawned. It will block until the container exits.
//
// A goroutine waits for the process identified by pid with wait4 while
// the calling goroutine reads from signals: SIGCHLD and SIGURG are
// ignored, every other signal is forwarded to pid with kill.
//
// It returns the wait status of the container process once wait4
// completes, together with a wrapped error if wait4 failed. If forwarding
// a signal fails, it returns a zero wait status and an error naming the
// signal.
//
// Additional privileges may be gained when running hybrid flow.
//
// Particularly here no additional privileges are gained as monitor does
// not need them for wait4 and kill syscalls.
func (e *EngineOperations) MonitorContainer(pid int, signals chan os.Signal) (syscall.WaitStatus, error) {
	// waitResult carries the outcome of wait4 from the waiting goroutine.
	type waitResult struct {
		status syscall.WaitStatus
		err    error
	}

	// Buffered so the waiting goroutine can always deliver its result and
	// terminate, even if this function already returned on a kill failure.
	done := make(chan waitResult, 1)

	go func() {
		// The wait status is local to this goroutine and only handed over
		// through the channel, so it is never accessed concurrently.
		var ws syscall.WaitStatus
		sylog.Debugf("Waiting for container process %d", pid)
		_, err := syscall.Wait4(pid, &ws, 0, nil)
		sylog.Debugf("Wait for process %d complete with status %v, error %v", pid, ws, err)
		done <- waitResult{status: ws, err: err}
	}()

	for {
		select {
		case s := <-signals:
			// Signal received
			switch s {
			case syscall.SIGCHLD, syscall.SIGURG:
				// SIGCHLD: our go routine waiting for the container pid
				// will handle container exit.
				// SIGURG: ignored, it is used for non-cooperative goroutine
				// preemption starting with Go 1.14. For more information, see
				// https://github.com/golang/go/issues/24543.
			default:
				//nolint:forcetypeassert
				if err := syscall.Kill(pid, s.(syscall.Signal)); err != nil {
					return syscall.WaitStatus(0), fmt.Errorf("interrupted by signal %s", s.String())
				}
			}
		case res := <-done:
			// Container process exited
			if res.err != nil {
				return res.status, fmt.Errorf("error while waiting for child: %w", res.err)
			}
			return res.status, nil
		}
	}
}

// CleanupContainer does nothing for the fakeroot engine.
//
// All parameters are unnamed and ignored; the method always returns nil.
func (e *EngineOperations) CleanupContainer(context.Context, error, syscall.WaitStatus) error {
	return nil
}

// PostStartProcess does nothing for the fakeroot engine.
//
// Both parameters are unnamed and ignored; the method always returns nil.
func (e *EngineOperations) PostStartProcess(context.Context, int) error {
	return nil
}

// PostStartHost does nothing for the fakeroot engine.
//
// The context parameter is unnamed and ignored; the method always returns nil.
func (e *EngineOperations) PostStartHost(context.Context) error {
	return nil
}

// CleanupHost does nothing for the fakeroot engine.
//
// The context parameter is unnamed and ignored; the method always returns nil.
func (e *EngineOperations) CleanupHost(context.Context) error {
	return nil
}

func init() {
	engine.RegisterOperations(
		fakerootConfig.Name,
		&EngineOperations{
			EngineConfig: &fakerootConfig.EngineConfig{},
		},
	)
}