1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149
|
package libcontainer
import (
"errors"
"fmt"
"os"
"os/exec"
"strconv"
"github.com/opencontainers/selinux/go-selinux"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/apparmor"
"github.com/opencontainers/runc/libcontainer/keys"
"github.com/opencontainers/runc/libcontainer/seccomp"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/utils"
)
// linuxSetnsInit performs the container's initialization for running a new process
// inside an existing container.
type linuxSetnsInit struct {
pipe *os.File
consoleSocket *os.File
config *initConfig
logFd int
}
func (l *linuxSetnsInit) getSessionRingName() string {
return "_ses." + l.config.ContainerId
}
func (l *linuxSetnsInit) Init() error {
if !l.config.Config.NoNewKeyring {
if err := selinux.SetKeyLabel(l.config.ProcessLabel); err != nil {
return err
}
defer selinux.SetKeyLabel("") //nolint: errcheck
// Do not inherit the parent's session keyring.
if _, err := keys.JoinSessionKeyring(l.getSessionRingName()); err != nil {
// Same justification as in standart_init_linux.go as to why we
// don't bail on ENOSYS.
//
// TODO(cyphar): And we should have logging here too.
if !errors.Is(err, unix.ENOSYS) {
return fmt.Errorf("unable to join session keyring: %w", err)
}
}
}
if l.config.CreateConsole {
if err := setupConsole(l.consoleSocket, l.config, false); err != nil {
return err
}
if err := system.Setctty(); err != nil {
return err
}
}
if l.config.NoNewPrivileges {
if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
return err
}
}
// Tell our parent that we're ready to exec. This must be done before the
// Seccomp rules have been applied, because we need to be able to read and
// write to a socket.
if err := syncParentReady(l.pipe); err != nil {
return fmt.Errorf("sync ready: %w", err)
}
if err := selinux.SetExecLabel(l.config.ProcessLabel); err != nil {
return err
}
defer selinux.SetExecLabel("") //nolint: errcheck
// Without NoNewPrivileges seccomp is a privileged operation, so we need to
// do this before dropping capabilities; otherwise do it as late as possible
// just before execve so as few syscalls take place after it as possible.
if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
seccompFd, err := seccomp.InitSeccomp(l.config.Config.Seccomp)
if err != nil {
return err
}
if err := syncParentSeccomp(l.pipe, seccompFd); err != nil {
return err
}
}
if err := finalizeNamespace(l.config); err != nil {
return err
}
if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
return err
}
// Check for the arg before waiting to make sure it exists and it is
// returned as a create time error.
name, err := exec.LookPath(l.config.Args[0])
if err != nil {
return err
}
// exec.LookPath in Go < 1.20 might return no error for an executable
// residing on a file system mounted with noexec flag, so perform this
// extra check now while we can still return a proper error.
// TODO: remove this once go < 1.20 is not supported.
if err := eaccess(name); err != nil {
return &os.PathError{Op: "eaccess", Path: name, Err: err}
}
// Set seccomp as close to execve as possible, so as few syscalls take
// place afterward (reducing the amount of syscalls that users need to
// enable in their seccomp profiles).
if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
seccompFd, err := seccomp.InitSeccomp(l.config.Config.Seccomp)
if err != nil {
return fmt.Errorf("unable to init seccomp: %w", err)
}
if err := syncParentSeccomp(l.pipe, seccompFd); err != nil {
return err
}
}
logrus.Debugf("setns_init: about to exec")
// Close the log pipe fd so the parent's ForwardLogs can exit.
if err := unix.Close(l.logFd); err != nil {
return &os.PathError{Op: "close log pipe", Path: "fd " + strconv.Itoa(l.logFd), Err: err}
}
// Close all file descriptors we are not passing to the container. This is
// necessary because the execve target could use internal runc fds as the
// execve path, potentially giving access to binary files from the host
// (which can then be opened by container processes, leading to container
// escapes). Note that because this operation will close any open file
// descriptors that are referenced by (*os.File) handles from underneath
// the Go runtime, we must not do any file operations after this point
// (otherwise the (*os.File) finaliser could close the wrong file). See
// CVE-2024-21626 for more information as to why this protection is
// necessary.
//
// This is not needed for runc-dmz, because the extra execve(2) step means
// that all O_CLOEXEC file descriptors have already been closed and thus
// the second execve(2) from runc-dmz cannot access internal file
// descriptors from runc.
if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil {
return err
}
return system.Exec(name, l.config.Args[0:], os.Environ())
}
|