1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447
|
// Copyright (c) Contributors to the Apptainer project, established as
// Apptainer a Series of LF Projects LLC.
// For website terms of use, trademark policy, privacy policy and other
// project policies see https://lfprojects.org/policies
// Copyright (c) 2018-2021, Sylabs Inc. All rights reserved.
// Copyright (c) 2021, Genomics plc.
// This software is licensed under a 3-clause BSD license. Please consult the
// LICENSE.md file distributed with the sources of this project regarding your
// rights to use or distribute this software.
package loop
import (
"fmt"
"os"
"syscall"
"time"
"github.com/apptainer/apptainer/pkg/sylog"
"github.com/apptainer/apptainer/pkg/util/fs/lock"
"github.com/apptainer/apptainer/pkg/util/fs/proc"
"golang.org/x/sys/unix"
)
// Device describes a loop device
type Device struct {
MaxLoopDevices int
Shared bool
Info *unix.LoopInfo64
fd *int
}
// Loop control device IOCTL commands
const (
CmdCtlAdd = 0x4C80
CmdCtlRemove = 0x4C81
CmdCtlGetFree = 0x4C82
)
// loop status retry related constants
const (
sleepRetries = 5
sleepInterval = 200 * time.Millisecond
flushGracePeriod = 1 * time.Second
errStatusTryAgain = syscall.EAGAIN
)
// loop status retry function.
type retryStatusFn func(string, int) error
const (
loopControlPath = "/dev/loop-control"
)
// create loop device function.
type createDeviceFn func(int) error
// AttachFromFile attempts to find a suitable loop device to use for the specified image.
// It runs through /dev/loopXX, up to MaxLoopDevices to find a free loop device, or
// to share a loop device already associated to file (if shared loop devices are enabled).
// If a usable loop device is found, then loop.Fd is set and no error is returned.
// If a usable loop device is not found, and this is due to a transient EAGAIN / EBUSY error,
// then it will retry up to maxRetries times, retryInterval apart, before returning an error.
func (loop *Device) AttachFromFile(image *os.File, mode int, number *int) error {
if image == nil {
return fmt.Errorf("empty file pointer")
}
fi, err := image.Stat()
if err != nil {
return err
}
imageInfo := fi.Sys().(*syscall.Stat_t)
if loop.Shared {
if ok, err := loop.shareLoop(imageInfo, mode, number); err != nil {
return err
} else if ok {
// We found a shared loop device, and loop.Fd was set
return nil
}
}
if err := loop.attachLoop(image.Fd(), imageInfo, mode, number); err != nil {
return fmt.Errorf("failed to attach loop device: %s", err)
}
return nil
}
// shareLoop runs over /dev/loopXX devices, looking for one that already has our image attached.
// If a loop device can be shared, loop.Fd is set, and ok will be true.
// If no loop device can be shared, ok will be false.
func (loop *Device) shareLoop(imageInfo *syscall.Stat_t, mode int, number *int) (ok bool, err error) {
imageIno := imageInfo.Ino
// cast to uint64 as st.Dev is uint32 on MIPS
imageDev := uint64(imageInfo.Dev)
for device := 0; device < loop.MaxLoopDevices; device++ {
// Try to open an existing loop device, but don't create a new one
loopFd, releaseLock, err := openLoopDev(device, mode, true, nil)
if err != nil {
if !os.IsNotExist(err) {
sylog.Debugf("Couldn't open loop device %d: %s\n", device, err)
}
continue
}
status, err := GetStatusFromFd(uintptr(loopFd))
releaseLock()
if err != nil {
sylog.Debugf("Couldn't get status from loop device %d: %v\n", device, err)
} else if status.Inode == imageIno && status.Device == imageDev &&
status.Flags&unix.LO_FLAGS_READ_ONLY == loop.Info.Flags&unix.LO_FLAGS_READ_ONLY &&
status.Offset == loop.Info.Offset && status.Sizelimit == loop.Info.Sizelimit {
// keep the reference to the loop device file descriptor to
// be sure that the loop device won't be released between this
// check and the mount of the filesystem
sylog.Debugf("Sharing loop device %d", device)
*number = device
loop.fd = &loopFd
return true, nil
}
syscall.Close(loopFd)
}
return false, nil
}
// attachLoop will find a free /dev/loopXX device, or create a new one, and attach image to it.
// For most failures with loopN, it will try loopN+1, continuing up to loop.MaxLoopDevices.
// When setting loop device status, some kernel may return EAGAIN, this function would sync
// workaround this error.
func (loop *Device) attachLoop(imageFd uintptr, imageInfo *syscall.Stat_t, mode int, number *int) error {
releaseDevice := func(fd int, clear bool, releaseLock func()) {
if clear {
unix.IoctlSetInt(fd, unix.LOOP_CLR_FD, 0)
}
syscall.Close(fd)
releaseLock()
}
createFn := getCreateDeviceFn()
retryFn := getRetryStatusFn(imageFd, imageInfo)
for device := 0; device < loop.MaxLoopDevices; device++ {
// Try to open the loop device, creating the device node if needed
loopFd, releaseLock, err := openLoopDev(device, mode, loop.Shared, createFn)
if err != nil {
if os.IsNotExist(err) {
continue
}
sylog.Debugf("Couldn't open loop device %d: %v", device, err)
return err
}
if err := unix.IoctlSetInt(loopFd, unix.LOOP_SET_FD, int(imageFd)); err != nil {
// On error, we'll move on to try the next loop device
releaseDevice(loopFd, false, releaseLock)
continue
}
if _, _, esys := syscall.Syscall(syscall.SYS_FCNTL, uintptr(loopFd), syscall.F_SETFD, syscall.FD_CLOEXEC); esys != 0 {
releaseDevice(loopFd, true, releaseLock)
return fmt.Errorf("failed to set close-on-exec on loop device %s: error message=%s", getLoopPath(device), esys.Error())
}
if err := setLoopStatus(loopFd, loop.Info, getLoopPath(device), retryFn); err != nil {
releaseDevice(loopFd, true, releaseLock)
return fmt.Errorf("loop device status: %s", err)
}
releaseLock()
*number = device
loop.fd = &loopFd
return nil
}
return fmt.Errorf("no loop devices available")
}
// openLoopDev will attempt to open the specified loop device number, with specified mode.
// If it is not present in /dev, and create is true, a mknod call will be used to create it.
// Returns the fd for the opened device, or -1 if it was not possible to openLoopDev it.
func openLoopDev(device, mode int, sharedLoop bool, createFn createDeviceFn) (int, func(), error) {
path := getLoopPath(device)
// loop device can exist but without any device attached to it in kernel,
// a stat call couldn't catch ENXIO error in this case, use open
loopFd, err := syscall.Open(path, mode, 0o600)
if err != nil {
if errno, ok := err.(syscall.Errno); ok && errno == unix.ENXIO {
if createFn == nil {
err = os.ErrNotExist
}
} else if !os.IsNotExist(err) {
return -1, nil, fmt.Errorf("could not open %s: %w", path, err)
}
// device doesn't exist but no create function passed ... done
if createFn == nil {
return -1, nil, err
}
// create the device node if we need to
err := createFn(device)
if err != nil {
return -1, nil, fmt.Errorf("could not create %s: %w", path, err)
}
} else {
_ = syscall.Close(loopFd)
}
releaseLock := func() {}
if sharedLoop {
// there is an exclusive lock set on the opened loop device
// when shared loop devices is in-use, this lock is intended
// to be hold until the loop device status is set for the
// opened loop device
loopLock, err := lock.Exclusive(path)
if err != nil {
return -1, nil, fmt.Errorf("while acquiring exclusive lock on %s: %s", path, err)
}
releaseLock = func() {
_ = lock.Release(loopLock)
}
}
loopFd, err = syscall.Open(path, mode, 0o600)
if err != nil {
releaseLock()
return -1, nil, fmt.Errorf("could not open %s: %w", path, err)
}
return loopFd, releaseLock, nil
}
func setLoopStatus(loopFd int, info *unix.LoopInfo64, loopDevice string, retryFn retryStatusFn) error {
for retryCount := 0; ; retryCount++ {
esys := unix.IoctlLoopSetStatus64(loopFd, info)
if esys == nil {
return nil
} else if esys != syscall.EAGAIN {
return fmt.Errorf("failed to set loop device status (%s): %s", loopDevice, esys)
}
if err := retryFn(loopDevice, retryCount); err != errStatusTryAgain {
return err
}
}
}
func getRetryStatusFn(imageFd uintptr, imageInfo *syscall.Stat_t) retryStatusFn {
return func(loopDevice string, retryCount int) error {
// With changes introduced in https://github.com/torvalds/linux/commit/5db470e229e22b7eda6e23b5566e532c96fb5bc3
// loop device is invalidating its cache when offset/sizelimit are modified while issuing the set status command,
// as there is no synchronization between the invalidation and the check for cached dirty pages, some kernel may
// return an EAGAIN error here. Note that this error is occurring very frequently with small images.
// The first approach is to sleep and retry, the problem is that the underlying filesystem backing the image file
// may be slow, so setting a time interval and number of retries may be hazardous, and trying other loop devices
// just deport the issue to the next devices as falsely stated here https://dev.arvados.org/issues/18489.
// So retry 5 times with a sleep period of 200ms between each attempt.
if retryCount < sleepRetries {
time.Sleep(sleepInterval)
return errStatusTryAgain
} else if retryCount == sleepRetries {
// The sleeping period is over and there is remaining dirty pages in cache for the corresponding image,
// let's use the rough approach to flush cached pages to filesystem, the kernel is not providing a way to
// flush and wait, syncfs/fsync/sync_file_range are not working as expected here, so we call flushCache
// which will try to issue a block device flush command when the image is located on a block device, if the
// image is on a shared storage, a ramfs or anything else which isn't a block device, a sync syscall is
// issued with the costs it involved.
//
// Dear reader, if you are not satisfied by the approach, you are invited to reproduce the issue first by using
// an Ubuntu 18.04 distribution containing the fix/bug above and build a small image like:
//
// $ apptainer build /tmp/busy.sif docker://busybox
// $ for i in $(seq 1 100); do apptainer exec /tmp/busy.sif true; done
//
// And search a more elegant solution
if err := flushCache(imageFd, imageInfo); err != nil {
return fmt.Errorf("while syncing/flushing image cache: %s", err)
}
return errStatusTryAgain
} else if retryCount == sleepRetries+1 {
// e2e tests have shown that the sync approach is not sufficient under high load
// circumstances, therefore we are giving an additional grace period, after that
// we are over and return a cache invalidate too slow error
time.Sleep(flushGracePeriod)
return errStatusTryAgain
}
return fmt.Errorf("failed to set loop device status (%s): cache invalidate too slow", loopDevice)
}
}
func flushCache(_ uintptr, imageInfo *syscall.Stat_t) error {
devStr := fmt.Sprintf("%d:%d", unix.Major(imageInfo.Dev), unix.Minor(imageInfo.Dev))
entries, err := proc.GetMountInfoEntry("/proc/self/mountinfo")
if err != nil {
return fmt.Errorf("while getting mountinfo: %s", err)
}
for _, e := range entries {
if e.Dev != devStr {
continue
}
fi, err := os.Stat(e.Source)
if err != nil {
if os.IsNotExist(err) {
continue
}
return fmt.Errorf("while getting information for %s: %s", e.Source, err)
}
// not a block device
if fi.Mode()&os.ModeDevice == 0 {
continue
}
// trigger block device flush command
f, err := os.Open(e.Source)
if err != nil {
return fmt.Errorf("while opening %s: %s", e.Source, err)
}
defer f.Close()
_, _, esys := syscall.Syscall(syscall.SYS_IOCTL, f.Fd(), unix.BLKFLSBUF, 0)
if esys != 0 {
return fmt.Errorf("while flushing block device %s: %s", e.Source, syscall.Errno(esys))
}
return nil
}
// use sync as a last resort
unix.Sync()
return nil
}
func getCreateDeviceFn() createDeviceFn {
return func(device int) error {
path := getLoopPath(device)
// use /dev/loop-control when possible
controlFd, err := syscall.Open(loopControlPath, syscall.O_RDWR, 0o600)
if err != nil {
// create loop device with mknod as a fallback
return createLoopDevice(device)
}
defer syscall.Close(controlFd)
// use an exclusive lock on /dev/loop-control
// mainly to prevent race conditions with other
// instances while issuing LOOP_CTL_REMOVE command
loopControlLock, err := lock.Exclusive(loopControlPath)
if err != nil {
return fmt.Errorf("while acquiring exclusive lock on %s: %w", loopControlPath, err)
}
defer lock.Release(loopControlLock)
for try := 0; ; try++ {
// issue a LOOP_CTL_ADD to add the corresponding loop device
devNum, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(controlFd), CmdCtlAdd, uintptr(device))
if errno > 0 && errno != syscall.EEXIST {
return fmt.Errorf("could not add device %s: %w", path, errno)
} else if int(devNum) == device {
if _, err := os.Stat(path); err != nil {
if os.IsNotExist(err) {
// handle docker container case where /dev/loop-control is available
// but loop devices are created on /dev host, so create it in container
return createLoopDevice(device)
}
return fmt.Errorf("while retrieving %s status: %s", path, err)
}
break
}
// handle a corner case where the device hasn't been created,
// it might happen when a /dev/loopX is deleted with rm /dev/loopX
// without issuing a LOOP_CTL_REMOVE for the corresponding device
_, err := os.Stat(path)
if err != nil && try == 0 {
// issue a LOOP_CTL_REMOVE to remove the corresponding loop device in kernel
_, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(controlFd), CmdCtlRemove, uintptr(device))
if errno > 0 {
if errno == syscall.EBUSY {
break
}
return fmt.Errorf("could not remove device %s: %w", path, errno)
}
// and retry to add the loop device
continue
} else if err != nil && try == 1 {
return fmt.Errorf("could not add device %s: %w", path, err)
}
break
}
return nil
}
}
// AttachFromPath finds a free loop device, opens it, and stores file descriptor
// of opened image path
func (loop *Device) AttachFromPath(image string, mode int, number *int) error {
file, err := os.OpenFile(image, mode, 0o600)
if err != nil {
return err
}
return loop.AttachFromFile(file, mode, number)
}
// Close closes the loop device.
func (loop *Device) Close() error {
if loop.fd != nil {
return syscall.Close(*loop.fd)
}
return nil
}
// GetStatusFromFd gets info status about an opened loop device
func GetStatusFromFd(fd uintptr) (*unix.LoopInfo64, error) {
info, err := unix.IoctlLoopGetStatus64(int(fd))
if err != nil {
return nil, fmt.Errorf("failed to get loop flags for loop device: %s", err)
}
return info, nil
}
// GetStatusFromPath gets info status about a loop device from path
func GetStatusFromPath(path string) (*unix.LoopInfo64, error) {
loop, err := os.Open(path)
if err != nil {
return nil, fmt.Errorf("failed to open loop device %s: %s", path, err)
}
return GetStatusFromFd(loop.Fd())
}
func getLoopPath(device int) string {
return fmt.Sprintf("/dev/loop%d", device)
}
func createLoopDevice(device int) error {
// create loop device with mknod as a fallback
dev := int(unix.Mkdev(uint32(7), uint32(device)))
path := getLoopPath(device)
esys := syscall.Mknod(path, syscall.S_IFBLK|0o660, dev)
if errno, ok := esys.(syscall.Errno); ok {
if errno != syscall.EEXIST {
return fmt.Errorf("could not mknod %s: %w", path, esys)
}
}
return nil
}
|