File: loop.go

package info (click to toggle)
apptainer 1.4.5-2
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 12,780 kB
  • sloc: sh: 3,329; ansic: 1,706; awk: 414; python: 103; makefile: 54
file content (447 lines) | stat: -rw-r--r-- 15,619 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
// Copyright (c) Contributors to the Apptainer project, established as
//   Apptainer a Series of LF Projects LLC.
//   For website terms of use, trademark policy, privacy policy and other
//   project policies see https://lfprojects.org/policies
// Copyright (c) 2018-2021, Sylabs Inc. All rights reserved.
// Copyright (c) 2021, Genomics plc.
// This software is licensed under a 3-clause BSD license. Please consult the
// LICENSE.md file distributed with the sources of this project regarding your
// rights to use or distribute this software.

package loop

import (
	"fmt"
	"os"
	"syscall"
	"time"

	"github.com/apptainer/apptainer/pkg/sylog"
	"github.com/apptainer/apptainer/pkg/util/fs/lock"
	"github.com/apptainer/apptainer/pkg/util/fs/proc"
	"golang.org/x/sys/unix"
)

// Device describes a loop device and the parameters used to attach an image
// to it.
//
// MaxLoopDevices is the exclusive upper bound of the /dev/loopN numbers that
// are probed when looking for a device to share or to attach.
// Shared makes AttachFromFile first look for a loop device that is already
// bound to the same image, and makes loop devices be opened while holding an
// exclusive lock on the device path.
// Info is the loop status that is applied to a newly attached device, and
// whose read-only flag, offset and size limit are compared against existing
// devices when sharing.
// fd points to the file descriptor of the loop device selected by a
// successful attach; it is released by Close.
type Device struct {
	MaxLoopDevices int
	Shared         bool
	Info           *unix.LoopInfo64
	fd             *int
}

// Loop control device IOCTL commands, issued on /dev/loop-control.
// CmdCtlAdd (LOOP_CTL_ADD) adds the loop device with the given number and
// CmdCtlRemove (LOOP_CTL_REMOVE) removes it from the kernel.
// CmdCtlGetFree is not used in this file; the value presumably matches
// LOOP_CTL_GET_FREE from linux/loop.h.
const (
	CmdCtlAdd     = 0x4C80
	CmdCtlRemove  = 0x4C81
	CmdCtlGetFree = 0x4C82
)

// Loop status retry related constants.
// sleepRetries is the number of retries that merely sleep for sleepInterval
// before the set status command is issued again.
// flushGracePeriod is the extra delay granted on the retry that follows the
// image cache flush.
// errStatusTryAgain is the value a retry function returns to request another
// set status attempt.
const (
	sleepRetries      = 5
	sleepInterval     = 200 * time.Millisecond
	flushGracePeriod  = 1 * time.Second
	errStatusTryAgain = syscall.EAGAIN
)

// retryStatusFn is called after setting the loop device status failed with
// EAGAIN. It receives the loop device path and the number of retries already
// performed (starting at zero). Returning errStatusTryAgain requests another
// attempt; any other value, nil included, ends the retry loop and is handed
// back to the caller unchanged.
type retryStatusFn func(string, int) error

// loopControlPath is the loop control device on which the CmdCtlAdd and
// CmdCtlRemove commands are issued, and which is locked while doing so.
const (
	loopControlPath = "/dev/loop-control"
)

// createDeviceFn creates the loop device with the given number when it can
// not be opened, and returns a nil error when the device is ready to be
// opened again.
type createDeviceFn func(int) error

// AttachFromFile attempts to find a suitable loop device to use for the specified image.
// When shared loop devices are enabled (loop.Shared), it first runs through
// /dev/loopN, up to MaxLoopDevices, looking for a loop device already
// associated to the image with the same read-only flag, offset and size limit.
// Otherwise, or when no device can be shared, it attaches the image to the
// first usable loop device, creating the device when needed.
//
// On success the loop device file descriptor is kept by the receiver (release
// it with Close), the loop device number is stored in *number and a nil error
// is returned. A transient EAGAIN reported while setting the loop device
// status is retried a bounded number of times before an error is returned.
//
// An error is returned when image is nil, when its file information can not
// be obtained, or when sharing / attaching fails.
func (loop *Device) AttachFromFile(image *os.File, mode int, number *int) error {
	if image == nil {
		return fmt.Errorf("empty file pointer")
	}
	fi, err := image.Stat()
	if err != nil {
		return err
	}
	// a checked assertion avoids a panic should the file information not be
	// backed by a stat structure
	imageInfo, isStat := fi.Sys().(*syscall.Stat_t)
	if !isStat || imageInfo == nil {
		return fmt.Errorf("could not retrieve stat information for %s", image.Name())
	}

	if loop.Shared {
		shared, err := loop.shareLoop(imageInfo, mode, number)
		if err != nil {
			return err
		}
		if shared {
			// We found a shared loop device, and the loop device file
			// descriptor was recorded
			return nil
		}
	}

	if err := loop.attachLoop(image.Fd(), imageInfo, mode, number); err != nil {
		// wrap rather than flatten so that callers can inspect the cause
		return fmt.Errorf("failed to attach loop device: %w", err)
	}

	return nil
}

// shareLoop walks the existing /dev/loopN devices, below loop.MaxLoopDevices,
// searching for one whose backing file is the image described by imageInfo
// and whose read-only flag, offset and size limit equal those of loop.Info.
// When such a device is found its file descriptor is kept open and recorded
// in the receiver, its number is written to *number and ok is true.
// When nothing can be shared ok is false; the returned error is always nil.
func (loop *Device) shareLoop(imageInfo *syscall.Stat_t, mode int, number *int) (ok bool, err error) {
	wantIno := imageInfo.Ino
	// st.Dev is uint32 on MIPS, hence the widening conversion
	wantDev := uint64(imageInfo.Dev)

	// reusable tells whether a loop device with the given status is backed
	// by our image and was set up with the parameters we are asking for
	reusable := func(st *unix.LoopInfo64) bool {
		const readOnly = unix.LO_FLAGS_READ_ONLY
		return st.Inode == wantIno && st.Device == wantDev &&
			st.Flags&readOnly == loop.Info.Flags&readOnly &&
			st.Offset == loop.Info.Offset && st.Sizelimit == loop.Info.Sizelimit
	}

	for idx := 0; idx < loop.MaxLoopDevices; idx++ {
		// only existing loop devices are considered, nothing is created here
		fd, unlock, openErr := openLoopDev(idx, mode, true, nil)
		if openErr != nil {
			if !os.IsNotExist(openErr) {
				sylog.Debugf("Couldn't open loop device %d: %s\n", idx, openErr)
			}
			continue
		}

		current, statusErr := GetStatusFromFd(uintptr(fd))
		unlock()

		if statusErr == nil && reusable(current) {
			// the descriptor stays open on purpose: it guarantees that the
			// loop device is not released between this check and the mount
			// of the filesystem
			sylog.Debugf("Sharing loop device %d", idx)
			*number = idx
			loop.fd = &fd
			return true, nil
		}
		if statusErr != nil {
			sylog.Debugf("Couldn't get status from loop device %d: %v\n", idx, statusErr)
		}
		syscall.Close(fd)
	}

	return false, nil
}

// attachLoop binds the image referred to by imageFd to a /dev/loopN device,
// trying the device numbers in order up to loop.MaxLoopDevices and creating
// a device when it is missing.
// A device that does not exist, or that refuses the image (LOOP_SET_FD
// failure), is skipped in favor of the next one; any other failure aborts the
// search and is returned. Some kernels return EAGAIN while the loop device
// status is being set, this is worked around by the retry function handed to
// setLoopStatus.
// On success the loop device descriptor is recorded in the receiver and its
// number is written to *number.
func (loop *Device) attachLoop(imageFd uintptr, imageInfo *syscall.Stat_t, mode int, number *int) error {
	// abandon gives a loop device back: it optionally detaches the image,
	// closes the descriptor and finally drops the lock
	abandon := func(fd int, detach bool, unlock func()) {
		if detach {
			unix.IoctlSetInt(fd, unix.LOOP_CLR_FD, 0)
		}
		syscall.Close(fd)
		unlock()
	}

	create := getCreateDeviceFn()
	retry := getRetryStatusFn(imageFd, imageInfo)

	for idx := 0; idx < loop.MaxLoopDevices; idx++ {
		// open the loop device, the device is created when necessary
		fd, unlock, err := openLoopDev(idx, mode, loop.Shared, create)
		switch {
		case err == nil:
			// device opened, carry on below
		case os.IsNotExist(err):
			continue
		default:
			sylog.Debugf("Couldn't open loop device %d: %v", idx, err)
			return err
		}

		if unix.IoctlSetInt(fd, unix.LOOP_SET_FD, int(imageFd)) != nil {
			// this device can't take the image, try the next one
			abandon(fd, false, unlock)
			continue
		}

		_, _, errno := syscall.Syscall(syscall.SYS_FCNTL, uintptr(fd), syscall.F_SETFD, syscall.FD_CLOEXEC)
		if errno != 0 {
			abandon(fd, true, unlock)
			return fmt.Errorf("failed to set close-on-exec on loop device %s: error message=%s", getLoopPath(idx), errno.Error())
		}

		devPath := getLoopPath(idx)
		if err := setLoopStatus(fd, loop.Info, devPath, retry); err != nil {
			abandon(fd, true, unlock)
			return fmt.Errorf("loop device status: %s", err)
		}

		unlock()
		*number = idx
		loop.fd = &fd
		return nil
	}

	return fmt.Errorf("no loop devices available")
}

// openLoopDev will attempt to open the specified loop device number, with specified mode.
// If the device node is missing from /dev, or is present without a kernel device
// behind it (ENXIO), and createFn is not nil, createFn is called to create it.
// When createFn is nil such a device is reported with an error satisfying
// os.IsNotExist. If sharedLoop is true, an exclusive lock is taken on the device
// path before the final open.
// Returns the fd for the opened device together with a function releasing the
// lock (a no-op when sharedLoop is false). On error the fd is -1 and the
// release function is nil, so it must not be called.
func openLoopDev(device, mode int, sharedLoop bool, createFn createDeviceFn) (int, func(), error) {
	path := getLoopPath(device)

	// loop device can exist but without any device attached to it in kernel,
	// a stat call couldn't catch ENXIO error in this case, use open
	loopFd, err := syscall.Open(path, mode, 0o600)
	if err != nil {
		if errno, ok := err.(syscall.Errno); ok && errno == unix.ENXIO {
			// without a create function, ENXIO is reported to the caller
			// as a missing device
			if createFn == nil {
				err = os.ErrNotExist
			}
		} else if !os.IsNotExist(err) {
			return -1, nil, fmt.Errorf("could not open %s: %w", path, err)
		}
		// device doesn't exist but no create function passed ... done
		if createFn == nil {
			return -1, nil, err
		}
		// create the device node if we need to
		// (this err shadows the outer one and is scoped to this branch)
		err := createFn(device)
		if err != nil {
			return -1, nil, fmt.Errorf("could not create %s: %w", path, err)
		}
	} else {
		// this first open was only a probe, the device is opened again
		// below once the lock (if any) is held
		_ = syscall.Close(loopFd)
	}

	releaseLock := func() {}

	if sharedLoop {
		// there is an exclusive lock set on the opened loop device
		// when shared loop devices is in-use, this lock is intended
		// to be held until the loop device status is set for the
		// opened loop device
		loopLock, err := lock.Exclusive(path)
		if err != nil {
			return -1, nil, fmt.Errorf("while acquiring exclusive lock on %s: %s", path, err)
		}
		releaseLock = func() {
			_ = lock.Release(loopLock)
		}
	}

	loopFd, err = syscall.Open(path, mode, 0o600)
	if err != nil {
		// don't leave the lock behind when the device can't be opened
		releaseLock()
		return -1, nil, fmt.Errorf("could not open %s: %w", path, err)
	}

	return loopFd, releaseLock, nil
}

// setLoopStatus applies info to the loop device opened as loopFd.
// As long as the kernel answers EAGAIN, retryFn is consulted with the loop
// device path and the number of retries done so far: the command is issued
// again while retryFn returns errStatusTryAgain, otherwise the value returned
// by retryFn ends the loop and is returned as is. Any error other than EAGAIN
// is returned immediately.
func setLoopStatus(loopFd int, info *unix.LoopInfo64, loopDevice string, retryFn retryStatusFn) error {
	attempt := 0
	for {
		switch err := unix.IoctlLoopSetStatus64(loopFd, info); err {
		case nil:
			return nil
		case syscall.EAGAIN:
			// transient failure, ask the retry function what to do next
		default:
			return fmt.Errorf("failed to set loop device status (%s): %s", loopDevice, err)
		}

		if err := retryFn(loopDevice, attempt); err != errStatusTryAgain {
			return err
		}
		attempt++
	}
}

// getRetryStatusFn returns the retry policy applied when setting the loop
// device status fails with EAGAIN for the image described by imageFd and
// imageInfo: sleepRetries retries separated by sleepInterval, then one cache
// flush, then one retry after flushGracePeriod, and finally an error.
func getRetryStatusFn(imageFd uintptr, imageInfo *syscall.Stat_t) retryStatusFn {
	return func(loopDevice string, retryCount int) error {
		// Background: since https://github.com/torvalds/linux/commit/5db470e229e22b7eda6e23b5566e532c96fb5bc3
		// the loop device invalidates its cache when offset/sizelimit are modified by the set status command.
		// There is no synchronization between that invalidation and the check for cached dirty pages, so some
		// kernels may return an EAGAIN error here, and it occurs very frequently with small images.
		// Sleeping and retrying is the first approach, but the filesystem backing the image file may be slow,
		// which makes any choice of interval and number of retries hazardous; trying other loop devices merely
		// deports the issue to the next devices, as falsely stated here https://dev.arvados.org/issues/18489.
		switch {
		case retryCount < sleepRetries:
			// So retry 5 times with a sleep period of 200ms between each attempt.
			time.Sleep(sleepInterval)
			return errStatusTryAgain

		case retryCount == sleepRetries:
			// Sleeping was not enough and dirty pages remain in cache for the image, so fall back to the rough
			// approach of flushing cached pages to the filesystem. The kernel provides no way to flush and wait,
			// and syncfs/fsync/sync_file_range are not working as expected here, so flushCache is called: it
			// issues a block device flush command when the image is located on a block device, and a sync
			// syscall, with the costs it involves, when the image is on a shared storage, a ramfs or anything
			// else which isn't a block device.
			//
			// Dear reader, if you are not satisfied by the approach, you are invited to reproduce the issue
			// first by using an Ubuntu 18.04 distribution containing the fix/bug above and build a small image
			// like:
			//
			// $ apptainer build /tmp/busy.sif docker://busybox
			// $ for i in $(seq 1 100); do apptainer exec /tmp/busy.sif true; done
			//
			// And search a more elegant solution
			if err := flushCache(imageFd, imageInfo); err != nil {
				return fmt.Errorf("while syncing/flushing image cache: %s", err)
			}
			return errStatusTryAgain

		case retryCount == sleepRetries+1:
			// e2e tests have shown that the sync approach is not sufficient under high load circumstances,
			// hence this additional grace period; once it is spent we give up with a cache invalidate too
			// slow error
			time.Sleep(flushGracePeriod)
			return errStatusTryAgain

		default:
			return fmt.Errorf("failed to set loop device status (%s): cache invalidate too slow", loopDevice)
		}
	}
}

// flushCache flushes the cached pages of the storage holding the image
// described by imageInfo.
// It searches /proc/self/mountinfo for a mount whose device number is the one
// of the image and whose source is a block device, and issues a BLKFLSBUF
// ioctl on the first one found. When there is no such mount (the image is not
// located on a block device) a global sync is issued as a last resort.
// The first parameter is unused.
// An error is returned when the mount information can not be read, or when
// the matching block device can not be inspected, opened or flushed.
func flushCache(_ uintptr, imageInfo *syscall.Stat_t) error {
	// cast to uint64 as st.Dev is uint32 on MIPS, while unix.Major and
	// unix.Minor take a uint64
	imageDev := uint64(imageInfo.Dev)
	devStr := fmt.Sprintf("%d:%d", unix.Major(imageDev), unix.Minor(imageDev))

	entries, err := proc.GetMountInfoEntry("/proc/self/mountinfo")
	if err != nil {
		return fmt.Errorf("while getting mountinfo: %s", err)
	}
	for _, e := range entries {
		if e.Dev != devStr {
			continue
		}
		fi, err := os.Stat(e.Source)
		if err != nil {
			if os.IsNotExist(err) {
				continue
			}
			return fmt.Errorf("while getting information for %s: %s", e.Source, err)
		}
		// not a block device
		if fi.Mode()&os.ModeDevice == 0 {
			continue
		}
		// trigger block device flush command
		f, err := os.Open(e.Source)
		if err != nil {
			return fmt.Errorf("while opening %s: %s", e.Source, err)
		}
		_, _, esys := syscall.Syscall(syscall.SYS_IOCTL, f.Fd(), unix.BLKFLSBUF, 0)
		// closed right away instead of deferring from within the loop; the
		// device was opened read-only, so a close failure is of no interest
		_ = f.Close()
		if esys != 0 {
			return fmt.Errorf("while flushing block device %s: %s", e.Source, esys)
		}

		return nil
	}
	// use sync as a last resort
	unix.Sync()
	return nil
}

// getCreateDeviceFn returns the function used to create a missing loop device.
// The returned function asks the kernel to add the device through
// /dev/loop-control (LOOP_CTL_ADD), and falls back to creating the device node
// with mknod when /dev/loop-control can not be opened, or when the kernel
// added the device but no node shows up in /dev.
// If the kernel reports the device as already existing while the node is
// missing, the device is removed (LOOP_CTL_REMOVE) and added again, once.
func getCreateDeviceFn() createDeviceFn {
	return func(device int) error {
		path := getLoopPath(device)
		// use /dev/loop-control when possible
		controlFd, err := syscall.Open(loopControlPath, syscall.O_RDWR, 0o600)
		if err != nil {
			// create loop device with mknod as a fallback
			return createLoopDevice(device)
		}
		defer syscall.Close(controlFd)

		// use an exclusive lock on /dev/loop-control
		// mainly to prevent race conditions with other
		// instances while issuing LOOP_CTL_REMOVE command
		loopControlLock, err := lock.Exclusive(loopControlPath)
		if err != nil {
			return fmt.Errorf("while acquiring exclusive lock on %s: %w", loopControlPath, err)
		}
		defer lock.Release(loopControlLock)

		// try counts the LOOP_CTL_ADD attempts: a second attempt is only
		// made after a successful LOOP_CTL_REMOVE
		for try := 0; ; try++ {
			// issue a LOOP_CTL_ADD to add the corresponding loop device
			devNum, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(controlFd), CmdCtlAdd, uintptr(device))
			if errno > 0 && errno != syscall.EEXIST {
				return fmt.Errorf("could not add device %s: %w", path, errno)
			} else if int(devNum) == device {
				// the ioctl returned the requested device number: the
				// device has been added, check that its node is there
				if _, err := os.Stat(path); err != nil {
					if os.IsNotExist(err) {
						// handle docker container case where /dev/loop-control is available
						// but loop devices are created on /dev host, so create it in container
						return createLoopDevice(device)
					}
					return fmt.Errorf("while retrieving %s status: %s", path, err)
				}
				break
			}
			// handle a corner case where the device hasn't been created,
			// it might happen when a /dev/loopX is deleted with rm /dev/loopX
			// without issuing a LOOP_CTL_REMOVE for the corresponding device
			_, err := os.Stat(path)
			if err != nil && try == 0 {
				// issue a LOOP_CTL_REMOVE to remove the corresponding loop device in kernel
				_, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(controlFd), CmdCtlRemove, uintptr(device))
				if errno > 0 {
					if errno == syscall.EBUSY {
						// the removal was refused with EBUSY: stop here
						// without reporting an error
						break
					}
					return fmt.Errorf("could not remove device %s: %w", path, errno)
				}
				// and retry to add the loop device
				continue
			} else if err != nil && try == 1 {
				// the node is still missing after the remove/add cycle
				return fmt.Errorf("could not add device %s: %w", path, err)
			}
			break
		}

		return nil
	}
}

// AttachFromPath opens the image located at the given path with the given
// mode and attaches it to a loop device by means of AttachFromFile: on
// success the loop device number is stored in *number and the loop device
// stays open until Close is called.
// An error is returned when the image can not be opened or attached.
func (loop *Device) AttachFromPath(image string, mode int, number *int) error {
	file, err := os.OpenFile(image, mode, 0o600)
	if err != nil {
		return err
	}
	// The image descriptor is only needed while attaching. Closing it here
	// releases it deterministically instead of leaving that to the garbage
	// collector, and keeps the file alive, hence its descriptor valid, for
	// the whole duration of the attach.
	defer file.Close()

	return loop.AttachFromFile(file, mode, number)
}

// Close closes the loop device file descriptor recorded by a successful
// attach, and returns the error reported while closing it.
// It does nothing and returns nil when no loop device is attached, which
// includes any call made after a first Close.
func (loop *Device) Close() error {
	if loop.fd == nil {
		return nil
	}
	// forget the descriptor before closing it, so that a repeated Close
	// can't close an unrelated file that reused the same descriptor number
	fd := *loop.fd
	loop.fd = nil
	return syscall.Close(fd)
}

// GetStatusFromFd gets info status about an opened loop device.
// fd is the file descriptor of the loop device. The returned error wraps the
// one reported by the underlying ioctl, so that it can be matched with
// errors.Is / errors.As.
func GetStatusFromFd(fd uintptr) (*unix.LoopInfo64, error) {
	info, err := unix.IoctlLoopGetStatus64(int(fd))
	if err != nil {
		return nil, fmt.Errorf("failed to get loop flags for loop device: %w", err)
	}
	return info, nil
}

// GetStatusFromPath gets info status about a loop device from path.
// The loop device is opened read-only for the duration of the call. An error
// is returned when the device can not be opened or its status can not be
// retrieved.
func GetStatusFromPath(path string) (*unix.LoopInfo64, error) {
	loop, err := os.Open(path)
	if err != nil {
		return nil, fmt.Errorf("failed to open loop device %s: %w", path, err)
	}
	// Only the raw descriptor is handed over below: the deferred Close
	// releases it once done, and keeps the file from being finalized (and
	// its descriptor closed) while the status is being read.
	defer loop.Close()

	return GetStatusFromFd(loop.Fd())
}

// getLoopPath returns the path of the loop device node with the given
// number, e.g. /dev/loop3 for device 3.
func getLoopPath(device int) string {
	const prefix = "/dev/loop"
	return prefix + fmt.Sprint(device)
}

func createLoopDevice(device int) error {
	// create loop device with mknod as a fallback
	dev := int(unix.Mkdev(uint32(7), uint32(device)))
	path := getLoopPath(device)
	esys := syscall.Mknod(path, syscall.S_IFBLK|0o660, dev)
	if errno, ok := esys.(syscall.Errno); ok {
		if errno != syscall.EEXIST {
			return fmt.Errorf("could not mknod %s: %w", path, esys)
		}
	}
	return nil
}