File: enosys_linux.go

Package: runc 1.3.2+ds1-1

//go:build cgo && seccomp

package patchbpf

import (
	"bytes"
	"encoding/binary"
	"errors"
	"fmt"
	"io"
	"os"
	"runtime"
	"unsafe"

	libseccomp "github.com/seccomp/libseccomp-golang"
	"github.com/sirupsen/logrus"
	"golang.org/x/net/bpf"
	"golang.org/x/sys/unix"

	"github.com/opencontainers/runc/libcontainer/configs"
)

// #cgo pkg-config: libseccomp
/*
#include <errno.h>
#include <stdint.h>
#include <seccomp.h>
#include <linux/seccomp.h>

const uint32_t C_ACT_ERRNO_ENOSYS = SCMP_ACT_ERRNO(ENOSYS);

// Copied from <linux/seccomp.h>.

#ifndef SECCOMP_SET_MODE_FILTER
#	define SECCOMP_SET_MODE_FILTER 1
#endif
const uintptr_t C_SET_MODE_FILTER = SECCOMP_SET_MODE_FILTER;

#ifndef SECCOMP_FILTER_FLAG_LOG
#	define SECCOMP_FILTER_FLAG_LOG (1UL << 1)
#endif
const uintptr_t C_FILTER_FLAG_LOG = SECCOMP_FILTER_FLAG_LOG;

#ifndef SECCOMP_FILTER_FLAG_SPEC_ALLOW
#	define SECCOMP_FILTER_FLAG_SPEC_ALLOW (1UL << 2)
#endif
const uintptr_t C_FILTER_FLAG_SPEC_ALLOW = SECCOMP_FILTER_FLAG_SPEC_ALLOW;

#ifndef SECCOMP_FILTER_FLAG_NEW_LISTENER
#	define SECCOMP_FILTER_FLAG_NEW_LISTENER (1UL << 3)
#endif
const uintptr_t C_FILTER_FLAG_NEW_LISTENER = SECCOMP_FILTER_FLAG_NEW_LISTENER;

#ifndef AUDIT_ARCH_RISCV64
#ifndef EM_RISCV
#define EM_RISCV		243
#endif
#define AUDIT_ARCH_RISCV64	(EM_RISCV|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
#endif

#ifndef AUDIT_ARCH_LOONGARCH64
#ifndef EM_LOONGARCH
#define EM_LOONGARCH		258
#endif
#define AUDIT_ARCH_LOONGARCH64	(EM_LOONGARCH|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
#endif

// We use the AUDIT_ARCH_* values because those are the ones used by the kernel
// and SCMP_ARCH_* sometimes has fake values (such as SCMP_ARCH_X32). But we
// use <seccomp.h> so we get libseccomp's fallback definitions of AUDIT_ARCH_*.

const uint32_t C_AUDIT_ARCH_I386         = AUDIT_ARCH_I386;
const uint32_t C_AUDIT_ARCH_X86_64       = AUDIT_ARCH_X86_64;
const uint32_t C_AUDIT_ARCH_ARM          = AUDIT_ARCH_ARM;
const uint32_t C_AUDIT_ARCH_AARCH64      = AUDIT_ARCH_AARCH64;
const uint32_t C_AUDIT_ARCH_MIPS         = AUDIT_ARCH_MIPS;
const uint32_t C_AUDIT_ARCH_MIPS64       = AUDIT_ARCH_MIPS64;
const uint32_t C_AUDIT_ARCH_MIPS64N32    = AUDIT_ARCH_MIPS64N32;
const uint32_t C_AUDIT_ARCH_MIPSEL       = AUDIT_ARCH_MIPSEL;
const uint32_t C_AUDIT_ARCH_MIPSEL64     = AUDIT_ARCH_MIPSEL64;
const uint32_t C_AUDIT_ARCH_MIPSEL64N32  = AUDIT_ARCH_MIPSEL64N32;
const uint32_t C_AUDIT_ARCH_PPC          = AUDIT_ARCH_PPC;
const uint32_t C_AUDIT_ARCH_PPC64        = AUDIT_ARCH_PPC64;
const uint32_t C_AUDIT_ARCH_PPC64LE      = AUDIT_ARCH_PPC64LE;
const uint32_t C_AUDIT_ARCH_S390         = AUDIT_ARCH_S390;
const uint32_t C_AUDIT_ARCH_S390X        = AUDIT_ARCH_S390X;
const uint32_t C_AUDIT_ARCH_RISCV64      = AUDIT_ARCH_RISCV64;
const uint32_t C_AUDIT_ARCH_LOONGARCH64  = AUDIT_ARCH_LOONGARCH64;
*/
import "C"

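// retErrnoEnosys is the raw seccomp return value that SCMP_ACT_ERRNO(ENOSYS)
// expands to (SECCOMP_RET_ERRNO with ENOSYS in the data field).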
var retErrnoEnosys = uint32(C.C_ACT_ERRNO_ENOSYS)

// Assume sizeof(int) == 4 in the BPF program.
const bpfSizeofInt = 4

// This syscall is used for multiplexing "large" syscalls on s390(x). Unknown
// syscalls will end up with this syscall number, so we need to explicitly
// return -ENOSYS for this syscall on those architectures.
const s390xMultiplexSyscall libseccomp.ScmpSyscall = 0

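// isAllowAction reports whether the configured action is permissive enough
// that prepending an -ENOSYS stub would only get in the way.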
func isAllowAction(action configs.Action) bool {
	switch action {
	// Trace is considered an "allow" action because a good tracer should
	// support future syscalls (by handling -ENOSYS on its own), and giving
	// -ENOSYS will be disruptive for emulation.
	case configs.Allow, configs.Log, configs.Trace:
		return true
	default:
		return false
	}
}

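// parseProgram decodes a raw cBPF program, as written by seccomp_export_bpf,
// into instructions we can manipulate. Each 8-byte instruction has the
// classic struct sock_filter layout from <linux/filter.h>:
//
//	struct sock_filter {
//		__u16 code; /* opcode */
//		__u8  jt;   /* jump-if-true offset */
//		__u8  jf;   /* jump-if-false offset */
//		__u32 k;    /* generic operand */
//	};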
func parseProgram(rdr io.Reader) ([]bpf.RawInstruction, error) {
	var program []bpf.RawInstruction
	for {
		// Read the next instruction. We have to use NativeEndian because
		// seccomp_export_bpf outputs the program in *host* endian-ness.
		var insn unix.SockFilter
		if err := binary.Read(rdr, binary.NativeEndian, &insn); err != nil {
			if errors.Is(err, io.EOF) {
				// Parsing complete.
				break
			}
			if errors.Is(err, io.ErrUnexpectedEOF) {
				// Parsing stopped mid-instruction.
				return nil, fmt.Errorf("program parsing halted mid-instruction: %w", err)
			}
			// All other errors.
			return nil, fmt.Errorf("error parsing instructions: %w", err)
		}
		program = append(program, bpf.RawInstruction{
			Op: insn.Code,
			Jt: insn.Jt,
			Jf: insn.Jf,
			K:  insn.K,
		})
	}
	return program, nil
}

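// disassembleFilter exports the compiled cBPF program from libseccomp and
// disassembles it into editable instructions. The reader goroutine is needed
// because ExportBPF writes synchronously -- without a concurrent reader, a
// filter larger than the pipe buffer (typically 64 KiB on Linux) would
// deadlock.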
func disassembleFilter(filter *libseccomp.ScmpFilter) ([]bpf.Instruction, error) {
	rdr, wtr, err := os.Pipe()
	if err != nil {
		return nil, fmt.Errorf("error creating scratch pipe: %w", err)
	}
	defer wtr.Close()
	defer rdr.Close()

	readerBuffer := new(bytes.Buffer)
	errChan := make(chan error, 1)
	go func() {
		_, err := io.Copy(readerBuffer, rdr)
		errChan <- err
		close(errChan)
	}()

	if err := filter.ExportBPF(wtr); err != nil {
		return nil, fmt.Errorf("error exporting BPF: %w", err)
	}
	// Close so that the reader actually gets EOF.
	_ = wtr.Close()

	if copyErr := <-errChan; copyErr != nil {
		return nil, fmt.Errorf("error reading from ExportBPF pipe: %w", copyErr)
	}

	// Parse the instructions.
	rawProgram, err := parseProgram(readerBuffer)
	if err != nil {
		return nil, fmt.Errorf("parsing generated BPF filter: %w", err)
	}
	program, ok := bpf.Disassemble(rawProgram)
	if !ok {
		return nil, errors.New("could not disassemble entire BPF filter")
	}
	return program, nil
}

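// linuxAuditArch is the kernel AUDIT_ARCH_* value that seccomp reports in
// seccomp_data.arch -- not to be confused with libseccomp's ScmpArch values,
// which include pseudo-architectures such as SCMP_ARCH_X32.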
type linuxAuditArch uint32

const invalidArch linuxAuditArch = 0

func scmpArchToAuditArch(arch libseccomp.ScmpArch) (linuxAuditArch, error) {
	switch arch {
	case libseccomp.ArchNative:
		// Convert to actual native architecture.
		arch, err := libseccomp.GetNativeArch()
		if err != nil {
			return invalidArch, fmt.Errorf("unable to get native arch: %w", err)
		}
		return scmpArchToAuditArch(arch)
	case libseccomp.ArchX86:
		return linuxAuditArch(C.C_AUDIT_ARCH_I386), nil
	case libseccomp.ArchAMD64, libseccomp.ArchX32:
		// NOTE: x32 is treated like x86_64 except all x32 syscalls have the
		//       30th bit of the syscall number set to indicate that it's not a
		//       normal x86_64 syscall.
		return linuxAuditArch(C.C_AUDIT_ARCH_X86_64), nil
	case libseccomp.ArchARM:
		return linuxAuditArch(C.C_AUDIT_ARCH_ARM), nil
	case libseccomp.ArchARM64:
		return linuxAuditArch(C.C_AUDIT_ARCH_AARCH64), nil
	case libseccomp.ArchMIPS:
		return linuxAuditArch(C.C_AUDIT_ARCH_MIPS), nil
	case libseccomp.ArchMIPS64:
		return linuxAuditArch(C.C_AUDIT_ARCH_MIPS64), nil
	case libseccomp.ArchMIPS64N32:
		return linuxAuditArch(C.C_AUDIT_ARCH_MIPS64N32), nil
	case libseccomp.ArchMIPSEL:
		return linuxAuditArch(C.C_AUDIT_ARCH_MIPSEL), nil
	case libseccomp.ArchMIPSEL64:
		return linuxAuditArch(C.C_AUDIT_ARCH_MIPSEL64), nil
	case libseccomp.ArchMIPSEL64N32:
		return linuxAuditArch(C.C_AUDIT_ARCH_MIPSEL64N32), nil
	case libseccomp.ArchPPC:
		return linuxAuditArch(C.C_AUDIT_ARCH_PPC), nil
	case libseccomp.ArchPPC64:
		return linuxAuditArch(C.C_AUDIT_ARCH_PPC64), nil
	case libseccomp.ArchPPC64LE:
		return linuxAuditArch(C.C_AUDIT_ARCH_PPC64LE), nil
	case libseccomp.ArchS390:
		return linuxAuditArch(C.C_AUDIT_ARCH_S390), nil
	case libseccomp.ArchS390X:
		return linuxAuditArch(C.C_AUDIT_ARCH_S390X), nil
	case libseccomp.ArchRISCV64:
		return linuxAuditArch(C.C_AUDIT_ARCH_RISCV64), nil
	case libseccomp.ArchLOONGARCH64:
		return linuxAuditArch(C.C_AUDIT_ARCH_LOONGARCH64), nil
	default:
		return invalidArch, fmt.Errorf("unknown architecture: %v", arch)
	}
}

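// lastSyscallMap maps a kernel audit architecture to the largest syscall
// number in the filter for each libseccomp architecture that shares it. The
// inner map is needed because x32 and x86_64 both report AUDIT_ARCH_X86_64.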
type lastSyscallMap map[linuxAuditArch]map[libseccomp.ScmpArch]libseccomp.ScmpSyscall

// Figure out the largest syscall number referenced in the filter for each
// architecture. We will be generating code based on the native architecture
// representation, but SCMP_ARCH_X32 means we have to track cases where the
// same architecture has different largest syscalls based on the mode.
func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) {
	scmpArchs := make(map[libseccomp.ScmpArch]struct{})
	for _, ociArch := range config.Architectures {
		arch, err := libseccomp.GetArchFromString(ociArch)
		if err != nil {
			return nil, fmt.Errorf("unable to validate seccomp architecture: %w", err)
		}
		scmpArchs[arch] = struct{}{}
	}
	// On architectures like ppc64le, Docker inexplicably doesn't include the
	// native architecture in the architecture list which results in no
	// architectures being present in the list at all (rendering the ENOSYS
	// stub a no-op). So, always include the native architecture.
	if nativeScmpArch, err := libseccomp.GetNativeArch(); err != nil {
		return nil, fmt.Errorf("unable to get native arch: %w", err)
	} else if _, ok := scmpArchs[nativeScmpArch]; !ok {
		logrus.Debugf("seccomp: adding implied native architecture %v to config set", nativeScmpArch)
		scmpArchs[nativeScmpArch] = struct{}{}
	}
	logrus.Debugf("seccomp: configured architecture set: %s", scmpArchs)

	// Only loop over architectures which are present in the filter. Any other
	// architectures will get the libseccomp bad architecture action anyway.
	lastSyscalls := make(lastSyscallMap)
	for arch := range scmpArchs {
		auditArch, err := scmpArchToAuditArch(arch)
		if err != nil {
			return nil, fmt.Errorf("cannot map architecture %v to AUDIT_ARCH_ constant: %w", arch, err)
		}

		if _, ok := lastSyscalls[auditArch]; !ok {
			lastSyscalls[auditArch] = map[libseccomp.ScmpArch]libseccomp.ScmpSyscall{}
		}
		if _, ok := lastSyscalls[auditArch][arch]; ok {
			// Because of ArchNative we may hit the same entry multiple times.
			// Just skip it if we've seen this (linuxAuditArch, ScmpArch)
			// combination before.
			continue
		}

		// Find the largest syscall in the filter for this architecture.
		var largestSyscall libseccomp.ScmpSyscall
		for _, rule := range config.Syscalls {
			sysno, err := libseccomp.GetSyscallFromNameByArch(rule.Name, arch)
			if err != nil {
				// Ignore unknown syscalls.
				continue
			}
			if sysno > largestSyscall {
				largestSyscall = sysno
			}
		}
		if largestSyscall != 0 {
			logrus.Debugf("seccomp: largest syscall number for arch %v is %v", arch, largestSyscall)
			lastSyscalls[auditArch][arch] = largestSyscall
		} else {
			logrus.Warnf("could not find any syscalls for arch %v", arch)
			delete(lastSyscalls[auditArch], arch)
		}
	}
	return lastSyscalls, nil
}

// FIXME FIXME FIXME
//
// This solution is less than ideal. In the future it would be great to have
// per-arch information about which syscalls were added in which kernel
// versions so we can create far more accurate filter rules (handling holes in
// the syscall table and determining -ENOSYS requirements based on kernel
// minimum version alone).
//
// This implementation can in principle cause issues with syscalls like
// close_range(2) which were added out-of-order in the syscall table between
// kernel releases.
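//
// For a single-architecture filter, the generated stub looks roughly like
// this (an illustrative sketch -- the real jump offsets are computed below,
// measured from the end of the program):
//
//	 load [4]                     ; seccomp_data.arch
//	 jeq  [arch],[arch section]   ; known arch -> its section below
//	 ja   [filter]                ; unmatched arch -> user filter (BadArch)
//	arch section:
//	 load [0]                     ; seccomp_data.nr
//	 jgt  [last syscall],[enosys] ; newer than anything in the filter
//	 ja   [filter]
//	enosys:
//	 ret  ERRNO(ENOSYS)
//	filter:
//	 ... original libseccomp filter ...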
func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error) {
	// A jump-table for each linuxAuditArch used to generate the initial
	// conditional jumps -- measured from the *END* of the program so they
	// remain valid after prepending to the tail.
	archJumpTable := map[linuxAuditArch]uint32{}

	// Generate our own -ENOSYS rules for each architecture. They have to be
	// generated in reverse (prepended to the tail of the program) because the
	// JumpIf jumps need to be computed from the end of the program.
	programTail := []bpf.Instruction{
		// Fall-through rules jump into the filter.
		bpf.Jump{Skip: 1},
		// Rules which jump to here get -ENOSYS.
		bpf.RetConstant{Val: retErrnoEnosys},
	}

	// Generate the syscall -ENOSYS rules.
	for auditArch, maxSyscalls := range lastSyscalls {
		// The number of instructions from the tail of this section which need
		// to be jumped in order to reach the -ENOSYS return. If the section
		// does not jump, it will fall through to the actual filter.
		baseJumpEnosys := uint32(len(programTail) - 1)
		baseJumpFilter := baseJumpEnosys + 1

		// Add the load instruction for the syscall number -- we jump here
		// directly from the arch code so we need to do it here. Sadly we can't
		// share this code between architecture branches.
		section := []bpf.Instruction{
			// load [0] (syscall number)
			bpf.LoadAbsolute{Off: 0, Size: bpfSizeofInt},
		}

		switch len(maxSyscalls) {
		case 0:
			// No syscalls found for this arch -- skip it and move on.
			continue
		case 1:
			// Get the only syscall and scmpArch in the map.
			var (
				scmpArch libseccomp.ScmpArch
				sysno    libseccomp.ScmpSyscall
			)
			for arch, no := range maxSyscalls {
				sysno = no
				scmpArch = arch
			}

			switch scmpArch {
			// Return -ENOSYS for setup(2) on s390(x). This syscall is used for
			// multiplexing "large syscall number" syscalls, but if the syscall
			// number is not known to the kernel then the syscall number is
			// left unchanged (and because it is sysno=0, you'll end up with
			// EPERM for syscalls the kernel doesn't know about).
			//
			// The actual setup(2) syscall is never used by userspace anymore
			// (and hasn't existed for decades) outside of this multiplexing
			// scheme so returning -ENOSYS is fine.
			case libseccomp.ArchS390, libseccomp.ArchS390X:
				section = append(section, []bpf.Instruction{
					// jne [setup=0],1
					bpf.JumpIf{
						Cond:     bpf.JumpNotEqual,
						Val:      uint32(s390xMultiplexSyscall),
						SkipTrue: 1,
					},
					// ret [ENOSYS]
					bpf.RetConstant{Val: retErrnoEnosys},
				}...)
			}

			// The simplest case just boils down to a single jgt instruction,
			// with special handling if baseJumpEnosys is larger than 255 (and
			// thus a long jump is required).
			var sectionTail []bpf.Instruction
			if baseJumpEnosys+1 <= 255 {
				sectionTail = []bpf.Instruction{
					// jgt [syscall],[baseJumpEnosys+1]
					bpf.JumpIf{
						Cond:     bpf.JumpGreaterThan,
						Val:      uint32(sysno),
						SkipTrue: uint8(baseJumpEnosys + 1),
					},
					// ja [baseJumpFilter]
					bpf.Jump{Skip: baseJumpFilter},
				}
			} else {
				sectionTail = []bpf.Instruction{
					// jle [syscall],1
					bpf.JumpIf{Cond: bpf.JumpLessOrEqual, Val: uint32(sysno), SkipTrue: 1},
					// ret [ENOSYS]
					bpf.RetConstant{Val: retErrnoEnosys},
					// ja [baseJumpFilter]
					bpf.Jump{Skip: baseJumpFilter},
				}
			}

			// If we're on x86_64 we need to add a check for x32, and if we're
			// in the wrong mode we jump over the section.
			if uint32(auditArch) == uint32(C.C_AUDIT_ARCH_X86_64) {
				// Generate a prefix to check the mode.
				switch scmpArch {
				case libseccomp.ArchAMD64:
					sectionTail = append([]bpf.Instruction{
						// jset (1<<30),[len(tail)-1]
						bpf.JumpIf{
							Cond:     bpf.JumpBitsSet,
							Val:      1 << 30,
							SkipTrue: uint8(len(sectionTail) - 1),
						},
					}, sectionTail...)
				case libseccomp.ArchX32:
					sectionTail = append([]bpf.Instruction{
						// jset (1<<30),0,[len(tail)-1]
						bpf.JumpIf{
							Cond:     bpf.JumpBitsNotSet,
							Val:      1 << 30,
							SkipTrue: uint8(len(sectionTail) - 1),
						},
					}, sectionTail...)
				default:
					return nil, fmt.Errorf("unknown amd64 native architecture %#x", scmpArch)
				}
			}

			section = append(section, sectionTail...)
		case 2:
			// x32 and x86_64 are a unique case; we can't handle any others.
			if uint32(auditArch) != uint32(C.C_AUDIT_ARCH_X86_64) {
				return nil, fmt.Errorf("unknown architecture overlap on native arch %#x", auditArch)
			}

			x32sysno, ok := maxSyscalls[libseccomp.ArchX32]
			if !ok {
				return nil, fmt.Errorf("missing %v in overlapping x86_64 arch: %v", libseccomp.ArchX32, maxSyscalls)
			}
			x86sysno, ok := maxSyscalls[libseccomp.ArchAMD64]
			if !ok {
				return nil, fmt.Errorf("missing %v in overlapping x86_64 arch: %v", libseccomp.ArchAMD64, maxSyscalls)
			}

			// The x32 ABI indicates that a syscall is being made by an x32
			// process by setting the 30th bit of the syscall number, but we
			// need to do some special-casing depending on whether we need to
			// do long jumps.
			if baseJumpEnosys+2 <= 255 {
				// For the simple case we want to have something like:
				//   jset (1<<30),1
				//   jgt [x86 syscall],[baseJumpEnosys+2],1
				//   jgt [x32 syscall],[baseJumpEnosys+1]
				//   ja [baseJumpFilter]
				section = append(section, []bpf.Instruction{
					// jset (1<<30),1
					bpf.JumpIf{Cond: bpf.JumpBitsSet, Val: 1 << 30, SkipTrue: 1},
					// jgt [x86 syscall],[baseJumpEnosys+2],1
					bpf.JumpIf{
						Cond:     bpf.JumpGreaterThan,
						Val:      uint32(x86sysno),
						SkipTrue: uint8(baseJumpEnosys + 2), SkipFalse: 1,
					},
					// jgt [x32 syscall],[baseJumpEnosys+1]
					bpf.JumpIf{
						Cond:     bpf.JumpGreaterThan,
						Val:      uint32(x32sysno),
						SkipTrue: uint8(baseJumpEnosys + 1),
					},
					// ja [baseJumpFilter]
					bpf.Jump{Skip: baseJumpFilter},
				}...)
			} else {
				// But if the [baseJumpEnosys+2] jump is larger than 255 we
				// need to do a long jump like so:
				//   jset (1<<30),1
				//   jgt [x86 syscall],1,2
				//   jle [x32 syscall],1
				//   ret [ENOSYS]
				//   ja [baseJumpFilter]
				section = append(section, []bpf.Instruction{
					// jset (1<<30),1
					bpf.JumpIf{Cond: bpf.JumpBitsSet, Val: 1 << 30, SkipTrue: 1},
					// jgt [x86 syscall],1,2
					bpf.JumpIf{
						Cond:     bpf.JumpGreaterThan,
						Val:      uint32(x86sysno),
						SkipTrue: 1, SkipFalse: 2,
					},
					// jle [x32 syscall],1
					bpf.JumpIf{
						Cond:     bpf.JumpLessOrEqual,
						Val:      uint32(x32sysno),
						SkipTrue: 1,
					},
					// ret [ENOSYS]
					bpf.RetConstant{Val: retErrnoEnosys},
					// ja [baseJumpFilter]
					bpf.Jump{Skip: baseJumpFilter},
				}...)
			}
		default:
			return nil, fmt.Errorf("invalid number of architecture overlaps: %v", len(maxSyscalls))
		}

		// Prepend this section to the tail.
		programTail = append(section, programTail...)

		// Update jump table.
		archJumpTable[auditArch] = uint32(len(programTail))
	}

	// Add a dummy "jump to filter" for any architecture we might miss below.
	// Such architectures will probably get the BadArch action of the filter
	// regardless.
	programTail = append([]bpf.Instruction{
		// ja [end of stub and start of filter]
		bpf.Jump{Skip: uint32(len(programTail))},
	}, programTail...)

	// Generate the jump rules for each architecture. This has to be done in
	// reverse as well for the same reason as above. We add to programTail
	// directly because the jumps are impacted by each architecture rule we add
	// as well.
	//
	// TODO: Maybe we want to optimise to avoid long jumps here? So sort the
	//       architectures based on how large the jumps are going to be, or
	//       re-sort the candidate architectures each time to make sure that we
	//       pick the largest jump which is going to be smaller than 255.
	for auditArch := range lastSyscalls {
		// We jump forwards but the jump table is calculated from the *END*.
		jump := uint32(len(programTail)) - archJumpTable[auditArch]

		// Same routine as above -- this is a basic jeq check, complicated
		// slightly if it turns out that we need to do a long jump.
		if jump <= 255 {
			programTail = append([]bpf.Instruction{
				// jeq [arch],[jump]
				bpf.JumpIf{
					Cond:     bpf.JumpEqual,
					Val:      uint32(auditArch),
					SkipTrue: uint8(jump),
				},
			}, programTail...)
		} else {
			programTail = append([]bpf.Instruction{
				// jne [arch],1
				bpf.JumpIf{
					Cond:     bpf.JumpNotEqual,
					Val:      uint32(auditArch),
					SkipTrue: 1,
				},
				// ja [jump]
				bpf.Jump{Skip: jump},
			}, programTail...)
		}
	}

	// Prepend the load instruction for the architecture.
	programTail = append([]bpf.Instruction{
		// load [4] (architecture)
		bpf.LoadAbsolute{Off: bpfSizeofInt, Size: bpfSizeofInt},
	}, programTail...)

	// And that's all folks!
	return programTail, nil
}

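// assemble converts the patched instruction stream back into the raw
// sock_filter form the kernel expects.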
func assemble(program []bpf.Instruction) ([]unix.SockFilter, error) {
	rawProgram, err := bpf.Assemble(program)
	if err != nil {
		return nil, fmt.Errorf("error assembling program: %w", err)
	}

	// Convert to []unix.SockFilter for use with unix.SockFprog.
	var filter []unix.SockFilter
	for _, insn := range rawProgram {
		filter = append(filter, unix.SockFilter{
			Code: insn.Op,
			Jt:   insn.Jt,
			Jf:   insn.Jf,
			K:    insn.K,
		})
	}
	return filter, nil
}

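// generatePatch returns the -ENOSYS stub to prepend to the filter, or nil if
// the configuration makes the stub unnecessary.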
func generatePatch(config *configs.Seccomp) ([]bpf.Instruction, error) {
	// If the default errno return is already ENOSYS, the filter will return
	// -ENOSYS for unknown syscalls on its own, so there is nothing to patch.
	if config.DefaultErrnoRet != nil && *config.DefaultErrnoRet == uint(retErrnoEnosys) {
		return nil, nil
	}
	// We only add the stub if the default action is not permissive.
	if isAllowAction(config.DefaultAction) {
		logrus.Debugf("seccomp: skipping -ENOSYS stub filter generation")
		return nil, nil
	}

	lastSyscalls, err := findLastSyscalls(config)
	if err != nil {
		return nil, fmt.Errorf("error finding last syscalls for -ENOSYS stub: %w", err)
	}
	stubProgram, err := generateEnosysStub(lastSyscalls)
	if err != nil {
		return nil, fmt.Errorf("error generating -ENOSYS stub: %w", err)
	}
	return stubProgram, nil
}

func enosysPatchFilter(config *configs.Seccomp, filter *libseccomp.ScmpFilter) ([]unix.SockFilter, error) {
	program, err := disassembleFilter(filter)
	if err != nil {
		return nil, fmt.Errorf("error disassembling original filter: %w", err)
	}

	patch, err := generatePatch(config)
	if err != nil {
		return nil, fmt.Errorf("error generating patch for filter: %w", err)
	}
	fullProgram := append(patch, program...)

	logrus.Debugf("seccomp: prepending -ENOSYS stub filter to user filter...")
	for idx, insn := range patch {
		logrus.Debugf("  [%4.1d] %s", idx, insn)
	}
	logrus.Debugf("  [....] --- original filter ---")

	fprog, err := assemble(fullProgram)
	if err != nil {
		return nil, fmt.Errorf("error assembling modified filter: %w", err)
	}
	return fprog, nil
}

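// filterFlags computes the seccomp(2) flags the filter should be loaded
// with, checking the libseccomp API level first so we never query filter
// attributes the library cannot support.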
func filterFlags(config *configs.Seccomp, filter *libseccomp.ScmpFilter) (flags uint, noNewPrivs bool, err error) {
	// Ignore the error since pre-2.4 libseccomp is treated as API level 0.
	apiLevel, _ := libseccomp.GetAPI()

	noNewPrivs, err = filter.GetNoNewPrivsBit()
	if err != nil {
		return 0, false, fmt.Errorf("unable to fetch no_new_privs filter bit: %w", err)
	}

	if apiLevel >= 3 {
		if logBit, err := filter.GetLogBit(); err != nil {
			return 0, false, fmt.Errorf("unable to fetch SECCOMP_FILTER_FLAG_LOG bit: %w", err)
		} else if logBit {
			flags |= uint(C.C_FILTER_FLAG_LOG)
		}
	}
	if apiLevel >= 4 {
		if ssb, err := filter.GetSSB(); err != nil {
			return 0, false, fmt.Errorf("unable to fetch SECCOMP_FILTER_FLAG_SPEC_ALLOW bit: %w", err)
		} else if ssb {
			flags |= uint(C.C_FILTER_FLAG_SPEC_ALLOW)
		}
	}
	// XXX: add newly supported filter flags above this line.

	for _, call := range config.Syscalls {
		if call.Action == configs.Notify {
			flags |= uint(C.C_FILTER_FLAG_NEW_LISTENER)
			break
		}
	}

	return
}

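// sysSeccompSetFilter loads the filter into the kernel, using the legacy
// prctl(PR_SET_SECCOMP) path when no flags are needed and seccomp(2)
// otherwise. The returned fd is only valid (i.e. not -1) if
// SECCOMP_FILTER_FLAG_NEW_LISTENER was requested.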
func sysSeccompSetFilter(flags uint, filter []unix.SockFilter) (fd int, err error) {
	// This debug output is validated in tests/integration/seccomp.bats
	// by the SECCOMP_FILTER_FLAG_* test.
	logrus.Debugf("seccomp filter flags: %d", flags)
	fprog := unix.SockFprog{
		Len:    uint16(len(filter)),
		Filter: &filter[0],
	}
	fd = -1 // only return a valid fd when C_FILTER_FLAG_NEW_LISTENER is set
	// If no seccomp flags were requested we can use the old-school prctl(2).
	if flags == 0 {
		err = unix.Prctl(unix.PR_SET_SECCOMP,
			unix.SECCOMP_MODE_FILTER,
			uintptr(unsafe.Pointer(&fprog)), 0, 0)
	} else {
		fdptr, _, errno := unix.RawSyscall(unix.SYS_SECCOMP,
			uintptr(C.C_SET_MODE_FILTER),
			uintptr(flags), uintptr(unsafe.Pointer(&fprog)))
		if errno != 0 {
			err = errno
		}
		if flags&uint(C.C_FILTER_FLAG_NEW_LISTENER) != 0 {
			fd = int(fdptr)
		}
	}
	runtime.KeepAlive(filter)
	runtime.KeepAlive(fprog)
	return
}

// PatchAndLoad takes a seccomp configuration and a libseccomp filter which has
// been pre-configured with the set of rules in the seccomp config. It then
// patches said filter to handle -ENOSYS in a much nicer manner than
// libseccomp's default action behaviour, and loads the patched filter
// into the kernel for the current process.
func PatchAndLoad(config *configs.Seccomp, filter *libseccomp.ScmpFilter) (int, error) {
	// Generate a patched filter.
	fprog, err := enosysPatchFilter(config, filter)
	if err != nil {
		return -1, fmt.Errorf("error patching filter: %w", err)
	}

	// Get the set of libseccomp flags set.
	seccompFlags, noNewPrivs, err := filterFlags(config, filter)
	if err != nil {
		return -1, fmt.Errorf("unable to fetch seccomp filter flags: %w", err)
	}

	// Set no_new_privs if it was requested, though in runc we handle
	// no_new_privs separately so warn if we hit this path.
	if noNewPrivs {
		logrus.Warnf("potentially misconfigured filter -- setting no_new_privs in seccomp path")
		if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
			return -1, fmt.Errorf("error enabling no_new_privs bit: %w", err)
		}
	}

	// Finally, load the filter.
	fd, err := sysSeccompSetFilter(seccompFlags, fprog)
	if err != nil {
		return -1, fmt.Errorf("error loading seccomp filter: %w", err)
	}

	return fd, nil
}