//go:build cgo && seccomp

package patchbpf

import (
"bytes"
"encoding/binary"
"errors"
"fmt"
"io"
"os"
"runtime"
"unsafe"
libseccomp "github.com/seccomp/libseccomp-golang"
"github.com/sirupsen/logrus"
"golang.org/x/net/bpf"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/configs"
)
// #cgo pkg-config: libseccomp
/*
#include <errno.h>
#include <stdint.h>
#include <seccomp.h>
#include <linux/seccomp.h>
const uint32_t C_ACT_ERRNO_ENOSYS = SCMP_ACT_ERRNO(ENOSYS);
// Copied from <linux/seccomp.h>.
#ifndef SECCOMP_SET_MODE_FILTER
# define SECCOMP_SET_MODE_FILTER 1
#endif
const uintptr_t C_SET_MODE_FILTER = SECCOMP_SET_MODE_FILTER;
#ifndef SECCOMP_FILTER_FLAG_LOG
# define SECCOMP_FILTER_FLAG_LOG (1UL << 1)
#endif
const uintptr_t C_FILTER_FLAG_LOG = SECCOMP_FILTER_FLAG_LOG;
#ifndef SECCOMP_FILTER_FLAG_SPEC_ALLOW
# define SECCOMP_FILTER_FLAG_SPEC_ALLOW (1UL << 2)
#endif
const uintptr_t C_FILTER_FLAG_SPEC_ALLOW = SECCOMP_FILTER_FLAG_SPEC_ALLOW;
#ifndef SECCOMP_FILTER_FLAG_NEW_LISTENER
# define SECCOMP_FILTER_FLAG_NEW_LISTENER (1UL << 3)
#endif
const uintptr_t C_FILTER_FLAG_NEW_LISTENER = SECCOMP_FILTER_FLAG_NEW_LISTENER;
#ifndef AUDIT_ARCH_RISCV64
#ifndef EM_RISCV
#define EM_RISCV 243
#endif
#define AUDIT_ARCH_RISCV64 (EM_RISCV|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
#endif
#ifndef AUDIT_ARCH_LOONGARCH64
#ifndef EM_LOONGARCH
#define EM_LOONGARCH 258
#endif
#define AUDIT_ARCH_LOONGARCH64 (EM_LOONGARCH|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
#endif
// We use the AUDIT_ARCH_* values because those are the ones used by the kernel
// and SCMP_ARCH_* sometimes has fake values (such as SCMP_ARCH_X32). But we
// use <seccomp.h> so we get libseccomp's fallback definitions of AUDIT_ARCH_*.
const uint32_t C_AUDIT_ARCH_I386 = AUDIT_ARCH_I386;
const uint32_t C_AUDIT_ARCH_X86_64 = AUDIT_ARCH_X86_64;
const uint32_t C_AUDIT_ARCH_ARM = AUDIT_ARCH_ARM;
const uint32_t C_AUDIT_ARCH_AARCH64 = AUDIT_ARCH_AARCH64;
const uint32_t C_AUDIT_ARCH_MIPS = AUDIT_ARCH_MIPS;
const uint32_t C_AUDIT_ARCH_MIPS64 = AUDIT_ARCH_MIPS64;
const uint32_t C_AUDIT_ARCH_MIPS64N32 = AUDIT_ARCH_MIPS64N32;
const uint32_t C_AUDIT_ARCH_MIPSEL = AUDIT_ARCH_MIPSEL;
const uint32_t C_AUDIT_ARCH_MIPSEL64 = AUDIT_ARCH_MIPSEL64;
const uint32_t C_AUDIT_ARCH_MIPSEL64N32 = AUDIT_ARCH_MIPSEL64N32;
const uint32_t C_AUDIT_ARCH_PPC = AUDIT_ARCH_PPC;
const uint32_t C_AUDIT_ARCH_PPC64 = AUDIT_ARCH_PPC64;
const uint32_t C_AUDIT_ARCH_PPC64LE = AUDIT_ARCH_PPC64LE;
const uint32_t C_AUDIT_ARCH_S390 = AUDIT_ARCH_S390;
const uint32_t C_AUDIT_ARCH_S390X = AUDIT_ARCH_S390X;
const uint32_t C_AUDIT_ARCH_RISCV64 = AUDIT_ARCH_RISCV64;
const uint32_t C_AUDIT_ARCH_LOONGARCH64 = AUDIT_ARCH_LOONGARCH64;
*/
import "C"
var retErrnoEnosys = uint32(C.C_ACT_ERRNO_ENOSYS)
// Assume sizeof(int) == 4 in the BPF program.
const bpfSizeofInt = 4
// This syscall is used for multiplexing "large" syscalls on s390(x). Unknown
// syscalls will end up with this syscall number, so we need to explicitly
// return -ENOSYS for this syscall on those architectures.
const s390xMultiplexSyscall libseccomp.ScmpSyscall = 0
func isAllowAction(action configs.Action) bool {
switch action {
// Trace is considered an "allow" action because a good tracer should
// support future syscalls (by handling -ENOSYS on its own), and giving
// -ENOSYS will be disruptive for emulation.
case configs.Allow, configs.Log, configs.Trace:
return true
default:
return false
}
}
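// parseProgram decodes a raw cBPF program (as emitted by seccomp_export_bpf,
// in host endianness) into a slice of bpf.RawInstructions.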
func parseProgram(rdr io.Reader) ([]bpf.RawInstruction, error) {
var program []bpf.RawInstruction
for {
// Read the next instruction. We have to use NativeEndian because
// seccomp_export_bpf outputs the program in *host* endian-ness.
var insn unix.SockFilter
if err := binary.Read(rdr, binary.NativeEndian, &insn); err != nil {
if errors.Is(err, io.EOF) {
// Parsing complete.
break
}
if errors.Is(err, io.ErrUnexpectedEOF) {
// Parsing stopped mid-instruction.
return nil, fmt.Errorf("program parsing halted mid-instruction: %w", err)
}
// All other errors.
return nil, fmt.Errorf("error parsing instructions: %w", err)
}
program = append(program, bpf.RawInstruction{
Op: insn.Code,
Jt: insn.Jt,
Jf: insn.Jf,
K: insn.K,
})
}
return program, nil
}
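// disassembleFilter exports the cBPF program from the given libseccomp filter
// through a pipe and disassembles it into bpf.Instructions so that it can be
// inspected and patched.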
func disassembleFilter(filter *libseccomp.ScmpFilter) ([]bpf.Instruction, error) {
rdr, wtr, err := os.Pipe()
if err != nil {
return nil, fmt.Errorf("error creating scratch pipe: %w", err)
}
defer wtr.Close()
defer rdr.Close()
readerBuffer := new(bytes.Buffer)
errChan := make(chan error, 1)
go func() {
_, err := io.Copy(readerBuffer, rdr)
errChan <- err
close(errChan)
}()
if err := filter.ExportBPF(wtr); err != nil {
return nil, fmt.Errorf("error exporting BPF: %w", err)
}
// Close so that the reader actually gets EOF.
_ = wtr.Close()
if copyErr := <-errChan; copyErr != nil {
return nil, fmt.Errorf("error reading from ExportBPF pipe: %w", copyErr)
}
// Parse the instructions.
rawProgram, err := parseProgram(readerBuffer)
if err != nil {
return nil, fmt.Errorf("parsing generated BPF filter: %w", err)
}
program, ok := bpf.Disassemble(rawProgram)
if !ok {
return nil, errors.New("could not disassemble entire BPF filter")
}
return program, nil
}
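// linuxAuditArch is the AUDIT_ARCH_* value the kernel reports in
// seccomp_data.arch for a given syscall.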
type linuxAuditArch uint32
const invalidArch linuxAuditArch = 0
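// scmpArchToAuditArch maps a libseccomp ScmpArch onto the corresponding
// kernel AUDIT_ARCH_* constant.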
func scmpArchToAuditArch(arch libseccomp.ScmpArch) (linuxAuditArch, error) {
switch arch {
case libseccomp.ArchNative:
// Convert to actual native architecture.
arch, err := libseccomp.GetNativeArch()
if err != nil {
return invalidArch, fmt.Errorf("unable to get native arch: %w", err)
}
return scmpArchToAuditArch(arch)
case libseccomp.ArchX86:
return linuxAuditArch(C.C_AUDIT_ARCH_I386), nil
case libseccomp.ArchAMD64, libseccomp.ArchX32:
// NOTE: x32 is treated like x86_64 except all x32 syscalls have the
// 30th bit of the syscall number set to indicate that it's not a
// normal x86_64 syscall.
return linuxAuditArch(C.C_AUDIT_ARCH_X86_64), nil
case libseccomp.ArchARM:
return linuxAuditArch(C.C_AUDIT_ARCH_ARM), nil
case libseccomp.ArchARM64:
return linuxAuditArch(C.C_AUDIT_ARCH_AARCH64), nil
case libseccomp.ArchMIPS:
return linuxAuditArch(C.C_AUDIT_ARCH_MIPS), nil
case libseccomp.ArchMIPS64:
return linuxAuditArch(C.C_AUDIT_ARCH_MIPS64), nil
case libseccomp.ArchMIPS64N32:
return linuxAuditArch(C.C_AUDIT_ARCH_MIPS64N32), nil
case libseccomp.ArchMIPSEL:
return linuxAuditArch(C.C_AUDIT_ARCH_MIPSEL), nil
case libseccomp.ArchMIPSEL64:
return linuxAuditArch(C.C_AUDIT_ARCH_MIPSEL64), nil
case libseccomp.ArchMIPSEL64N32:
return linuxAuditArch(C.C_AUDIT_ARCH_MIPSEL64N32), nil
case libseccomp.ArchPPC:
return linuxAuditArch(C.C_AUDIT_ARCH_PPC), nil
case libseccomp.ArchPPC64:
return linuxAuditArch(C.C_AUDIT_ARCH_PPC64), nil
case libseccomp.ArchPPC64LE:
return linuxAuditArch(C.C_AUDIT_ARCH_PPC64LE), nil
case libseccomp.ArchS390:
return linuxAuditArch(C.C_AUDIT_ARCH_S390), nil
case libseccomp.ArchS390X:
return linuxAuditArch(C.C_AUDIT_ARCH_S390X), nil
case libseccomp.ArchRISCV64:
return linuxAuditArch(C.C_AUDIT_ARCH_RISCV64), nil
case libseccomp.ArchLOONGARCH64:
return linuxAuditArch(C.C_AUDIT_ARCH_LOONGARCH64), nil
default:
return invalidArch, fmt.Errorf("unknown architecture: %v", arch)
}
}
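// lastSyscallMap maps each audit architecture to the largest syscall number
// referenced by the filter for every ScmpArch sharing that audit arch
// (x86_64 and x32 share AUDIT_ARCH_X86_64, for instance).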
type lastSyscallMap map[linuxAuditArch]map[libseccomp.ScmpArch]libseccomp.ScmpSyscall
// Figure out largest syscall number referenced in the filter for each
// architecture. We will be generating code based on the native architecture
// representation, but SCMP_ARCH_X32 means we have to track cases where the
// same architecture has different largest syscalls based on the mode.
func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) {
scmpArchs := make(map[libseccomp.ScmpArch]struct{})
for _, ociArch := range config.Architectures {
arch, err := libseccomp.GetArchFromString(ociArch)
if err != nil {
return nil, fmt.Errorf("unable to validate seccomp architecture: %w", err)
}
scmpArchs[arch] = struct{}{}
}
// On architectures like ppc64le, Docker inexplicably doesn't include the
// native architecture in the architecture list, which results in no
// architectures being present in the list at all (rendering the ENOSYS
// stub a no-op). So, always include the native architecture.
if nativeScmpArch, err := libseccomp.GetNativeArch(); err != nil {
return nil, fmt.Errorf("unable to get native arch: %w", err)
} else if _, ok := scmpArchs[nativeScmpArch]; !ok {
logrus.Debugf("seccomp: adding implied native architecture %v to config set", nativeScmpArch)
scmpArchs[nativeScmpArch] = struct{}{}
}
logrus.Debugf("seccomp: configured architecture set: %s", scmpArchs)
// Only loop over architectures which are present in the filter. Any other
// architectures will get the libseccomp bad architecture action anyway.
lastSyscalls := make(lastSyscallMap)
for arch := range scmpArchs {
auditArch, err := scmpArchToAuditArch(arch)
if err != nil {
return nil, fmt.Errorf("cannot map architecture %v to AUDIT_ARCH_ constant: %w", arch, err)
}
if _, ok := lastSyscalls[auditArch]; !ok {
lastSyscalls[auditArch] = map[libseccomp.ScmpArch]libseccomp.ScmpSyscall{}
}
if _, ok := lastSyscalls[auditArch][arch]; ok {
// Because of ArchNative we may hit the same entry multiple times.
// Just skip it if we've seen this (linuxAuditArch, ScmpArch)
// combination before.
continue
}
// Find the largest syscall in the filter for this architecture.
var largestSyscall libseccomp.ScmpSyscall
for _, rule := range config.Syscalls {
sysno, err := libseccomp.GetSyscallFromNameByArch(rule.Name, arch)
if err != nil {
// Ignore unknown syscalls.
continue
}
if sysno > largestSyscall {
largestSyscall = sysno
}
}
if largestSyscall != 0 {
logrus.Debugf("seccomp: largest syscall number for arch %v is %v", arch, largestSyscall)
lastSyscalls[auditArch][arch] = largestSyscall
} else {
logrus.Warnf("could not find any syscalls for arch %v", arch)
delete(lastSyscalls[auditArch], arch)
}
}
return lastSyscalls, nil
}
// FIXME FIXME FIXME
//
// This solution is less than ideal. In the future it would be great to have
// per-arch information about which syscalls were added in which kernel
// versions so we can create far more accurate filter rules (handling holes in
// the syscall table and determining -ENOSYS requirements based on kernel
// minimum version alone).
//
// This implementation can in principle cause issues with syscalls like
// close_range(2) which were added out-of-order in the syscall table between
// kernel releases.
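//
// Roughly speaking, the stub generated here has the following shape (in cBPF
// pseudo-assembly -- the concrete jump targets depend on how many
// architectures are configured):
//
//	ld [4]                      ; load seccomp_data.arch
//	jeq [arch],[arch section]   ; one check per configured audit arch
//	...
//	ja [filter]                 ; unknown arch: fall through to the filter
//	; one section per configured arch:
//	ld [0]                      ; load the syscall number
//	jgt [last syscall],[enosys] ; syscalls newer than the filter knows about
//	ja [filter]
//	...
//	ja [filter]                 ; skip over the -ENOSYS return
//	ret ERRNO(ENOSYS)
//	; ... the original libseccomp-generated filter follows ...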
func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error) {
// A jump-table for each linuxAuditArch used to generate the initial
// conditional jumps -- measured from the *END* of the program so they
// remain valid after prepending to the tail.
archJumpTable := map[linuxAuditArch]uint32{}
// Generate our own -ENOSYS rules for each architecture. They have to be
// generated in reverse (prepended to the tail of the program) because the
// JumpIf jumps need to be computed from the end of the program.
programTail := []bpf.Instruction{
// Fall-through rules jump into the filter.
bpf.Jump{Skip: 1},
// Rules which jump to here get -ENOSYS.
bpf.RetConstant{Val: retErrnoEnosys},
}
// Generate the syscall -ENOSYS rules.
for auditArch, maxSyscalls := range lastSyscalls {
// The number of instructions from the tail of this section which need
// to be jumped in order to reach the -ENOSYS return. If the section
// does not jump, it will fall through to the actual filter.
baseJumpEnosys := uint32(len(programTail) - 1)
baseJumpFilter := baseJumpEnosys + 1
// Add the load instruction for the syscall number -- we jump here
// directly from the arch code so we need to do it here. Sadly we can't
// share this code between architecture branches.
section := []bpf.Instruction{
// load [0] (syscall number)
bpf.LoadAbsolute{Off: 0, Size: bpfSizeofInt},
}
switch len(maxSyscalls) {
case 0:
// No syscalls found for this arch -- skip it and move on.
continue
case 1:
// Get the only syscall and scmpArch in the map.
var (
scmpArch libseccomp.ScmpArch
sysno libseccomp.ScmpSyscall
)
for arch, no := range maxSyscalls {
sysno = no
scmpArch = arch
}
switch scmpArch {
// Return -ENOSYS for setup(2) on s390(x). This syscall is used for
// multiplexing "large syscall number" syscalls, but if the syscall
// number is not known to the kernel then the syscall number is
// left unchanged (and because it is sysno=0, you'll end up with
// EPERM for syscalls the kernel doesn't know about).
//
// The actual setup(2) syscall is never used by userspace anymore
// (and hasn't existed for decades) outside of this multiplexing
// scheme so returning -ENOSYS is fine.
case libseccomp.ArchS390, libseccomp.ArchS390X:
section = append(section, []bpf.Instruction{
// jne [setup=0],1
bpf.JumpIf{
Cond: bpf.JumpNotEqual,
Val: uint32(s390xMultiplexSyscall),
SkipTrue: 1,
},
// ret [ENOSYS]
bpf.RetConstant{Val: retErrnoEnosys},
}...)
}
// The simplest case just boils down to a single jgt instruction,
// with special handling if baseJumpEnosys is larger than 255 (and
// thus a long jump is required).
var sectionTail []bpf.Instruction
if baseJumpEnosys+1 <= 255 {
sectionTail = []bpf.Instruction{
// jgt [syscall],[baseJumpEnosys+1]
bpf.JumpIf{
Cond: bpf.JumpGreaterThan,
Val: uint32(sysno),
SkipTrue: uint8(baseJumpEnosys + 1),
},
// ja [baseJumpFilter]
bpf.Jump{Skip: baseJumpFilter},
}
} else {
sectionTail = []bpf.Instruction{
// jle [syscall],1
bpf.JumpIf{Cond: bpf.JumpLessOrEqual, Val: uint32(sysno), SkipTrue: 1},
// ret [ENOSYS]
bpf.RetConstant{Val: retErrnoEnosys},
// ja [baseJumpFilter]
bpf.Jump{Skip: baseJumpFilter},
}
}
// If we're on x86 we need to add a check for x32 and if we're in
// the wrong mode we jump over the section.
if uint32(auditArch) == uint32(C.C_AUDIT_ARCH_X86_64) {
// Generate a prefix to check the mode.
switch scmpArch {
case libseccomp.ArchAMD64:
sectionTail = append([]bpf.Instruction{
// jset (1<<30),[len(tail)-1]
bpf.JumpIf{
Cond: bpf.JumpBitsSet,
Val: 1 << 30,
SkipTrue: uint8(len(sectionTail) - 1),
},
}, sectionTail...)
case libseccomp.ArchX32:
sectionTail = append([]bpf.Instruction{
// jset (1<<30),0,[len(tail)-1]
bpf.JumpIf{
Cond: bpf.JumpBitsNotSet,
Val: 1 << 30,
SkipTrue: uint8(len(sectionTail) - 1),
},
}, sectionTail...)
default:
return nil, fmt.Errorf("unknown amd64 native architecture %#x", scmpArch)
}
}
section = append(section, sectionTail...)
case 2:
// x32 and x86_64 are a unique case; we can't handle any others.
if uint32(auditArch) != uint32(C.C_AUDIT_ARCH_X86_64) {
return nil, fmt.Errorf("unknown architecture overlap on native arch %#x", auditArch)
}
x32sysno, ok := maxSyscalls[libseccomp.ArchX32]
if !ok {
return nil, fmt.Errorf("missing %v in overlapping x86_64 arch: %v", libseccomp.ArchX32, maxSyscalls)
}
x86sysno, ok := maxSyscalls[libseccomp.ArchAMD64]
if !ok {
return nil, fmt.Errorf("missing %v in overlapping x86_64 arch: %v", libseccomp.ArchAMD64, maxSyscalls)
}
// The x32 ABI indicates that a syscall is being made by an x32
// process by setting the 30th bit of the syscall number, but we
// need to do some special-casing depending on whether we need to
// do long jumps.
if baseJumpEnosys+2 <= 255 {
// For the simple case we want to have something like:
// jset (1<<30),1
// jgt [x86 syscall],[baseJumpEnosys+2],1
// jgt [x32 syscall],[baseJumpEnosys+1]
// ja [baseJumpFilter]
section = append(section, []bpf.Instruction{
// jset (1<<30),1
bpf.JumpIf{Cond: bpf.JumpBitsSet, Val: 1 << 30, SkipTrue: 1},
// jgt [x86 syscall],[baseJumpEnosys+2],1
bpf.JumpIf{
Cond: bpf.JumpGreaterThan,
Val: uint32(x86sysno),
SkipTrue: uint8(baseJumpEnosys + 2), SkipFalse: 1,
},
// jgt [x32 syscall],[baseJumpEnosys+1]
bpf.JumpIf{
Cond: bpf.JumpGreaterThan,
Val: uint32(x32sysno),
SkipTrue: uint8(baseJumpEnosys + 1),
},
// ja [baseJumpFilter]
bpf.Jump{Skip: baseJumpFilter},
}...)
} else {
// But if the [baseJumpEnosys+2] jump is larger than 255 we
// need to do a long jump like so:
// jset (1<<30),1
// jgt [x86 syscall],1,2
// jle [x32 syscall],1
// ret [ENOSYS]
// ja [baseJumpFilter]
section = append(section, []bpf.Instruction{
// jset (1<<30),1
bpf.JumpIf{Cond: bpf.JumpBitsSet, Val: 1 << 30, SkipTrue: 1},
// jgt [x86 syscall],1,2
bpf.JumpIf{
Cond: bpf.JumpGreaterThan,
Val: uint32(x86sysno),
SkipTrue: 1, SkipFalse: 2,
},
// jle [x32 syscall],1
bpf.JumpIf{
Cond: bpf.JumpLessOrEqual,
Val: uint32(x32sysno),
SkipTrue: 1,
},
// ret [ENOSYS]
bpf.RetConstant{Val: retErrnoEnosys},
// ja [baseJumpFilter]
bpf.Jump{Skip: baseJumpFilter},
}...)
}
default:
return nil, fmt.Errorf("invalid number of architecture overlaps: %v", len(maxSyscalls))
}
// Prepend this section to the tail.
programTail = append(section, programTail...)
// Update jump table.
archJumpTable[auditArch] = uint32(len(programTail))
}
// Add a dummy "jump to filter" for any architecture we might miss below.
// Such architectures will probably get the BadArch action of the filter
// regardless.
programTail = append([]bpf.Instruction{
// ja [end of stub and start of filter]
bpf.Jump{Skip: uint32(len(programTail))},
}, programTail...)
// Generate the jump rules for each architecture. This has to be done in
// reverse as well for the same reason as above. We add to programTail
// directly because the jumps are impacted by each architecture rule we add
// as well.
//
// TODO: Maybe we want to optimise to avoid long jumps here? So sort the
// architectures based on how large the jumps are going to be, or
// re-sort the candidate architectures each time to make sure that we
// pick the largest jump which is going to be smaller than 255.
for auditArch := range lastSyscalls {
// We jump forwards but the jump table is calculated from the *END*.
jump := uint32(len(programTail)) - archJumpTable[auditArch]
// Same routine as above -- this is a basic jeq check, complicated
// slightly if it turns out that we need to do a long jump.
if jump <= 255 {
programTail = append([]bpf.Instruction{
// jeq [arch],[jump]
bpf.JumpIf{
Cond: bpf.JumpEqual,
Val: uint32(auditArch),
SkipTrue: uint8(jump),
},
}, programTail...)
} else {
programTail = append([]bpf.Instruction{
// jne [arch],1
bpf.JumpIf{
Cond: bpf.JumpNotEqual,
Val: uint32(auditArch),
SkipTrue: 1,
},
// ja [jump]
bpf.Jump{Skip: jump},
}, programTail...)
}
}
// Prepend the load instruction for the architecture.
programTail = append([]bpf.Instruction{
// load [4] (architecture)
bpf.LoadAbsolute{Off: bpfSizeofInt, Size: bpfSizeofInt},
}, programTail...)
// And that's all folks!
return programTail, nil
}
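// assemble compiles a cBPF program into raw unix.SockFilter instructions
// ready to be handed to the kernel.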
func assemble(program []bpf.Instruction) ([]unix.SockFilter, error) {
rawProgram, err := bpf.Assemble(program)
if err != nil {
return nil, fmt.Errorf("error assembling program: %w", err)
}
// Convert to []unix.SockFilter so the program can be loaded into the kernel.
var filter []unix.SockFilter
for _, insn := range rawProgram {
filter = append(filter, unix.SockFilter{
Code: insn.Op,
Jt: insn.Jt,
Jf: insn.Jf,
K: insn.K,
})
}
return filter, nil
}
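// generatePatch generates the -ENOSYS stub to be prepended to the
// libseccomp-generated filter, or nil if no stub is needed for this config.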
func generatePatch(config *configs.Seccomp) ([]bpf.Instruction, error) {
// Patch the generated cBPF only when defaultErrnoRet is unset, or is set to
// something other than ENOSYS -- if it is already ENOSYS, the stub would be
// redundant.
if config.DefaultErrnoRet != nil && *config.DefaultErrnoRet == uint(retErrnoEnosys) {
return nil, nil
}
// We only add the stub if the default action is not permissive.
if isAllowAction(config.DefaultAction) {
logrus.Debugf("seccomp: skipping -ENOSYS stub filter generation")
return nil, nil
}
lastSyscalls, err := findLastSyscalls(config)
if err != nil {
return nil, fmt.Errorf("error finding last syscalls for -ENOSYS stub: %w", err)
}
stubProgram, err := generateEnosysStub(lastSyscalls)
if err != nil {
return nil, fmt.Errorf("error generating -ENOSYS stub: %w", err)
}
return stubProgram, nil
}
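// enosysPatchFilter disassembles the libseccomp-generated filter, prepends
// the -ENOSYS stub to it, and re-assembles it into a loadable raw program.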
func enosysPatchFilter(config *configs.Seccomp, filter *libseccomp.ScmpFilter) ([]unix.SockFilter, error) {
program, err := disassembleFilter(filter)
if err != nil {
return nil, fmt.Errorf("error disassembling original filter: %w", err)
}
patch, err := generatePatch(config)
if err != nil {
return nil, fmt.Errorf("error generating patch for filter: %w", err)
}
fullProgram := append(patch, program...)
logrus.Debugf("seccomp: prepending -ENOSYS stub filter to user filter...")
for idx, insn := range patch {
logrus.Debugf(" [%4.1d] %s", idx, insn)
}
logrus.Debugf(" [....] --- original filter ---")
fprog, err := assemble(fullProgram)
if err != nil {
return nil, fmt.Errorf("error assembling modified filter: %w", err)
}
return fprog, nil
}
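// filterFlags computes the seccomp(2) filter flags (SECCOMP_FILTER_FLAG_*)
// implied by the configuration and the libseccomp filter attributes, and
// reports whether the filter requested the no_new_privs bit.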
func filterFlags(config *configs.Seccomp, filter *libseccomp.ScmpFilter) (flags uint, noNewPrivs bool, err error) {
// Ignore the error since pre-2.4 libseccomp is treated as API level 0.
apiLevel, _ := libseccomp.GetAPI()
noNewPrivs, err = filter.GetNoNewPrivsBit()
if err != nil {
return 0, false, fmt.Errorf("unable to fetch no_new_privs filter bit: %w", err)
}
if apiLevel >= 3 {
if logBit, err := filter.GetLogBit(); err != nil {
return 0, false, fmt.Errorf("unable to fetch SECCOMP_FILTER_FLAG_LOG bit: %w", err)
} else if logBit {
flags |= uint(C.C_FILTER_FLAG_LOG)
}
}
if apiLevel >= 4 {
if ssb, err := filter.GetSSB(); err != nil {
return 0, false, fmt.Errorf("unable to fetch SECCOMP_FILTER_FLAG_SPEC_ALLOW bit: %w", err)
} else if ssb {
flags |= uint(C.C_FILTER_FLAG_SPEC_ALLOW)
}
}
// XXX: add newly supported filter flags above this line.
for _, call := range config.Syscalls {
if call.Action == configs.Notify {
flags |= uint(C.C_FILTER_FLAG_NEW_LISTENER)
break
}
}
return
}
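// sysSeccompSetFilter loads the given cBPF program into the kernel, using
// seccomp(SECCOMP_SET_MODE_FILTER) if any flags are set and falling back to
// the classic prctl(PR_SET_SECCOMP) otherwise. The returned fd is only valid
// when SECCOMP_FILTER_FLAG_NEW_LISTENER was requested.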
func sysSeccompSetFilter(flags uint, filter []unix.SockFilter) (fd int, err error) {
// This debug output is validated in tests/integration/seccomp.bats
// by the SECCOMP_FILTER_FLAG_* test.
logrus.Debugf("seccomp filter flags: %d", flags)
fprog := unix.SockFprog{
Len: uint16(len(filter)),
Filter: &filter[0],
}
fd = -1 // only return a valid fd when C_FILTER_FLAG_NEW_LISTENER is set
// If no seccomp flags were requested we can use the old-school prctl(2).
if flags == 0 {
err = unix.Prctl(unix.PR_SET_SECCOMP,
unix.SECCOMP_MODE_FILTER,
uintptr(unsafe.Pointer(&fprog)), 0, 0)
} else {
fdptr, _, errno := unix.RawSyscall(unix.SYS_SECCOMP,
uintptr(C.C_SET_MODE_FILTER),
uintptr(flags), uintptr(unsafe.Pointer(&fprog)))
if errno != 0 {
err = errno
}
if flags&uint(C.C_FILTER_FLAG_NEW_LISTENER) != 0 {
fd = int(fdptr)
}
}
runtime.KeepAlive(filter)
runtime.KeepAlive(fprog)
return
}
// PatchAndLoad takes a seccomp configuration and a libseccomp filter which has
// been pre-configured with the set of rules in the seccomp config. It then
// patches said filter to handle -ENOSYS in a much nicer manner than
// libseccomp's default action behaviour, and loads the patched filter
// into the kernel for the current process.
func PatchAndLoad(config *configs.Seccomp, filter *libseccomp.ScmpFilter) (int, error) {
// Generate a patched filter.
fprog, err := enosysPatchFilter(config, filter)
if err != nil {
return -1, fmt.Errorf("error patching filter: %w", err)
}
// Get the set of libseccomp flags set.
seccompFlags, noNewPrivs, err := filterFlags(config, filter)
if err != nil {
return -1, fmt.Errorf("unable to fetch seccomp filter flags: %w", err)
}
// Set no_new_privs if it was requested, though in runc we handle
// no_new_privs separately so warn if we hit this path.
if noNewPrivs {
logrus.Warnf("potentially misconfigured filter -- setting no_new_privs in seccomp path")
if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
return -1, fmt.Errorf("error enabling no_new_privs bit: %w", err)
}
}
// Finally, load the filter.
fd, err := sysSeccompSetFilter(seccompFlags, fprog)
if err != nil {
return -1, fmt.Errorf("error loading seccomp filter: %w", err)
}
return fd, nil
}