Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

seccomp: patchbpf: always include native architecture in stub #4219

Merged
merged 2 commits into from
Mar 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 59 additions & 44 deletions libcontainer/seccomp/patchbpf/enosys_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -171,87 +171,101 @@ func disassembleFilter(filter *libseccomp.ScmpFilter) ([]bpf.Instruction, error)
return program, nil
}

// linuxAuditArch (called nativeArch on the removed side of this diff) is a
// uint32 holding one of the C_AUDIT_ARCH_* constants. It is the key used to
// group per-architecture data when building the -ENOSYS stub.
type nativeArch uint32
type linuxAuditArch uint32

// invalidArch is the zero value returned together with a non-nil error when
// an architecture cannot be mapped to an audit architecture.
const invalidArch nativeArch = 0
const invalidArch linuxAuditArch = 0

// scmpArchToAuditArch (named archToNative on the removed side of this diff)
// converts a libseccomp architecture token into the matching C_AUDIT_ARCH_*
// constant, typed as linuxAuditArch.
//
// libseccomp.ArchNative is first resolved with libseccomp.GetNativeArch and
// the result is then mapped by a recursive call. ArchAMD64 and ArchX32 both
// map to C_AUDIT_ARCH_X86_64.
//
// It returns invalidArch and a non-nil error if the native architecture
// cannot be determined or if arch is not one of the cases listed below.
func archToNative(arch libseccomp.ScmpArch) (nativeArch, error) {
func scmpArchToAuditArch(arch libseccomp.ScmpArch) (linuxAuditArch, error) {
switch arch {
case libseccomp.ArchNative:
// Convert to actual native architecture.
arch, err := libseccomp.GetNativeArch()
if err != nil {
return invalidArch, fmt.Errorf("unable to get native arch: %w", err)
}
return archToNative(arch)
return scmpArchToAuditArch(arch)
case libseccomp.ArchX86:
return nativeArch(C.C_AUDIT_ARCH_I386), nil
return linuxAuditArch(C.C_AUDIT_ARCH_I386), nil
case libseccomp.ArchAMD64, libseccomp.ArchX32:
// NOTE: x32 is treated like x86_64 except all x32 syscalls have the
// 30th bit of the syscall number set to indicate that it's not a
// normal x86_64 syscall.
return nativeArch(C.C_AUDIT_ARCH_X86_64), nil
return linuxAuditArch(C.C_AUDIT_ARCH_X86_64), nil
case libseccomp.ArchARM:
return nativeArch(C.C_AUDIT_ARCH_ARM), nil
return linuxAuditArch(C.C_AUDIT_ARCH_ARM), nil
case libseccomp.ArchARM64:
return nativeArch(C.C_AUDIT_ARCH_AARCH64), nil
return linuxAuditArch(C.C_AUDIT_ARCH_AARCH64), nil
case libseccomp.ArchMIPS:
return nativeArch(C.C_AUDIT_ARCH_MIPS), nil
return linuxAuditArch(C.C_AUDIT_ARCH_MIPS), nil
case libseccomp.ArchMIPS64:
return nativeArch(C.C_AUDIT_ARCH_MIPS64), nil
return linuxAuditArch(C.C_AUDIT_ARCH_MIPS64), nil
case libseccomp.ArchMIPS64N32:
return nativeArch(C.C_AUDIT_ARCH_MIPS64N32), nil
return linuxAuditArch(C.C_AUDIT_ARCH_MIPS64N32), nil
case libseccomp.ArchMIPSEL:
return nativeArch(C.C_AUDIT_ARCH_MIPSEL), nil
return linuxAuditArch(C.C_AUDIT_ARCH_MIPSEL), nil
case libseccomp.ArchMIPSEL64:
return nativeArch(C.C_AUDIT_ARCH_MIPSEL64), nil
return linuxAuditArch(C.C_AUDIT_ARCH_MIPSEL64), nil
case libseccomp.ArchMIPSEL64N32:
return nativeArch(C.C_AUDIT_ARCH_MIPSEL64N32), nil
return linuxAuditArch(C.C_AUDIT_ARCH_MIPSEL64N32), nil
case libseccomp.ArchPPC:
return nativeArch(C.C_AUDIT_ARCH_PPC), nil
return linuxAuditArch(C.C_AUDIT_ARCH_PPC), nil
case libseccomp.ArchPPC64:
return nativeArch(C.C_AUDIT_ARCH_PPC64), nil
return linuxAuditArch(C.C_AUDIT_ARCH_PPC64), nil
case libseccomp.ArchPPC64LE:
return nativeArch(C.C_AUDIT_ARCH_PPC64LE), nil
return linuxAuditArch(C.C_AUDIT_ARCH_PPC64LE), nil
case libseccomp.ArchS390:
return nativeArch(C.C_AUDIT_ARCH_S390), nil
return linuxAuditArch(C.C_AUDIT_ARCH_S390), nil
case libseccomp.ArchS390X:
return nativeArch(C.C_AUDIT_ARCH_S390X), nil
return linuxAuditArch(C.C_AUDIT_ARCH_S390X), nil
case libseccomp.ArchRISCV64:
return nativeArch(C.C_AUDIT_ARCH_RISCV64), nil
return linuxAuditArch(C.C_AUDIT_ARCH_RISCV64), nil
default:
// Any architecture without an explicit case above is rejected.
return invalidArch, fmt.Errorf("unknown architecture: %v", arch)
}
}

// lastSyscallMap is keyed first by audit architecture and then by libseccomp
// architecture. findLastSyscalls stores in each leaf the largest syscall
// number it found for that (audit arch, ScmpArch) pair. The second level
// exists because ArchAMD64 and ArchX32 map to the same audit architecture.
type lastSyscallMap map[nativeArch]map[libseccomp.ScmpArch]libseccomp.ScmpSyscall
type lastSyscallMap map[linuxAuditArch]map[libseccomp.ScmpArch]libseccomp.ScmpSyscall

// Figure out largest syscall number referenced in the filter for each
// architecture. We will be generating code based on the native architecture
// representation, but SCMP_ARCH_X32 means we have to track cases where the
// same architecture has different largest syscalls based on the mode.
func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) {
lastSyscalls := make(lastSyscallMap)
// Only loop over architectures which are present in the filter. Any other
// architectures will get the libseccomp bad architecture action anyway.
scmpArchs := make(map[libseccomp.ScmpArch]struct{})
for _, ociArch := range config.Architectures {
arch, err := libseccomp.GetArchFromString(ociArch)
if err != nil {
return nil, fmt.Errorf("unable to validate seccomp architecture: %w", err)
}
scmpArchs[arch] = struct{}{}
}
// On architectures like ppc64le, Docker inexplicably doesn't include the
// native architecture in the architecture list which results in no
// architectures being present in the list at all (rendering the ENOSYS
// stub a no-op). So, always include the native architecture.
if nativeScmpArch, err := libseccomp.GetNativeArch(); err != nil {
return nil, fmt.Errorf("unable to get native arch: %w", err)
} else if _, ok := scmpArchs[nativeScmpArch]; !ok {
logrus.Debugf("seccomp: adding implied native architecture %v to config set", nativeScmpArch)
scmpArchs[nativeScmpArch] = struct{}{}
}
logrus.Debugf("seccomp: configured architecture set: %s", scmpArchs)

// Figure out native architecture representation of the architecture.
nativeArch, err := archToNative(arch)
// Only loop over architectures which are present in the filter. Any other
// architectures will get the libseccomp bad architecture action anyway.
lastSyscalls := make(lastSyscallMap)
for arch := range scmpArchs {
auditArch, err := scmpArchToAuditArch(arch)
if err != nil {
return nil, fmt.Errorf("cannot map architecture %v to AUDIT_ARCH_ constant: %w", arch, err)
}

if _, ok := lastSyscalls[nativeArch]; !ok {
lastSyscalls[nativeArch] = map[libseccomp.ScmpArch]libseccomp.ScmpSyscall{}
if _, ok := lastSyscalls[auditArch]; !ok {
lastSyscalls[auditArch] = map[libseccomp.ScmpArch]libseccomp.ScmpSyscall{}
}
if _, ok := lastSyscalls[nativeArch][arch]; ok {
if _, ok := lastSyscalls[auditArch][arch]; ok {
// Because of ArchNative we may hit the same entry multiple times.
// Just skip it if we've seen this (nativeArch, ScmpArch)
// Just skip it if we've seen this (linuxAuditArch, ScmpArch)
// combination before.
continue
}
Expand All @@ -269,10 +283,11 @@ func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) {
}
}
if largestSyscall != 0 {
lastSyscalls[nativeArch][arch] = largestSyscall
logrus.Debugf("seccomp: largest syscall number for arch %v is %v", arch, largestSyscall)
lastSyscalls[auditArch][arch] = largestSyscall
} else {
logrus.Warnf("could not find any syscalls for arch %s", ociArch)
delete(lastSyscalls[nativeArch], arch)
logrus.Warnf("could not find any syscalls for arch %v", arch)
delete(lastSyscalls[auditArch], arch)
}
}
return lastSyscalls, nil
Expand All @@ -290,10 +305,10 @@ func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) {
// close_range(2) which were added out-of-order in the syscall table between
// kernel releases.
func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error) {
// A jump-table for each nativeArch used to generate the initial
// A jump-table for each linuxAuditArch used to generate the initial
// conditional jumps -- measured from the *END* of the program so they
// remain valid after prepending to the tail.
archJumpTable := map[nativeArch]uint32{}
archJumpTable := map[linuxAuditArch]uint32{}

// Generate our own -ENOSYS rules for each architecture. They have to be
// generated in reverse (prepended to the tail of the program) because the
Expand All @@ -306,7 +321,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
}

// Generate the syscall -ENOSYS rules.
for nativeArch, maxSyscalls := range lastSyscalls {
for auditArch, maxSyscalls := range lastSyscalls {
// The number of instructions from the tail of this section which need
// to be jumped in order to reach the -ENOSYS return. If the section
// does not jump, it will fall through to the actual filter.
Expand Down Expand Up @@ -387,7 +402,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)

// If we're on x86 we need to add a check for x32 and if we're in
// the wrong mode we jump over the section.
if uint32(nativeArch) == uint32(C.C_AUDIT_ARCH_X86_64) {
if uint32(auditArch) == uint32(C.C_AUDIT_ARCH_X86_64) {
// Generate a prefix to check the mode.
switch scmpArch {
case libseccomp.ArchAMD64:
Expand Down Expand Up @@ -416,8 +431,8 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
section = append(section, sectionTail...)
case 2:
// x32 and x86_64 are a unique case, we can't handle any others.
if uint32(nativeArch) != uint32(C.C_AUDIT_ARCH_X86_64) {
return nil, fmt.Errorf("unknown architecture overlap on native arch %#x", nativeArch)
if uint32(auditArch) != uint32(C.C_AUDIT_ARCH_X86_64) {
return nil, fmt.Errorf("unknown architecture overlap on native arch %#x", auditArch)
}

x32sysno, ok := maxSyscalls[libseccomp.ArchX32]
Expand Down Expand Up @@ -494,7 +509,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
programTail = append(section, programTail...)

// Update jump table.
archJumpTable[nativeArch] = uint32(len(programTail))
archJumpTable[auditArch] = uint32(len(programTail))
}

// Add a dummy "jump to filter" for any architecture we might miss below.
Expand All @@ -514,9 +529,9 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
// architectures based on how large the jumps are going to be, or
// re-sort the candidate architectures each time to make sure that we
// pick the largest jump which is going to be smaller than 255.
for nativeArch := range lastSyscalls {
for auditArch := range lastSyscalls {
// We jump forwards but the jump table is calculated from the *END*.
jump := uint32(len(programTail)) - archJumpTable[nativeArch]
jump := uint32(len(programTail)) - archJumpTable[auditArch]

// Same routine as above -- this is a basic jeq check, complicated
// slightly if it turns out that we need to do a long jump.
Expand All @@ -525,7 +540,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
// jeq [arch],[jump]
bpf.JumpIf{
Cond: bpf.JumpEqual,
Val: uint32(nativeArch),
Val: uint32(auditArch),
SkipTrue: uint8(jump),
},
}, programTail...)
Expand All @@ -534,7 +549,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
// jne [arch],1
bpf.JumpIf{
Cond: bpf.JumpNotEqual,
Val: uint32(nativeArch),
Val: uint32(auditArch),
SkipTrue: 1,
},
// ja [jump]
Expand Down
58 changes: 46 additions & 12 deletions libcontainer/seccomp/patchbpf/enosys_linux_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ type seccompData struct {
}

// mockSyscallPayload creates a fake seccomp_data struct with the given data.
func mockSyscallPayload(t *testing.T, sysno libseccomp.ScmpSyscall, arch nativeArch, args ...uint64) []byte {
func mockSyscallPayload(t *testing.T, sysno libseccomp.ScmpSyscall, arch linuxAuditArch, args ...uint64) []byte {
var buf bytes.Buffer

data := seccompData{
Expand Down Expand Up @@ -105,8 +105,16 @@ var testArches = []string{
"ppc64le",
"s390",
"s390x",
// Dummy value to indicate a configuration with no architecture specified.
"native",
}

// Used for the "native" architecture.
//
// scmpNativeArch is the value reported by libseccomp.GetNativeArch, and
// nativeArch is its String() form; the tests substitute nativeArch for the
// "native" placeholder entry in testArches.
//
// The error from GetNativeArch is discarded here.
// NOTE(review): if GetNativeArch fails, nativeArch is the String() of
// whatever value it returned alongside the error -- confirm the tests would
// still fail loudly in that case.
var (
scmpNativeArch, _ = libseccomp.GetNativeArch()
nativeArch = scmpNativeArch.String()
)

func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string) {
explicitSyscalls := []string{
"setns",
Expand Down Expand Up @@ -150,17 +158,20 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string)

for _, arch := range testArches {
type syscallTest struct {
syscall string
sysno libseccomp.ScmpSyscall
syscall string
expected uint32
}

if arch == "native" {
arch = nativeArch
}
scmpArch, err := libseccomp.GetArchFromString(arch)
if err != nil {
t.Fatalf("unknown libseccomp architecture %q: %v", arch, err)
}

nativeArch, err := archToNative(scmpArch)
auditArch, err := scmpArchToAuditArch(scmpArch)
if err != nil {
t.Fatalf("unknown audit architecture %q: %v", arch, err)
}
Expand All @@ -179,9 +190,9 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string)
t.Fatalf("unknown syscall %q on arch %q: %v", syscall, arch, err)
}
syscallTests = append(syscallTests, syscallTest{
syscall,
sysno,
expected,
sysno: sysno,
syscall: syscall,
expected: expected,
})
}

Expand Down Expand Up @@ -228,12 +239,19 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string)

// Test syscalls in the explicit list.
for _, test := range syscallTests {
// Override the expected value in the two special cases.
if !archSet[arch] || isAllowAction(defaultAction) {
// Override the expected value in the two special cases:
// 1. If the default action is allow, the filter won't have
// the stub prepended so we expect a fallthrough.
// 2. If the executing architecture is not in the architecture
// set, then the architecture is not handled by the stub --
// *except* in the case of the native architecture (which
// is always included in the stub).
if isAllowAction(defaultAction) ||
(!archSet[arch] && arch != nativeArch) {
test.expected = retFallthrough
}

payload := mockSyscallPayload(t, test.sysno, nativeArch, 0x1337, 0xF00BA5)
payload := mockSyscallPayload(t, test.sysno, auditArch, 0x1337, 0xF00BA5)
// NOTE: golang.org/x/net/bpf returns int here rather
// than uint32.
rawRet, err := filter.Run(payload)
Expand All @@ -247,7 +265,7 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string)
t.Logf(" [%4.1d] %s", idx, insn)
}
t.Logf("payload: %#v", payload)
t.Errorf("filter %s(%d) %q(%d): got %#x, want %#x", arch, nativeArch, test.syscall, test.sysno, ret, test.expected)
t.Errorf("filter %s(%d) %q(%d): got %#x, want %#x", arch, auditArch, test.syscall, test.sysno, ret, test.expected)
}
}
}
Expand All @@ -263,7 +281,14 @@ var testActions = map[string]configs.Action{

func TestEnosysStub_SingleArch(t *testing.T) {
for _, arch := range testArches {
arches := []string{arch}
var arches []string
// "native" indicates a blank architecture field for seccomp, to test
// the case where the running architecture was not included in the
// architecture list. Docker doesn't always set the architecture for
// some reason (namely for ppc64le).
if arch != "native" {
arches = append(arches, arch)
}
t.Run("arch="+arch, func(t *testing.T) {
for name, action := range testActions {
t.Run("action="+name, func(t *testing.T) {
Expand All @@ -277,7 +302,16 @@ func TestEnosysStub_SingleArch(t *testing.T) {
func TestEnosysStub_MultiArch(t *testing.T) {
for end := 0; end < len(testArches); end++ {
for start := 0; start < end; start++ {
arches := testArches[start:end]
var arches []string
for _, arch := range testArches[start:end] {
// "native" indicates a blank architecture field for seccomp, to test
// the case where the running architecture was not included in the
// architecture list. Docker doesn't always set the architecture for
// some reason (namely for ppc64le).
if arch != "native" {
arches = append(arches, arch)
}
}
if len(arches) <= 1 {
continue
}
Expand Down
Loading