From d07e46090186d04365229e85a31cb959260e7f76 Mon Sep 17 00:00:00 2001
From: Michael Vogt <mvo@ubuntu.com>
Date: Mon, 10 Jul 2023 13:59:33 +0200
Subject: [PATCH] snap-{seccomp,confine}: rework seccomp denylist

When a denylist was introduced in PR#12849 we reached the limits
of the API of libseccomp and some things are now hard to reason
about [1]. This is mostly because what we try to do does not
match the libseccomp API very well and a slightly different
approach is probably more aligned with it's design (see also
this libseccomp issue [2] that is related to our issue).

So this commit changes the approach and instead of trying to use
a single filter the work is split into two filters:
1. explicit allow list
2. explicit deny list

and then load both filters into the kernel. The way the kernel
works is that it selects the most restrictive action.

So in the case of PR#12849:
```
~ioctl - TIOCSTI
~ioctl - TIOCLINUX
ioctl
```
For `ioctl(TIOCLINUX)` the first allow filter would pass `ioctl`
but the second deny filter would correctly deny the TIOCLINUX.

[1] https://github.com/snapcore/snapd/pull/12849#discussion_r1206855700
[2] https://github.com/seccomp/libseccomp/issues/44
---
 cmd/snap-confine/seccomp-support.c | 42 ++++++++++---
 cmd/snap-confine/seccomp-support.h |  4 ++
 cmd/snap-seccomp/main.go           | 94 +++++++++++++++++++++---------
 cmd/snap-seccomp/main_test.go      | 67 ++++++++++++++++-----
 interfaces/seccomp/template.go     |  9 +--
 tests/main/snap-seccomp/task.yaml  | 44 +++++++-------
 6 files changed, 179 insertions(+), 81 deletions(-)

diff --git a/cmd/snap-confine/seccomp-support.c b/cmd/snap-confine/seccomp-support.c
index 7432d79cb15..9c12c142f33 100644
--- a/cmd/snap-confine/seccomp-support.c
+++ b/cmd/snap-confine/seccomp-support.c
@@ -109,13 +109,38 @@ static void validate_bpfpath_is_safe(const char *path)
 	}
 }
 
-bool sc_apply_seccomp_profile_for_security_tag(const char *security_tag)
-{
+bool sc_apply_seccomp_profile_for_security_tag(const char *security_tag) {
 	debug("loading bpf program for security tag %s", security_tag);
 
 	char profile_path[PATH_MAX] = { 0 };
-	sc_must_snprintf(profile_path, sizeof(profile_path), "%s/%s.bin",
+
+        struct sock_fprog prog_allow = { 0 };
+	sc_must_snprintf(profile_path, sizeof(profile_path), "%s/%s.bin.allow",
+			 filter_profile_dir, security_tag);
+        if (!sc_load_seccomp_profile_path(profile_path, &prog_allow)) {
+           return false;
+        }
+
+        struct sock_fprog prog_deny  = { 0 };
+	sc_must_snprintf(profile_path, sizeof(profile_path), "%s/%s.bin.deny",
 			 filter_profile_dir, security_tag);
+        if (!sc_load_seccomp_profile_path(profile_path, &prog_deny)) {
+           return false;
+        }
+
+        sc_apply_seccomp_filter(&prog_deny);
+        sc_apply_seccomp_filter(&prog_allow);
+
+        // XXX: this is not ideal, allocation happens in
+        // sc_load_seccomp_profile but free here :(
+        free(prog_allow.filter);
+        free(prog_deny.filter);
+        return true;
+}
+
+bool sc_load_seccomp_profile_path(const char *profile_path, struct sock_fprog *prog)
+{
+	debug("loading bpf program from file %s", profile_path);
 
 	// Wait some time for the security profile to show up. When
 	// the system boots snapd will created security profiles, but
@@ -165,11 +190,12 @@ bool sc_apply_seccomp_profile_for_security_tag(const char *security_tag)
 	if (sc_streq(bpf, "@unrestricted\n")) {
 		return false;
 	}
-	struct sock_fprog prog = {
-		.len = num_read / sizeof(struct sock_filter),
-		.filter = (struct sock_filter *)bpf,
-	};
-	sc_apply_seccomp_filter(&prog);
+	prog->len = num_read / sizeof(struct sock_filter);
+        prog->filter = (struct sock_filter *)malloc(num_read);
+        if (prog->filter == NULL) {
+           die("cannot allocate %li bytes of memory for seccomp filter ", num_read);
+        }
+        memcpy(prog->filter, bpf, num_read);
 	return true;
 }
 
diff --git a/cmd/snap-confine/seccomp-support.h b/cmd/snap-confine/seccomp-support.h
index 9ed9b0aaaab..b387913e471 100644
--- a/cmd/snap-confine/seccomp-support.h
+++ b/cmd/snap-confine/seccomp-support.h
@@ -19,6 +19,10 @@
 
 #include <stdbool.h>
 
+#include <linux/filter.h>
+
+bool sc_load_seccomp_profile_path(const char *profile_path, struct sock_fprog *prog);
+
 /** 
  * sc_apply_seccomp_profile_for_security_tag applies a seccomp profile to the
  * current process. The filter is loaded from a pre-compiled bpf bytecode
diff --git a/cmd/snap-seccomp/main.go b/cmd/snap-seccomp/main.go
index 4ff3799b0a1..b0df5941ddb 100644
--- a/cmd/snap-seccomp/main.go
+++ b/cmd/snap-seccomp/main.go
@@ -583,11 +583,12 @@ var (
 	errnoOnImplicitDenial int16 = C.EPERM
 )
 
-func parseLine(line string, secFilter *seccomp.ScmpFilter) error {
+func parseLine(line string, secFilterAllow, secFilterDeny *seccomp.ScmpFilter) error {
 	// ignore comments and empty lines
 	if strings.HasPrefix(line, "#") || line == "" {
 		return nil
 	}
+	secFilter := secFilterAllow
 
 	// regular line
 	tokens := strings.Fields(line)
@@ -604,6 +605,7 @@ func parseLine(line string, secFilter *seccomp.ScmpFilter) error {
 	if strings.HasPrefix(syscallName, "~") {
 		action = seccomp.ActErrno.SetReturnCode(errnoOnExplicitDenial)
 		syscallName = syscallName[1:]
+		secFilter = secFilterDeny
 	}
 
 	secSyscall, err := seccomp.GetSyscallFromName(syscallName)
@@ -789,27 +791,44 @@ func complainAction() seccomp.ScmpAction {
 	return seccomp.ActAllow
 }
 
+func newComplainFilter(complainAct seccomp.ScmpAction) (*seccomp.ScmpFilter, error) {
+	secFilter, err := seccomp.NewFilter(complainAct)
+	if err != nil {
+		if complainAct != seccomp.ActAllow {
+			// ActLog is only supported in newer versions
+			// of the kernel, libseccomp, and
+			// libseccomp-golang. Attempt to fall back to
+			// ActAllow before erroring out.
+			complainAct = seccomp.ActAllow
+			secFilter, err = seccomp.NewFilter(complainAct)
+		}
+	}
+	return secFilter, err
+}
+
 func compile(content []byte, out string) error {
 	var err error
-	var secFilter *seccomp.ScmpFilter
+	var secFilterAllow, secFilterDeny *seccomp.ScmpFilter
 
 	unrestricted, complain := preprocess(content)
 	switch {
 	case unrestricted:
-		return osutil.AtomicWrite(out, bytes.NewBufferString("@unrestricted\n"), 0644, 0)
+		for _, ext := range []string{".allow", ".deny"} {
+			if err := osutil.AtomicWrite(out+ext, bytes.NewBufferString("@unrestricted\n"), 0644, 0); err != nil {
+				return err
+			}
+		}
+		return nil
 	case complain:
 		var complainAct seccomp.ScmpAction = complainAction()
 
-		secFilter, err = seccomp.NewFilter(complainAct)
+		secFilterAllow, err = newComplainFilter(complainAct)
 		if err != nil {
-			if complainAct != seccomp.ActAllow {
-				// ActLog is only supported in newer versions
-				// of the kernel, libseccomp, and
-				// libseccomp-golang. Attempt to fall back to
-				// ActAllow before erroring out.
-				complainAct = seccomp.ActAllow
-				secFilter, err = seccomp.NewFilter(complainAct)
-			}
+			return fmt.Errorf("cannot create seccomp filter: %s", err)
+		}
+		secFilterDeny, err = newComplainFilter(complainAct)
+		if err != nil {
+			return fmt.Errorf("cannot create seccomp filter: %s", err)
 		}
 
 		// Set unrestricted to 'true' to fallback to the pre-ActLog
@@ -819,19 +838,26 @@ func compile(content []byte, out string) error {
 			unrestricted = true
 		}
 	default:
-		secFilter, err = seccomp.NewFilter(seccomp.ActErrno.SetReturnCode(errnoOnImplicitDenial))
+		secFilterAllow, err = seccomp.NewFilter(seccomp.ActErrno.SetReturnCode(errnoOnImplicitDenial))
+		if err != nil {
+			return fmt.Errorf("cannot create seccomp filter: %s", err)
+		}
+		secFilterDeny, err = seccomp.NewFilter(seccomp.ActAllow)
+		if err != nil {
+			return fmt.Errorf("cannot create seccomp filter: %s", err)
+		}
 	}
-	if err != nil {
-		return fmt.Errorf("cannot create seccomp filter: %s", err)
+	if err := addSecondaryArches(secFilterAllow); err != nil {
+		return err
 	}
-	if err := addSecondaryArches(secFilter); err != nil {
+	if err := addSecondaryArches(secFilterDeny); err != nil {
 		return err
 	}
 
 	if !unrestricted {
 		scanner := bufio.NewScanner(bytes.NewBuffer(content))
 		for scanner.Scan() {
-			if err := parseLine(scanner.Text(), secFilter); err != nil {
+			if err := parseLine(scanner.Text(), secFilterAllow, secFilterDeny); err != nil {
 				return fmt.Errorf("cannot parse line: %s", err)
 			}
 		}
@@ -841,21 +867,33 @@ func compile(content []byte, out string) error {
 	}
 
 	if osutil.GetenvBool("SNAP_SECCOMP_DEBUG") {
-		secFilter.ExportPFC(os.Stdout)
+		secFilterAllow.ExportPFC(os.Stdout)
+		secFilterDeny.ExportPFC(os.Stdout)
 	}
 
 	// write atomically
-	fout, err := osutil.NewAtomicFile(out, 0644, 0, osutil.NoChown, osutil.NoChown)
-	if err != nil {
-		return err
-	}
-	// Cancel once Committed is a NOP
-	defer fout.Cancel()
-
-	if err := secFilter.ExportBPF(fout.File); err != nil {
-		return err
+	for _, ext := range []string{".allow", ".deny"} {
+		fout, err := osutil.NewAtomicFile(out+ext, 0644, 0, osutil.NoChown, osutil.NoChown)
+		if err != nil {
+			return err
+		}
+		// Cancel once Committed is a NOP
+		defer fout.Cancel()
+
+		switch ext {
+		case ".allow":
+			err = secFilterAllow.ExportBPF(fout.File)
+		case ".deny":
+			err = secFilterDeny.ExportBPF(fout.File)
+		}
+		if err != nil {
+			return err
+		}
+		if err := fout.Commit(); err != nil {
+			return err
+		}
 	}
-	return fout.Commit()
+	return nil
 }
 
 // caches for uid and gid lookups
diff --git a/cmd/snap-seccomp/main_test.go b/cmd/snap-seccomp/main_test.go
index 38db6fa16e8..b61e7a67769 100644
--- a/cmd/snap-seccomp/main_test.go
+++ b/cmd/snap-seccomp/main_test.go
@@ -71,18 +71,45 @@ var seccompBpfLoaderContent = []byte(`
 
 #define MAX_BPF_SIZE 32 * 1024
 
-int sc_apply_seccomp_bpf(const char* profile_path)
+int sc_apply_seccomp_bpf(const char* profile_path_allow, const char* profile_path_deny)
 {
-    unsigned char bpf[MAX_BPF_SIZE + 1]; // account for EOF
+    unsigned char bpf_allow[MAX_BPF_SIZE + 1]; // account for EOF
+    unsigned char bpf_deny[MAX_BPF_SIZE + 1]; // account for EOF
     FILE* fp;
-    fp = fopen(profile_path, "rb");
+
+    // allow
+    fp = fopen(profile_path_allow, "rb");
+    if (fp == NULL) {
+        fprintf(stderr, "cannot read %s\n", profile_path_allow);
+        return -1;
+    }
+
+    // set 'size' to 1; to get bytes transferred
+    size_t num_read = fread(bpf_allow, 1, sizeof(bpf_allow), fp);
+
+    if (ferror(fp) != 0) {
+        perror("fread()");
+        return -1;
+    } else if (feof(fp) == 0) {
+        fprintf(stderr, "file too big\n");
+        return -1;
+    }
+    fclose(fp);
+
+    struct sock_fprog prog_allow = {
+        .len = num_read / sizeof(struct sock_filter),
+        .filter = (struct sock_filter*)bpf_allow,
+    };
+
+    // deny
+    fp = fopen(profile_path_deny, "rb");
     if (fp == NULL) {
-        fprintf(stderr, "cannot read %s\n", profile_path);
+        fprintf(stderr, "cannot read %s\n", profile_path_deny);
         return -1;
     }
 
     // set 'size' to 1; to get bytes transferred
-    size_t num_read = fread(bpf, 1, sizeof(bpf), fp);
+    num_read = fread(bpf_deny, 1, sizeof(bpf_deny), fp);
 
     if (ferror(fp) != 0) {
         perror("fread()");
@@ -93,9 +120,9 @@ int sc_apply_seccomp_bpf(const char* profile_path)
     }
     fclose(fp);
 
-    struct sock_fprog prog = {
+    struct sock_fprog prog_deny = {
         .len = num_read / sizeof(struct sock_filter),
-        .filter = (struct sock_filter*)bpf,
+        .filter = (struct sock_filter*)bpf_deny,
     };
 
     // Set NNP to allow loading seccomp policy into the kernel without
@@ -105,10 +132,15 @@ int sc_apply_seccomp_bpf(const char* profile_path)
         return -1;
     }
 
-    if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) {
-        perror("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, ...) failed");
+    if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_deny)) {
+        perror("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, ...) deny failed");
         return -1;
     }
+    if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_allow)) {
+        perror("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, ...) allow failed");
+        return -1;
+    }
+
     return 0;
 }
 
@@ -116,15 +148,16 @@ int main(int argc, char* argv[])
 {
     int rc = 0;
     if (argc < 2) {
-        fprintf(stderr, "Usage: %s <bpf file> [prog ...]\n", argv[0]);
+        fprintf(stderr, "Usage: %s <bpf file-allow> <bpf-file-deny> [prog ...]\n", argv[0]);
         return 1;
     }
 
-    rc = sc_apply_seccomp_bpf(argv[1]);
+    // allow, deny
+    rc = sc_apply_seccomp_bpf(argv[1], argv[2]);
     if (rc != 0)
         return -rc;
 
-    execv(argv[2], (char* const*)&argv[2]);
+    execv(argv[3], (char* const*)&argv[3]);
     perror("execv failed");
     return 1;
 }
@@ -333,7 +366,7 @@ mprotect
 		}
 	}
 
-	cmd := exec.Command(s.seccompBpfLoader, bpfPath, syscallRunner, syscallRunnerArgs[0], syscallRunnerArgs[1], syscallRunnerArgs[2], syscallRunnerArgs[3], syscallRunnerArgs[4], syscallRunnerArgs[5], syscallRunnerArgs[6])
+	cmd := exec.Command(s.seccompBpfLoader, bpfPath+".allow", bpfPath+".deny", syscallRunner, syscallRunnerArgs[0], syscallRunnerArgs[1], syscallRunnerArgs[2], syscallRunnerArgs[3], syscallRunnerArgs[4], syscallRunnerArgs[5], syscallRunnerArgs[6])
 	cmd.Stdin = os.Stdin
 	cmd.Stdout = os.Stdout
 	cmd.Stderr = os.Stderr
@@ -363,7 +396,8 @@ func (s *snapSeccompSuite) TestUnrestricted(c *C) {
 	err := main.Compile([]byte(inp), outPath)
 	c.Assert(err, IsNil)
 
-	c.Check(outPath, testutil.FileEquals, inp)
+	c.Check(outPath+".allow", testutil.FileEquals, inp)
+	c.Check(outPath+".deny", testutil.FileEquals, inp)
 }
 
 // TestCompile iterates over a range of textual seccomp whitelist rules and
@@ -395,8 +429,9 @@ func (s *snapSeccompSuite) TestCompile(c *C) {
 		{"read", "read", Allow},
 		{"read\nwrite\nexecve\n", "write", Allow},
 
-		// trivial denial
-		{"read", "ioctl", Deny},
+		// trivial denial (uses write in allow-list to ensure any
+		// errors printing is visible)
+		{"write", "ioctl", Deny},
 
 		// test argument filtering syntax, we currently support:
 		//   >=, <=, !, <, >, |
diff --git a/interfaces/seccomp/template.go b/interfaces/seccomp/template.go
index e92bafc3bde..3006e791bd4 100644
--- a/interfaces/seccomp/template.go
+++ b/interfaces/seccomp/template.go
@@ -205,14 +205,7 @@ inotify_rm_watch
 # TODO: this should be scaled back even more
 ~ioctl - TIOCSTI
 ~ioctl - TIOCLINUX
-# restrict argument otherwise will match all uses of ioctl() and allow the rules
-# that were disallowed above
-# TODO: Fix the need to keep TIOCLINUX here - the issue is a unrestricted
-#       allow for "ioctl" here makes libseccomp "optimize" the deny rules
-#       above away and the generated bpf becomes just "allow ioctl".
-#       We should fix this by creating a way to make "AND" rules, so
-#       this becomes "ioctl - !TIOCSTI&&!TIOCLINUX" and remove the "~" again.
-ioctl - !TIOCSTI
+ioctl
 
 io_cancel
 io_destroy
diff --git a/tests/main/snap-seccomp/task.yaml b/tests/main/snap-seccomp/task.yaml
index b81a663bd91..a9ca07b59c4 100644
--- a/tests/main/snap-seccomp/task.yaml
+++ b/tests/main/snap-seccomp/task.yaml
@@ -1,8 +1,6 @@
 summary: Ensure that the snap-seccomp bpf handling works
 
-# FIXME: once $(snap debug confinment) can be used (in 2.27+) remove
-#        the systems line
-systems: [ubuntu-16*, ubuntu-18*]
+systems: [ubuntu-*]
 
 # Start early as it takes a long time.
 priority: 100
@@ -25,7 +23,7 @@ execute: |
 
     # from the old test_complain
     echo "Test that the @complain keyword works"
-    rm -f "${PROFILE}.bin"
+    rm -f "${PROFILE}.bin"*
     cat >"${PROFILE}.src" <<EOF
     # some comment
     @complain
@@ -35,7 +33,7 @@ execute: |
     test-snapd-sh.sh -c 'echo hello' | MATCH hello
 
     # from the old test_complain_missed
-    rm -f "${PROFILE}.bin"
+    rm -f "${PROFILE}.bin"*
     cat >"${PROFILE}.src" <<EOF
     # super strict filter
     @complai
@@ -53,7 +51,7 @@ execute: |
     
     # from the old test_unrestricted
     echo "Test that the @unrestricted keyword works"
-    rm -f "${PROFILE}.bin"
+    rm -f "${PROFILE}.bin"*
     cat >"${PROFILE}.src" <<EOF
     # some comment
     @unrestricted
@@ -63,7 +61,7 @@ execute: |
     test-snapd-sh.sh -c 'echo hello' | MATCH hello
 
     # from the old test_unrestricted_missed
-    rm -f "${PROFILE}.bin"
+    rm -f "${PROFILE}.bin"*
     cat >"${PROFILE}.src" <<EOF
     # super strict filter
     @unrestricte
@@ -80,7 +78,7 @@ execute: |
     fi
 
     # from the old test_noprofile
-    rm -f "${PROFILE}.bin"
+    rm -f "${PROFILE}.bin"*
     echo "Ensure the code cannot not run due to missing filter"
     if SNAP_CONFINE_MAX_PROFILE_WAIT=3 test-snapd-sh.sh -c 'echo hello'; then
         echo "filtering broken: program should have failed to run"
@@ -88,31 +86,35 @@ execute: |
     fi
 
     echo "Break snapd.test-snapd-sh.bin to ensure (kernel) validation works"
-    dd if=/dev/urandom of="${PROFILE}.bin" count=1 bs=1024
-    if output=$(test-snapd-sh.sh -c 'echo hello' 2>&1 ); then
-        echo "test-snapd-sh.sh should fail with invalid seccomp profile"
-        exit 1
-    fi
+    for ext in allow deny; do
+        dd if=/dev/urandom of="${PROFILE}.bin.$ext" count=1 bs=1024
+        if output=$(test-snapd-sh.sh -c 'echo hello' 2>&1 ); then
+            echo "test-snapd-sh.sh should fail with invalid seccomp profile"
+            exit 1
+        fi
+    done
     echo "$output" | MATCH "cannot apply seccomp profile: Invalid argument"
 
     echo "Add huge snapd.test-snapd-sh.bin to ensure size limit works"
-    dd if=/dev/zero of="${PROFILE}.bin" count=50 bs=1M
-    if output=$(test-snapd-sh.sh -c 'echo hello' 2>&1 ); then
-        echo "test-snapd-sh.sh should fail with big seccomp profile"
-        exit 1
-    fi
+    for ext in allow deny; do
+        dd if=/dev/zero of="${PROFILE}.bin.$ext" count=50 bs=1M
+        if output=$(test-snapd-sh.sh -c 'echo hello' 2>&1 ); then
+            echo "test-snapd-sh.sh should fail with big seccomp profile"
+            exit 1
+        fi
+    done
     echo "$output" | MATCH "cannot fit .* to memory buffer"
 
     
     echo "Ensure the code cannot not run with a missing .bin profile"
-    rm -f "${PROFILE}.bin"
+    rm -f "${PROFILE}.bin"*
     if test-snapd-sh.sh -c 'echo hello'; then
         echo "filtering broken: program should have failed to run"
         exit 1
     fi
 
     echo "Ensure the code cannot not run with an empty seccomp profile"
-    rm -f "${PROFILE}.bin"
+    rm -f "${PROFILE}.bin"*
     echo "" > "${PROFILE}.src"
     $SNAP_SECCOMP compile "${PROFILE}.src" "${PROFILE}.bin"
     if test-snapd-sh.sh -c 'echo hello'; then
@@ -121,7 +123,7 @@ execute: |
     fi
 
     echo "Ensure snap-confine waits for security profiles to appear"
-    rm -f "${PROFILE}.bin"
+    rm -f "${PROFILE}.bin"*
     cat >"${PROFILE}.src" <<EOF
     @unrestricted
     EOF