Skip to content

Commit

Permalink
seccomp: Add wait_killable semantic to seccomp user notifier
Browse files Browse the repository at this point in the history
The user notifier feature allows for filtering of seccomp notifications in
userspace. While the user notifier is handling the syscall, the notifying
process can be preempted, thus ending the notification. This has become a
growing problem, as Golang has adopted signal based async preemption[1]. In
this, it will preempt every 10ms, thus leaving the supervisor less than
10ms to respond to a given notification. If the syscall require I/O (mount,
connect) on behalf of the process, it can easily take 10ms.

This allows the supervisor to set a flag that moves the process into a
state where it is only killable by terminating signals as opposed to all
signals. The process can still be terminated before the supervisor receives
the notification.

Signed-off-by: Sargun Dhillon <sargun@sargun.me>

[1]: golang/go#24543
  • Loading branch information
sargun authored and intel-lab-lkp committed Apr 26, 2021
1 parent 7e0a7c6 commit cbe9c7e
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 14 deletions.
15 changes: 8 additions & 7 deletions Documentation/userspace-api/seccomp_filter.rst
Original file line number Diff line number Diff line change
Expand Up @@ -251,13 +251,14 @@ seccomp notification fd to receive a ``struct seccomp_notif``, which contains
five members: the input length of the structure, a unique-per-filter ``id``,
the ``pid`` of the task which triggered this request (which may be 0 if the
task is in a pid ns not visible from the listener's pid namespace), a ``flags``
member which for now only has ``SECCOMP_NOTIF_FLAG_SIGNALED``, representing
whether or not the notification is a result of a non-fatal signal, and the
``data`` passed to seccomp. Userspace can then make a decision based on this
information about what to do, and ``ioctl(SECCOMP_IOCTL_NOTIF_SEND)`` a
response, indicating what should be returned to userspace. The ``id`` member of
``struct seccomp_notif_resp`` should be the same ``id`` as in ``struct
seccomp_notif``.
member and the ``data`` passed to seccomp. Upon receiving the notification,
the ``SECCOMP_USER_NOTIF_FLAG_WAIT_KILLABLE`` flag may be set, which will
try to put the task into a state where it will only respond to fatal signals.

Userspace can then make a decision based on this information about what to do,
and ``ioctl(SECCOMP_IOCTL_NOTIF_SEND)`` a response, indicating what should be
returned to userspace. The ``id`` member of ``struct seccomp_notif_resp`` should
be the same ``id`` as in ``struct seccomp_notif``.

It is worth noting that ``struct seccomp_data`` contains the values of register
arguments to the syscall, but does not contain pointers to memory. The task's
Expand Down
3 changes: 3 additions & 0 deletions include/uapi/linux/seccomp.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,9 @@ struct seccomp_notif_sizes {
__u16 seccomp_data;
};

/* Valid flags for struct seccomp_notif */
#define SECCOMP_USER_NOTIF_FLAG_WAIT_KILLABLE (1UL << 0) /* Prevent task from being interrupted */

struct seccomp_notif {
__u64 id;
__u32 pid;
Expand Down
54 changes: 47 additions & 7 deletions kernel/seccomp.c
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,8 @@ struct seccomp_knotif {

/* outstanding addfd requests */
struct list_head addfd;

bool wait_killable;
};

/**
Expand Down Expand Up @@ -1073,6 +1075,11 @@ static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd)
complete(&addfd->completion);
}

static bool notification_interruptible(struct seccomp_knotif *n)
{
return !(n->state == SECCOMP_NOTIFY_SENT && n->wait_killable);
}

static int seccomp_do_user_notification(int this_syscall,
struct seccomp_filter *match,
const struct seccomp_data *sd)
Expand All @@ -1082,6 +1089,7 @@ static int seccomp_do_user_notification(int this_syscall,
long ret = 0;
struct seccomp_knotif n = {};
struct seccomp_kaddfd *addfd, *tmp;
bool interruptible = true;

mutex_lock(&match->notify_lock);
err = -ENOSYS;
Expand All @@ -1103,11 +1111,31 @@ static int seccomp_do_user_notification(int this_syscall,
* This is where we wait for a reply from userspace.
*/
do {
interruptible = notification_interruptible(&n);

mutex_unlock(&match->notify_lock);
err = wait_for_completion_interruptible(&n.ready);
if (interruptible)
err = wait_for_completion_interruptible(&n.ready);
else
err = wait_for_completion_killable(&n.ready);
mutex_lock(&match->notify_lock);
if (err != 0)

if (err != 0) {
/*
* There is a race condition here where if the
* notification was received with the
* SECCOMP_USER_NOTIF_FLAG_WAIT_KILLABLE flag, but a
* non-fatal signal was received before we could
* transition we could erroneously end our wait early.
*
* The next wait for completion will ensure the signal
* was not fatal.
*/
if (interruptible && !notification_interruptible(&n))
continue;

goto interrupted;
}

addfd = list_first_entry_or_null(&n.addfd,
struct seccomp_kaddfd, list);
Expand Down Expand Up @@ -1422,14 +1450,16 @@ static long seccomp_notify_recv(struct seccomp_filter *filter,
struct seccomp_notif unotif;
ssize_t ret;

ret = copy_from_user(&unotif, buf, sizeof(unotif));
if (ret)
return -EFAULT;

/* Verify that we're not given garbage to keep struct extensible. */
ret = check_zeroed_user(buf, sizeof(unotif));
if (ret < 0)
return ret;
if (!ret)
if (unotif.flags & ~(SECCOMP_USER_NOTIF_FLAG_WAIT_KILLABLE))
return -EINVAL;

memset(&unotif, 0, sizeof(unotif));
if (unotif.id || unotif.pid)
return -EINVAL;

ret = down_interruptible(&filter->notif->request);
if (ret < 0)
Expand Down Expand Up @@ -1457,6 +1487,12 @@ static long seccomp_notify_recv(struct seccomp_filter *filter,
unotif.pid = task_pid_vnr(knotif->task);
unotif.data = *(knotif->data);

if (unotif.flags & SECCOMP_USER_NOTIF_FLAG_WAIT_KILLABLE) {
knotif->wait_killable = true;
complete(&knotif->ready);
}


knotif->state = SECCOMP_NOTIFY_SENT;
wake_up_poll(&filter->wqh, EPOLLOUT | EPOLLWRNORM);
ret = 0;
Expand All @@ -1477,6 +1513,10 @@ static long seccomp_notify_recv(struct seccomp_filter *filter,
if (knotif) {
knotif->state = SECCOMP_NOTIFY_INIT;
up(&filter->notif->request);

/* Wake the task to reset its state */
if (knotif->wait_killable)
complete(&knotif->ready);
}
mutex_unlock(&filter->notify_lock);
}
Expand Down

0 comments on commit cbe9c7e

Please sign in to comment.