From 4943b66df18a0e8aedd006792ed73257cd2da8f8 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 7 Mar 2023 23:31:56 -0800 Subject: [PATCH 01/10] seccomp: don't use semaphore and wait_queue together The main reason is to use new wake_up helpers that will be added in the following patches. But here are a few other reasons: * if we use two different ways, we always need to call them both. This patch fixes seccomp_notify_recv where we forgot to call wake_up_poll in the error path. * If we use one primitive, we can control how many waiters are woken up for each request. Our goal is to wake up just one that will handle a request. Right now, wake_up_poll can wake up one waiter and up(&match->notif->request) can wake up one more. Signed-off-by: Andrei Vagin Acked-by: "Peter Zijlstra (Intel)" Link: https://lore.kernel.org/r/20230308073201.3102738-2-avagin@google.com Signed-off-by: Kees Cook --- kernel/seccomp.c | 41 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index d3e584065c7f..1386dcedda1a 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -145,7 +145,7 @@ struct seccomp_kaddfd { * @notifications: A list of struct seccomp_knotif elements. */ struct notification { - struct semaphore request; + atomic_t requests; u64 next_id; struct list_head notifications; }; @@ -1116,7 +1116,7 @@ static int seccomp_do_user_notification(int this_syscall, list_add_tail(&n.list, &match->notif->notifications); INIT_LIST_HEAD(&n.addfd); - up(&match->notif->request); + atomic_inc(&match->notif->requests); wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM); /* @@ -1450,6 +1450,37 @@ find_notification(struct seccomp_filter *filter, u64 id) return NULL; } +static int recv_wake_function(wait_queue_entry_t *wait, unsigned int mode, int sync, + void *key) +{ + /* Avoid a wakeup if event not interesting for us. */ + if (key && !(key_to_poll(key) & (EPOLLIN | EPOLLERR))) + return 0; + return autoremove_wake_function(wait, mode, sync, key); +} + +static int recv_wait_event(struct seccomp_filter *filter) +{ + DEFINE_WAIT_FUNC(wait, recv_wake_function); + int ret; + + if (atomic_dec_if_positive(&filter->notif->requests) >= 0) + return 0; + + for (;;) { + ret = prepare_to_wait_event(&filter->wqh, &wait, TASK_INTERRUPTIBLE); + + if (atomic_dec_if_positive(&filter->notif->requests) >= 0) + break; + + if (ret) + return ret; + + schedule(); + } + finish_wait(&filter->wqh, &wait); + return 0; +} static long seccomp_notify_recv(struct seccomp_filter *filter, void __user *buf) @@ -1467,7 +1498,7 @@ static long seccomp_notify_recv(struct seccomp_filter *filter, memset(&unotif, 0, sizeof(unotif)); - ret = down_interruptible(&filter->notif->request); + ret = recv_wait_event(filter); if (ret < 0) return ret; @@ -1515,7 +1546,8 @@ out: if (should_sleep_killable(filter, knotif)) complete(&knotif->ready); knotif->state = SECCOMP_NOTIFY_INIT; - up(&filter->notif->request); + atomic_inc(&filter->notif->requests); + wake_up_poll(&filter->wqh, EPOLLIN | EPOLLRDNORM); } mutex_unlock(&filter->notify_lock); } @@ -1777,7 +1809,6 @@ static struct file *init_listener(struct seccomp_filter *filter) if (!filter->notif) goto out; - sema_init(&filter->notif->request, 0); filter->notif->next_id = get_random_u64(); INIT_LIST_HEAD(&filter->notif->notifications); From ab83f455f04df5b2f7c6d4de03b6d2eaeaa27b8a Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Tue, 7 Mar 2023 23:31:57 -0800 Subject: [PATCH 02/10] sched: add WF_CURRENT_CPU and externise ttwu Add WF_CURRENT_CPU wake flag that advices the scheduler to move the wakee to the current CPU. This is useful for fast on-CPU context switching use cases. In addition, make ttwu external rather than static so that the flag could be passed to it from outside of sched/core.c. Signed-off-by: Peter Oskolkov Signed-off-by: Andrei Vagin Acked-by: "Peter Zijlstra (Intel)" Link: https://lore.kernel.org/r/20230308073201.3102738-3-avagin@google.com Signed-off-by: Kees Cook --- kernel/sched/core.c | 3 +-- kernel/sched/fair.c | 4 ++++ kernel/sched/sched.h | 13 ++++++++----- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c52c2eba7c73..1574989627b0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4193,8 +4193,7 @@ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success) * Return: %true if @p->state changes (an actual wakeup was done), * %false otherwise. */ -static int -try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) +int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) { unsigned long flags; int cpu, success = 0; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b3e25be58e2b..ceb5d4c4738e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7741,6 +7741,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) if (wake_flags & WF_TTWU) { record_wakee(p); + if ((wake_flags & WF_CURRENT_CPU) && + cpumask_test_cpu(cpu, p->cpus_ptr)) + return cpu; + if (sched_energy_enabled()) { new_cpu = find_energy_efficient_cpu(p, prev_cpu); if (new_cpu >= 0) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e93e006a942b..48d0be005f08 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2131,12 +2131,13 @@ static inline int task_on_rq_migrating(struct task_struct *p) } /* Wake flags. The first three directly map to some SD flag value */ -#define WF_EXEC 0x02 /* Wakeup after exec; maps to SD_BALANCE_EXEC */ -#define WF_FORK 0x04 /* Wakeup after fork; maps to SD_BALANCE_FORK */ -#define WF_TTWU 0x08 /* Wakeup; maps to SD_BALANCE_WAKE */ +#define WF_EXEC 0x02 /* Wakeup after exec; maps to SD_BALANCE_EXEC */ +#define WF_FORK 0x04 /* Wakeup after fork; maps to SD_BALANCE_FORK */ +#define WF_TTWU 0x08 /* Wakeup; maps to SD_BALANCE_WAKE */ -#define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */ -#define WF_MIGRATED 0x20 /* Internal use, task got migrated */ +#define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */ +#define WF_MIGRATED 0x20 /* Internal use, task got migrated */ +#define WF_CURRENT_CPU 0x40 /* Prefer to move the wakee to the current CPU. */ #ifdef CONFIG_SMP static_assert(WF_EXEC == SD_BALANCE_EXEC); @@ -3229,6 +3230,8 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) extern void swake_up_all_locked(struct swait_queue_head *q); extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); +extern int try_to_wake_up(struct task_struct *tsk, unsigned int state, int wake_flags); + #ifdef CONFIG_PREEMPT_DYNAMIC extern int preempt_dynamic_mode; extern int sched_dynamic_mode(const char *str); From 6f63904c8f3edb65bd85c1be01d69214ff8ca4c5 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 7 Mar 2023 23:31:58 -0800 Subject: [PATCH 03/10] sched: add a few helpers to wake up tasks on the current cpu Add complete_on_current_cpu, wake_up_poll_on_current_cpu helpers to wake up tasks on the current CPU. These two helpers are useful when the task needs to make a synchronous context switch to another task. In this context, synchronous means it wakes up the target task and falls asleep right after that. One example of such workloads is seccomp user notifies. This mechanism allows the supervisor process handles system calls on behalf of a target process. While the supervisor is handling an intercepted system call, the target process will be blocked in the kernel, waiting for a response to come back. On-CPU context switches are much faster than regular ones. Signed-off-by: Andrei Vagin Acked-by: "Peter Zijlstra (Intel)" Link: https://lore.kernel.org/r/20230308073201.3102738-4-avagin@google.com Signed-off-by: Kees Cook --- include/linux/completion.h | 1 + include/linux/swait.h | 2 +- include/linux/wait.h | 3 +++ kernel/sched/completion.c | 26 ++++++++++++++++++-------- kernel/sched/core.c | 2 +- kernel/sched/swait.c | 8 ++++---- kernel/sched/wait.c | 5 +++++ 7 files changed, 33 insertions(+), 14 deletions(-) diff --git a/include/linux/completion.h b/include/linux/completion.h index 62b32b19e0a8..fb2915676574 100644 --- a/include/linux/completion.h +++ b/include/linux/completion.h @@ -116,6 +116,7 @@ extern bool try_wait_for_completion(struct completion *x); extern bool completion_done(struct completion *x); extern void complete(struct completion *); +extern void complete_on_current_cpu(struct completion *x); extern void complete_all(struct completion *); #endif diff --git a/include/linux/swait.h b/include/linux/swait.h index 6a8c22b8c2a5..d324419482a0 100644 --- a/include/linux/swait.h +++ b/include/linux/swait.h @@ -146,7 +146,7 @@ static inline bool swq_has_sleeper(struct swait_queue_head *wq) extern void swake_up_one(struct swait_queue_head *q); extern void swake_up_all(struct swait_queue_head *q); -extern void swake_up_locked(struct swait_queue_head *q); +extern void swake_up_locked(struct swait_queue_head *q, int wake_flags); extern void prepare_to_swait_exclusive(struct swait_queue_head *q, struct swait_queue *wait, int state); extern long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state); diff --git a/include/linux/wait.h b/include/linux/wait.h index a0307b516b09..5ec7739400f4 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -210,6 +210,7 @@ __remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq } int __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key); +void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode, void *key); void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key); void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head, unsigned int mode, void *key, wait_queue_entry_t *bookmark); @@ -237,6 +238,8 @@ void __wake_up_pollfree(struct wait_queue_head *wq_head); #define key_to_poll(m) ((__force __poll_t)(uintptr_t)(void *)(m)) #define wake_up_poll(x, m) \ __wake_up(x, TASK_NORMAL, 1, poll_to_key(m)) +#define wake_up_poll_on_current_cpu(x, m) \ + __wake_up_on_current_cpu(x, TASK_NORMAL, poll_to_key(m)) #define wake_up_locked_poll(x, m) \ __wake_up_locked_key((x), TASK_NORMAL, poll_to_key(m)) #define wake_up_interruptible_poll(x, m) \ diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index d57a5c1c1cd9..3561ab533dd4 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -13,6 +13,23 @@ * Waiting for completion is a typically sync point, but not an exclusion point. */ +static void complete_with_flags(struct completion *x, int wake_flags) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&x->wait.lock, flags); + + if (x->done != UINT_MAX) + x->done++; + swake_up_locked(&x->wait, wake_flags); + raw_spin_unlock_irqrestore(&x->wait.lock, flags); +} + +void complete_on_current_cpu(struct completion *x) +{ + return complete_with_flags(x, WF_CURRENT_CPU); +} + /** * complete: - signals a single thread waiting on this completion * @x: holds the state of this particular completion @@ -27,14 +44,7 @@ */ void complete(struct completion *x) { - unsigned long flags; - - raw_spin_lock_irqsave(&x->wait.lock, flags); - - if (x->done != UINT_MAX) - x->done++; - swake_up_locked(&x->wait); - raw_spin_unlock_irqrestore(&x->wait.lock, flags); + complete_with_flags(x, 0); } EXPORT_SYMBOL(complete); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1574989627b0..4d63e063608a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7029,7 +7029,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, void *key) { - WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC); + WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~(WF_SYNC|WF_CURRENT_CPU)); return try_to_wake_up(curr->private, mode, wake_flags); } EXPORT_SYMBOL(default_wake_function); diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index 76b9b796e695..72505cd3b60a 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -18,7 +18,7 @@ EXPORT_SYMBOL(__init_swait_queue_head); * If for some reason it would return 0, that means the previously waiting * task is already running, so it will observe condition true (or has already). */ -void swake_up_locked(struct swait_queue_head *q) +void swake_up_locked(struct swait_queue_head *q, int wake_flags) { struct swait_queue *curr; @@ -26,7 +26,7 @@ void swake_up_locked(struct swait_queue_head *q) return; curr = list_first_entry(&q->task_list, typeof(*curr), task_list); - wake_up_process(curr->task); + try_to_wake_up(curr->task, TASK_NORMAL, wake_flags); list_del_init(&curr->task_list); } EXPORT_SYMBOL(swake_up_locked); @@ -41,7 +41,7 @@ EXPORT_SYMBOL(swake_up_locked); void swake_up_all_locked(struct swait_queue_head *q) { while (!list_empty(&q->task_list)) - swake_up_locked(q); + swake_up_locked(q, 0); } void swake_up_one(struct swait_queue_head *q) @@ -49,7 +49,7 @@ void swake_up_one(struct swait_queue_head *q) unsigned long flags; raw_spin_lock_irqsave(&q->lock, flags); - swake_up_locked(q); + swake_up_locked(q, 0); raw_spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(swake_up_one); diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 48c53e4739ea..802d98cf2de3 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -161,6 +161,11 @@ int __wake_up(struct wait_queue_head *wq_head, unsigned int mode, } EXPORT_SYMBOL(__wake_up); +void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode, void *key) +{ + __wake_up_common_lock(wq_head, mode, 1, WF_CURRENT_CPU, key); +} + /* * Same as __wake_up but called with the spinlock in wait_queue_head_t held. */ From 48a1084a8b7423642b5f17ca6202f6f277c5392b Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 7 Mar 2023 23:31:59 -0800 Subject: [PATCH 04/10] seccomp: add the synchronous mode for seccomp_unotify MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit seccomp_unotify allows more privileged processes do actions on behalf of less privileged processes. In many cases, the workflow is fully synchronous. It means a target process triggers a system call and passes controls to a supervisor process that handles the system call and returns controls to the target process. In this context, "synchronous" means that only one process is running and another one is waiting. There is the WF_CURRENT_CPU flag that is used to advise the scheduler to move the wakee to the current CPU. For such synchronous workflows, it makes context switches a few times faster. Right now, each interaction takes 12µs. With this patch, it takes about 3µs. This change introduce the SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP flag that it used to enable the sync mode. Signed-off-by: Andrei Vagin Acked-by: "Peter Zijlstra (Intel)" Link: https://lore.kernel.org/r/20230308073201.3102738-5-avagin@google.com Signed-off-by: Kees Cook --- include/uapi/linux/seccomp.h | 4 ++++ kernel/seccomp.c | 31 +++++++++++++++++++++++++++++-- 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h index 0fdc6ef02b94..dbfc9b37fcae 100644 --- a/include/uapi/linux/seccomp.h +++ b/include/uapi/linux/seccomp.h @@ -115,6 +115,8 @@ struct seccomp_notif_resp { __u32 flags; }; +#define SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP (1UL << 0) + /* valid flags for seccomp_notif_addfd */ #define SECCOMP_ADDFD_FLAG_SETFD (1UL << 0) /* Specify remote fd */ #define SECCOMP_ADDFD_FLAG_SEND (1UL << 1) /* Addfd and return it, atomically */ @@ -150,4 +152,6 @@ struct seccomp_notif_addfd { #define SECCOMP_IOCTL_NOTIF_ADDFD SECCOMP_IOW(3, \ struct seccomp_notif_addfd) +#define SECCOMP_IOCTL_NOTIF_SET_FLAGS SECCOMP_IOW(4, __u64) + #endif /* _UAPI_LINUX_SECCOMP_H */ diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 1386dcedda1a..d3fdc0086168 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -143,9 +143,12 @@ struct seccomp_kaddfd { * filter->notify_lock. * @next_id: The id of the next request. * @notifications: A list of struct seccomp_knotif elements. + * @flags: A set of SECCOMP_USER_NOTIF_FD_* flags. */ + struct notification { atomic_t requests; + u32 flags; u64 next_id; struct list_head notifications; }; @@ -1117,7 +1120,10 @@ static int seccomp_do_user_notification(int this_syscall, INIT_LIST_HEAD(&n.addfd); atomic_inc(&match->notif->requests); - wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM); + if (match->notif->flags & SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP) + wake_up_poll_on_current_cpu(&match->wqh, EPOLLIN | EPOLLRDNORM); + else + wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM); /* * This is where we wait for a reply from userspace. @@ -1593,7 +1599,10 @@ static long seccomp_notify_send(struct seccomp_filter *filter, knotif->error = resp.error; knotif->val = resp.val; knotif->flags = resp.flags; - complete(&knotif->ready); + if (filter->notif->flags & SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP) + complete_on_current_cpu(&knotif->ready); + else + complete(&knotif->ready); out: mutex_unlock(&filter->notify_lock); return ret; @@ -1623,6 +1632,22 @@ static long seccomp_notify_id_valid(struct seccomp_filter *filter, return ret; } +static long seccomp_notify_set_flags(struct seccomp_filter *filter, + unsigned long flags) +{ + long ret; + + if (flags & ~SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP) + return -EINVAL; + + ret = mutex_lock_interruptible(&filter->notify_lock); + if (ret < 0) + return ret; + filter->notif->flags = flags; + mutex_unlock(&filter->notify_lock); + return 0; +} + static long seccomp_notify_addfd(struct seccomp_filter *filter, struct seccomp_notif_addfd __user *uaddfd, unsigned int size) @@ -1752,6 +1777,8 @@ static long seccomp_notify_ioctl(struct file *file, unsigned int cmd, case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR: case SECCOMP_IOCTL_NOTIF_ID_VALID: return seccomp_notify_id_valid(filter, buf); + case SECCOMP_IOCTL_NOTIF_SET_FLAGS: + return seccomp_notify_set_flags(filter, arg); } /* Extensible Argument ioctls */ From 8feae5adec17ec190118ad84da84a6ad9fa3e30b Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 7 Mar 2023 23:32:00 -0800 Subject: [PATCH 05/10] selftest/seccomp: add a new test for the sync mode of seccomp_user_notify Test output: # RUN global.user_notification_sync ... # OK global.user_notification_sync ok 51 global.user_notification_sync Signed-off-by: Andrei Vagin Acked-by: "Peter Zijlstra (Intel)" Link: https://lore.kernel.org/r/20230308073201.3102738-6-avagin@google.com Signed-off-by: Kees Cook --- tools/testing/selftests/seccomp/seccomp_bpf.c | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 43ec36b179dc..f6a04d88e02f 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -4255,6 +4255,61 @@ TEST(user_notification_addfd_rlimit) close(memfd); } +#ifndef SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP +#define SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP (1UL << 0) +#define SECCOMP_IOCTL_NOTIF_SET_FLAGS SECCOMP_IOW(4, __u64) +#endif + +TEST(user_notification_sync) +{ + struct seccomp_notif req = {}; + struct seccomp_notif_resp resp = {}; + int status, listener; + pid_t pid; + long ret; + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret) { + TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); + } + + listener = user_notif_syscall(__NR_getppid, + SECCOMP_FILTER_FLAG_NEW_LISTENER); + ASSERT_GE(listener, 0); + + /* Try to set invalid flags. */ + EXPECT_SYSCALL_RETURN(-EINVAL, + ioctl(listener, SECCOMP_IOCTL_NOTIF_SET_FLAGS, 0xffffffff, 0)); + + ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SET_FLAGS, + SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP, 0), 0); + + pid = fork(); + ASSERT_GE(pid, 0); + if (pid == 0) { + ret = syscall(__NR_getppid); + ASSERT_EQ(ret, USER_NOTIF_MAGIC) { + _exit(1); + } + _exit(0); + } + + req.pid = 0; + ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0); + + ASSERT_EQ(req.data.nr, __NR_getppid); + + resp.id = req.id; + resp.error = 0; + resp.val = USER_NOTIF_MAGIC; + resp.flags = 0; + ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0); + + ASSERT_EQ(waitpid(pid, &status, 0), pid); + ASSERT_EQ(status, 0); +} + + /* Make sure PTRACE_O_SUSPEND_SECCOMP requires CAP_SYS_ADMIN. */ FIXTURE(O_SUSPEND_SECCOMP) { pid_t pid; From 7d5cb68af638fc370e89df4c2d8d8c4708600e67 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 7 Mar 2023 23:32:01 -0800 Subject: [PATCH 06/10] perf/benchmark: add a new benchmark for seccom_unotify The benchmark is similar to the pipe benchmark. It creates two processes, one is calling syscalls, and another process is handling them via seccomp user notifications. It measures the time required to run a specified number of interations. $ ./perf bench sched seccomp-notify --sync-mode --loop 1000000 # Running 'sched/seccomp-notify' benchmark: # Executed 1000000 system calls Total time: 2.769 [sec] 2.769629 usecs/op 361059 ops/sec $ ./perf bench sched seccomp-notify # Running 'sched/seccomp-notify' benchmark: # Executed 1000000 system calls Total time: 8.571 [sec] 8.571119 usecs/op 116670 ops/sec Signed-off-by: Andrei Vagin Acked-by: "Peter Zijlstra (Intel)" Link: https://lore.kernel.org/r/20230308073201.3102738-7-avagin@google.com Link: https://lore.kernel.org/r/20230630051953.454638-1-avagin@gmail.com [kees: Added PRIu64 format string] Signed-off-by: Kees Cook --- tools/arch/x86/include/uapi/asm/unistd_32.h | 3 + tools/arch/x86/include/uapi/asm/unistd_64.h | 3 + tools/perf/bench/Build | 1 + tools/perf/bench/bench.h | 1 + tools/perf/bench/sched-seccomp-notify.c | 178 ++++++++++++++++++++ tools/perf/builtin-bench.c | 1 + 6 files changed, 187 insertions(+) create mode 100644 tools/perf/bench/sched-seccomp-notify.c diff --git a/tools/arch/x86/include/uapi/asm/unistd_32.h b/tools/arch/x86/include/uapi/asm/unistd_32.h index bc48a4dabe5d..4798f9d18fe8 100644 --- a/tools/arch/x86/include/uapi/asm/unistd_32.h +++ b/tools/arch/x86/include/uapi/asm/unistd_32.h @@ -26,3 +26,6 @@ #ifndef __NR_setns #define __NR_setns 346 #endif +#ifdef __NR_seccomp +#define __NR_seccomp 354 +#endif diff --git a/tools/arch/x86/include/uapi/asm/unistd_64.h b/tools/arch/x86/include/uapi/asm/unistd_64.h index f70d2cada256..d0f2043d7132 100644 --- a/tools/arch/x86/include/uapi/asm/unistd_64.h +++ b/tools/arch/x86/include/uapi/asm/unistd_64.h @@ -26,3 +26,6 @@ #ifndef __NR_getcpu #define __NR_getcpu 309 #endif +#ifndef __NR_seccomp +#define __NR_seccomp 317 +#endif diff --git a/tools/perf/bench/Build b/tools/perf/bench/Build index 0f158dc8139b..07bbc449329e 100644 --- a/tools/perf/bench/Build +++ b/tools/perf/bench/Build @@ -1,5 +1,6 @@ perf-y += sched-messaging.o perf-y += sched-pipe.o +perf-y += sched-seccomp-notify.o perf-y += syscall.o perf-y += mem-functions.o perf-y += futex-hash.o diff --git a/tools/perf/bench/bench.h b/tools/perf/bench/bench.h index 0d2b65976212..a0625c77bea3 100644 --- a/tools/perf/bench/bench.h +++ b/tools/perf/bench/bench.h @@ -21,6 +21,7 @@ extern struct timeval bench__start, bench__end, bench__runtime; int bench_numa(int argc, const char **argv); int bench_sched_messaging(int argc, const char **argv); int bench_sched_pipe(int argc, const char **argv); +int bench_sched_seccomp_notify(int argc, const char **argv); int bench_syscall_basic(int argc, const char **argv); int bench_syscall_getpgid(int argc, const char **argv); int bench_syscall_fork(int argc, const char **argv); diff --git a/tools/perf/bench/sched-seccomp-notify.c b/tools/perf/bench/sched-seccomp-notify.c new file mode 100644 index 000000000000..b04ebcde4036 --- /dev/null +++ b/tools/perf/bench/sched-seccomp-notify.c @@ -0,0 +1,178 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include "bench.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define LOOPS_DEFAULT 1000000UL +static uint64_t loops = LOOPS_DEFAULT; +static bool sync_mode; + +static const struct option options[] = { + OPT_U64('l', "loop", &loops, "Specify number of loops"), + OPT_BOOLEAN('s', "sync-mode", &sync_mode, + "Enable the synchronious mode for seccomp notifications"), + OPT_END() +}; + +static const char * const bench_seccomp_usage[] = { + "perf bench sched secccomp-notify ", + NULL +}; + +static int seccomp(unsigned int op, unsigned int flags, void *args) +{ + return syscall(__NR_seccomp, op, flags, args); +} + +static int user_notif_syscall(int nr, unsigned int flags) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, + offsetof(struct seccomp_data, nr)), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, nr, 0, 1), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_USER_NOTIF), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + + return seccomp(SECCOMP_SET_MODE_FILTER, flags, &prog); +} + +#define USER_NOTIF_MAGIC INT_MAX +static void user_notification_sync_loop(int listener) +{ + struct seccomp_notif_resp resp; + struct seccomp_notif req; + uint64_t nr; + + for (nr = 0; nr < loops; nr++) { + memset(&req, 0, sizeof(req)); + if (ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req)) + err(EXIT_FAILURE, "SECCOMP_IOCTL_NOTIF_RECV failed"); + + if (req.data.nr != __NR_gettid) + errx(EXIT_FAILURE, "unexpected syscall: %d", req.data.nr); + + resp.id = req.id; + resp.error = 0; + resp.val = USER_NOTIF_MAGIC; + resp.flags = 0; + if (ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp)) + err(EXIT_FAILURE, "SECCOMP_IOCTL_NOTIF_SEND failed"); + } +} + +#ifndef SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP +#define SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP (1UL << 0) +#define SECCOMP_IOCTL_NOTIF_SET_FLAGS SECCOMP_IOW(4, __u64) +#endif +int bench_sched_seccomp_notify(int argc, const char **argv) +{ + struct timeval start, stop, diff; + unsigned long long result_usec = 0; + int status, listener; + pid_t pid; + long ret; + + argc = parse_options(argc, argv, options, bench_seccomp_usage, 0); + + gettimeofday(&start, NULL); + + prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + listener = user_notif_syscall(__NR_gettid, + SECCOMP_FILTER_FLAG_NEW_LISTENER); + if (listener < 0) + err(EXIT_FAILURE, "can't create a notification descriptor"); + + pid = fork(); + if (pid < 0) + err(EXIT_FAILURE, "fork"); + if (pid == 0) { + if (prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0)) + err(EXIT_FAILURE, "can't set the parent death signal"); + while (1) { + ret = syscall(__NR_gettid); + if (ret == USER_NOTIF_MAGIC) + continue; + break; + } + _exit(1); + } + + if (sync_mode) { + if (ioctl(listener, SECCOMP_IOCTL_NOTIF_SET_FLAGS, + SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP, 0)) + err(EXIT_FAILURE, + "can't set SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP"); + } + user_notification_sync_loop(listener); + + kill(pid, SIGKILL); + if (waitpid(pid, &status, 0) != pid) + err(EXIT_FAILURE, "waitpid(%d) failed", pid); + if (!WIFSIGNALED(status) || WTERMSIG(status) != SIGKILL) + errx(EXIT_FAILURE, "unexpected exit code: %d", status); + + gettimeofday(&stop, NULL); + timersub(&stop, &start, &diff); + + switch (bench_format) { + case BENCH_FORMAT_DEFAULT: + printf("# Executed %" PRIu64 " system calls\n\n", + loops); + + result_usec = diff.tv_sec * USEC_PER_SEC; + result_usec += diff.tv_usec; + + printf(" %14s: %lu.%03lu [sec]\n\n", "Total time", + (unsigned long) diff.tv_sec, + (unsigned long) (diff.tv_usec / USEC_PER_MSEC)); + + printf(" %14lf usecs/op\n", + (double)result_usec / (double)loops); + printf(" %14d ops/sec\n", + (int)((double)loops / + ((double)result_usec / (double)USEC_PER_SEC))); + break; + + case BENCH_FORMAT_SIMPLE: + printf("%lu.%03lu\n", + (unsigned long) diff.tv_sec, + (unsigned long) (diff.tv_usec / USEC_PER_MSEC)); + break; + + default: + /* reaching here is something disaster */ + fprintf(stderr, "Unknown format:%d\n", bench_format); + exit(1); + break; + } + + return 0; +} diff --git a/tools/perf/builtin-bench.c b/tools/perf/builtin-bench.c index db435b791a09..5033e8bab276 100644 --- a/tools/perf/builtin-bench.c +++ b/tools/perf/builtin-bench.c @@ -47,6 +47,7 @@ static struct bench numa_benchmarks[] = { static struct bench sched_benchmarks[] = { { "messaging", "Benchmark for scheduling and IPC", bench_sched_messaging }, { "pipe", "Benchmark for pipe() between two processes", bench_sched_pipe }, + { "seccomp-notify", "Benchmark for seccomp user notify", bench_sched_seccomp_notify}, { "all", "Run all scheduler benchmarks", NULL }, { NULL, NULL, NULL } }; From fbc5d382407eb5d6a2eeef245cc2ca278d590645 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 2 Aug 2023 21:04:32 -0700 Subject: [PATCH 07/10] selftests/seccomp: Handle arm32 corner cases better It turns out arm32 doesn't handle syscall -1 gracefully, so skip testing for that. Additionally skip tests that depend on clone3 when it is not available (for example when building the seccomp selftests on an old arm image without clone3 headers). And improve error reporting for when nanosleep fails, as seen on arm32 since v5.15. Cc: Lecopzer Chen Signed-off-by: Kees Cook --- tools/testing/selftests/seccomp/seccomp_bpf.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index f6a04d88e02f..38f651469968 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -2184,6 +2184,9 @@ FIXTURE_TEARDOWN(TRACE_syscall) TEST(negative_ENOSYS) { +#if defined(__arm__) + SKIP(return, "arm32 does not support calling syscall -1"); +#endif /* * There should be no difference between an "internal" skip * and userspace asking for syscall "-1". @@ -3072,7 +3075,8 @@ TEST(syscall_restart) timeout.tv_sec = 1; errno = 0; EXPECT_EQ(0, nanosleep(&timeout, NULL)) { - TH_LOG("Call to nanosleep() failed (errno %d)", errno); + TH_LOG("Call to nanosleep() failed (errno %d: %s)", + errno, strerror(errno)); } /* Read final sync from parent. */ @@ -3908,6 +3912,9 @@ TEST(user_notification_filter_empty) TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); } + if (__NR_clone3 < 0) + SKIP(return, "Test not built with clone3 support"); + pid = sys_clone3(&args, sizeof(args)); ASSERT_GE(pid, 0); @@ -3962,6 +3969,9 @@ TEST(user_notification_filter_empty_threaded) TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); } + if (__NR_clone3 < 0) + SKIP(return, "Test not built with clone3 support"); + pid = sys_clone3(&args, sizeof(args)); ASSERT_GE(pid, 0); From cf007647475b5090819c5fe8da771073145c7334 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 10 Aug 2023 12:54:18 -0700 Subject: [PATCH 08/10] ARM: ptrace: Restore syscall restart tracing Since commit 4e57a4ddf6b0 ("ARM: 9107/1: syscall: always store thread_info->abi_syscall"), the seccomp selftests "syscall_restart" has been broken. This was caused by the restart syscall not being stored to "abi_syscall" during restart setup before branching to the "local_restart" label. Tracers would see the wrong syscall, and scno would get overwritten while returning from the TIF_WORK path. Add the missing store. Cc: Russell King Cc: Arnd Bergmann Cc: Lecopzer Chen Cc: Oleg Nesterov Cc: linux-arm-kernel@lists.infradead.org Fixes: 4e57a4ddf6b0 ("ARM: 9107/1: syscall: always store thread_info->abi_syscall") Reviewed-by: Arnd Bergmann Link: https://lore.kernel.org/r/20230810195422.2304827-1-keescook@chromium.org Signed-off-by: Kees Cook --- arch/arm/kernel/entry-common.S | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index bcc4c9ec3aa4..5c31e9de7a60 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -90,6 +90,7 @@ slow_work_pending: cmp r0, #0 beq no_work_pending movlt scno, #(__NR_restart_syscall - __NR_SYSCALL_BASE) + str scno, [tsk, #TI_ABI_SYSCALL] @ make sure tracers see update ldmia sp, {r0 - r6} @ have to reload r0 - r6 b local_restart @ ... and off we go ENDPROC(ret_fast_syscall) From 4697b5848bd933f68ebd04836362c8de0cacaf71 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 10 Aug 2023 12:54:19 -0700 Subject: [PATCH 09/10] ARM: ptrace: Restore syscall skipping for tracers Since commit 4e57a4ddf6b0 ("ARM: 9107/1: syscall: always store thread_info->abi_syscall"), the seccomp selftests "syscall_errno" and "syscall_faked" have been broken. Both seccomp and PTRACE depend on using the special value of "-1" for skipping syscalls. This value wasn't working because it was getting masked by __NR_SYSCALL_MASK in both PTRACE_SET_SYSCALL and get_syscall_nr(). Explicitly test for -1 in PTRACE_SET_SYSCALL and get_syscall_nr(), leaving it exposed when present, allowing tracers to skip syscalls again. Cc: Russell King Cc: Arnd Bergmann Cc: Lecopzer Chen Cc: Oleg Nesterov Cc: linux-arm-kernel@lists.infradead.org Fixes: 4e57a4ddf6b0 ("ARM: 9107/1: syscall: always store thread_info->abi_syscall") Reviewed-by: Arnd Bergmann Link: https://lore.kernel.org/r/20230810195422.2304827-2-keescook@chromium.org Signed-off-by: Kees Cook --- arch/arm/include/asm/syscall.h | 3 +++ arch/arm/kernel/ptrace.c | 5 +++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/arm/include/asm/syscall.h b/arch/arm/include/asm/syscall.h index dfeed440254a..fe4326d938c1 100644 --- a/arch/arm/include/asm/syscall.h +++ b/arch/arm/include/asm/syscall.h @@ -25,6 +25,9 @@ static inline int syscall_get_nr(struct task_struct *task, if (IS_ENABLED(CONFIG_AEABI) && !IS_ENABLED(CONFIG_OABI_COMPAT)) return task_thread_info(task)->abi_syscall; + if (task_thread_info(task)->abi_syscall == -1) + return -1; + return task_thread_info(task)->abi_syscall & __NR_SYSCALL_MASK; } diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c index 2d8e2516906b..fef32d73f912 100644 --- a/arch/arm/kernel/ptrace.c +++ b/arch/arm/kernel/ptrace.c @@ -783,8 +783,9 @@ long arch_ptrace(struct task_struct *child, long request, break; case PTRACE_SET_SYSCALL: - task_thread_info(child)->abi_syscall = data & - __NR_SYSCALL_MASK; + if (data != -1) + data &= __NR_SYSCALL_MASK; + task_thread_info(child)->abi_syscall = data; ret = 0; break; From 46822860a5a9a5a558475d323a55c8aab0b54012 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 17 Aug 2023 12:32:15 -0700 Subject: [PATCH 10/10] seccomp: Add missing kerndoc notations The kerndoc for some struct member and function arguments were missing. Add them. Cc: Andy Lutomirski Cc: Will Drewry Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202308171742.AncabIG1-lkp@intel.com/ Signed-off-by: Kees Cook --- kernel/seccomp.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index d3fdc0086168..255999ba9190 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -110,11 +110,13 @@ struct seccomp_knotif { * @flags: The flags for the new file descriptor. At the moment, only O_CLOEXEC * is allowed. * @ioctl_flags: The flags used for the seccomp_addfd ioctl. + * @setfd: whether or not SECCOMP_ADDFD_FLAG_SETFD was set during notify_addfd * @ret: The return value of the installing process. It is set to the fd num * upon success (>= 0). * @completion: Indicates that the installing process has completed fd * installation, or gone away (either due to successful * reply, or signal) + * @list: list_head for chaining seccomp_kaddfd together. * */ struct seccomp_kaddfd { @@ -138,12 +140,12 @@ struct seccomp_kaddfd { * structure is fairly large, we store the notification-specific stuff in a * separate structure. * - * @request: A semaphore that users of this notification can wait on for - * changes. Actual reads and writes are still controlled with - * filter->notify_lock. + * @requests: A semaphore that users of this notification can wait on for + * changes. Actual reads and writes are still controlled with + * filter->notify_lock. + * @flags: A set of SECCOMP_USER_NOTIF_FD_* flags. * @next_id: The id of the next request. * @notifications: A list of struct seccomp_knotif elements. - * @flags: A set of SECCOMP_USER_NOTIF_FD_* flags. */ struct notification { @@ -558,6 +560,8 @@ static void __seccomp_filter_release(struct seccomp_filter *orig) * drop its reference count, and notify * about unused filters * + * @tsk: task the filter should be released from. + * * This function should only be called when the task is exiting as * it detaches it from its filter tree. As such, READ_ONCE() and * barriers are not needed here, as would normally be needed. @@ -577,6 +581,8 @@ void seccomp_filter_release(struct task_struct *tsk) /** * seccomp_sync_threads: sets all threads to use current's filter * + * @flags: SECCOMP_FILTER_FLAG_* flags to set during sync. + * * Expects sighand and cred_guard_mutex locks to be held, and for * seccomp_can_sync_threads() to have returned success already * without dropping the locks.