/*	$NetBSD: kern_event.c,v 1.150 2023/09/21 09:31:50 msaitoh Exp $	*/

/*-
 * Copyright (c) 2008, 2009, 2021 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
 * Copyright (c) 2009 Apple, Inc
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * FreeBSD: src/sys/kern/kern_event.c,v 1.27 2001/07/05 17:10:44 rwatson Exp
 */

#ifdef _KERNEL_OPT
#include "opt_ddb.h"
#endif /* _KERNEL_OPT */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_event.c,v 1.150 2023/09/21 09:31:50 msaitoh Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/wait.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/select.h>
#include <sys/queue.h>
#include <sys/event.h>
#include <sys/eventvar.h>
#include <sys/poll.h>
#include <sys/kmem.h>
#include <sys/stat.h>
#include <sys/filedesc.h>
#include <sys/syscallargs.h>
#include <sys/kauth.h>
#include <sys/conf.h>
#include <sys/atomic.h>

static int	kqueue_scan(file_t *, size_t, struct kevent *,
		    const struct timespec *, register_t *,
		    const struct kevent_ops *, struct kevent *,
		    size_t);
static int	kqueue_ioctl(file_t *, u_long, void *);
static int	kqueue_fcntl(file_t *, u_int, void *);
static int	kqueue_poll(file_t *, int);
static int	kqueue_kqfilter(file_t *, struct knote *);
static int	kqueue_stat(file_t *, struct stat *);
static int	kqueue_close(file_t *);
static void	kqueue_restart(file_t *);
static int	kqueue_fpathconf(file_t *, int, register_t *);
static int	kqueue_register(struct kqueue *, struct kevent *);
static void	kqueue_doclose(struct kqueue *, struct klist *, int);

static void	knote_detach(struct knote *, filedesc_t *fdp, bool);
static void	knote_enqueue(struct knote *);
static void	knote_activate(struct knote *);
static void	knote_activate_locked(struct knote *);
static void	knote_deactivate_locked(struct knote *);

static void	filt_kqdetach(struct knote *);
static int	filt_kqueue(struct knote *, long hint);
static int	filt_procattach(struct knote *);
static void	filt_procdetach(struct knote *);
static int	filt_proc(struct knote *, long hint);
static int	filt_fileattach(struct knote *);
static void	filt_timerexpire(void *x);
static int	filt_timerattach(struct knote *);
static void	filt_timerdetach(struct knote *);
static int	filt_timer(struct knote *, long hint);
static int	filt_timertouch(struct knote *, struct kevent *, long type);
static int	filt_userattach(struct knote *);
static void	filt_userdetach(struct knote *);
static int	filt_user(struct knote *, long hint);
static int	filt_usertouch(struct knote *, struct kevent *, long type);

/*
 * Private knote state that should never be exposed outside
 * of kern_event.c.
 *
 * Field locking:
 *
 * q	kn_kq->kq_lock
 */
struct knote_impl {
	struct knote	ki_knote;
	unsigned int	ki_influx;	/* q: in-flux counter */
	kmutex_t	ki_foplock;	/* for kn_filterops */
};

#define	KIMPL_TO_KNOTE(kip)	(&(kip)->ki_knote)
#define	KNOTE_TO_KIMPL(knp)	container_of((knp), struct knote_impl, ki_knote)
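
/*
 * The two macros above are exact inverses: the public struct knote is
 * embedded in struct knote_impl, and container_of() recovers the private
 * wrapper from a pointer to its ki_knote member with no extra bookkeeping.
 */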

static inline struct knote *
knote_alloc(bool sleepok)
{
	struct knote_impl *ki;

	ki = kmem_zalloc(sizeof(*ki), sleepok ? KM_SLEEP : KM_NOSLEEP);
	if (__predict_false(ki == NULL)) {
		/* Only possible with KM_NOSLEEP; let the caller recover. */
		return NULL;
	}
	mutex_init(&ki->ki_foplock, MUTEX_DEFAULT, IPL_NONE);

	return KIMPL_TO_KNOTE(ki);
}

static inline void
knote_free(struct knote *kn)
{
	struct knote_impl *ki = KNOTE_TO_KIMPL(kn);

	mutex_destroy(&ki->ki_foplock);
	kmem_free(ki, sizeof(*ki));
}

static inline void
knote_foplock_enter(struct knote *kn)
{
	mutex_enter(&KNOTE_TO_KIMPL(kn)->ki_foplock);
}

static inline void
knote_foplock_exit(struct knote *kn)
{
	mutex_exit(&KNOTE_TO_KIMPL(kn)->ki_foplock);
}

static inline bool __diagused
knote_foplock_owned(struct knote *kn)
{
	return mutex_owned(&KNOTE_TO_KIMPL(kn)->ki_foplock);
}
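
/*
 * Illustrative sketch (not code used verbatim elsewhere): the
 * filter_attach()/filter_detach()/filter_event() wrappers later in this
 * file bracket calls into the backing object's filterops with the
 * foplock, roughly:
 *
 *	knote_foplock_enter(kn);
 *	rv = kn->kn_fop->f_event(kn, hint);
 *	knote_foplock_exit(kn);
 *
 * Holding the foplock across the call is what allows klist_fini() to
 * swap a dying backing object's filterops for the nop stubs below
 * without racing a thread already inside the old filter code.
 */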

static const struct fileops kqueueops = {
	.fo_name = "kqueue",
	.fo_read = (void *)enxio,
	.fo_write = (void *)enxio,
	.fo_ioctl = kqueue_ioctl,
	.fo_fcntl = kqueue_fcntl,
	.fo_poll = kqueue_poll,
	.fo_stat = kqueue_stat,
	.fo_close = kqueue_close,
	.fo_kqfilter = kqueue_kqfilter,
	.fo_restart = kqueue_restart,
	.fo_fpathconf = kqueue_fpathconf,
};

static void
filt_nopdetach(struct knote *kn __unused)
{
}

static int
filt_nopevent(struct knote *kn __unused, long hint __unused)
{
	return 0;
}

static const struct filterops nop_fd_filtops = {
	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach = NULL,
	.f_detach = filt_nopdetach,
	.f_event = filt_nopevent,
};

static const struct filterops nop_filtops = {
	.f_flags = FILTEROP_MPSAFE,
	.f_attach = NULL,
	.f_detach = filt_nopdetach,
	.f_event = filt_nopevent,
};
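
/*
 * These do-nothing filterops exist for klist_fini(): when a backing
 * object (e.g. a device's selinfo/klist) is torn down while knotes are
 * still registered, klist_fini() takes each knote's foplock and installs
 * these stubs, so the f_detach/f_event calls made when the lingering
 * file descriptors close land here rather than in freed driver memory.
 */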

static const struct filterops kqread_filtops = {
	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach = NULL,
	.f_detach = filt_kqdetach,
	.f_event = filt_kqueue,
};

static const struct filterops proc_filtops = {
	.f_flags = FILTEROP_MPSAFE,
	.f_attach = filt_procattach,
	.f_detach = filt_procdetach,
	.f_event = filt_proc,
};

/*
 * file_filtops is not marked MPSAFE because it's going to call
 * fileops::fo_kqfilter(), which might not be.  That function,
 * however, will override the knote's filterops, and thus will
 * inherit the MPSAFE-ness of the back-end at that time.
 */
static const struct filterops file_filtops = {
	.f_flags = FILTEROP_ISFD,
	.f_attach = filt_fileattach,
	.f_detach = NULL,
	.f_event = NULL,
};

static const struct filterops timer_filtops = {
	.f_flags = FILTEROP_MPSAFE,
	.f_attach = filt_timerattach,
	.f_detach = filt_timerdetach,
	.f_event = filt_timer,
	.f_touch = filt_timertouch,
};

static const struct filterops user_filtops = {
	.f_flags = FILTEROP_MPSAFE,
	.f_attach = filt_userattach,
	.f_detach = filt_userdetach,
	.f_event = filt_user,
	.f_touch = filt_usertouch,
};

static u_int	kq_ncallouts = 0;
static int	kq_calloutmax = (4 * 1024);

#define	KN_HASHSIZE		64		/* XXX should be tunable */
#define	KN_HASH(val, mask)	(((val) ^ (val >> 8)) & (mask))

extern const struct filterops fs_filtops;	/* vfs_syscalls.c */
extern const struct filterops sig_filtops;	/* kern_sig.c */

/*
 * Table for all system-defined filters.
 * These should be listed in the numeric order of the EVFILT_* defines.
 * If filtops is NULL, the filter isn't implemented in NetBSD.
 * End of list is when name is NULL.
 *
 * Note that 'refcnt' is meaningless for built-in filters.
 */
struct kfilter {
	const char	*name;		/* name of filter */
	uint32_t	filter;		/* id of filter */
	unsigned	refcnt;		/* reference count */
	const struct filterops *filtops;/* operations for filter */
	size_t		namelen;	/* length of name string */
};

/* System defined filters */
static struct kfilter sys_kfilters[] = {
	{ "EVFILT_READ",	EVFILT_READ,	0, &file_filtops, 0 },
	{ "EVFILT_WRITE",	EVFILT_WRITE,	0, &file_filtops, 0 },
	{ "EVFILT_AIO",		EVFILT_AIO,	0, NULL, 0 },
	{ "EVFILT_VNODE",	EVFILT_VNODE,	0, &file_filtops, 0 },
	{ "EVFILT_PROC",	EVFILT_PROC,	0, &proc_filtops, 0 },
	{ "EVFILT_SIGNAL",	EVFILT_SIGNAL,	0, &sig_filtops, 0 },
	{ "EVFILT_TIMER",	EVFILT_TIMER,	0, &timer_filtops, 0 },
	{ "EVFILT_FS",		EVFILT_FS,	0, &fs_filtops, 0 },
	{ "EVFILT_USER",	EVFILT_USER,	0, &user_filtops, 0 },
	{ "EVFILT_EMPTY",	EVFILT_EMPTY,	0, &file_filtops, 0 },
	{ NULL,			0,		0, NULL, 0 },
};

/* User defined kfilters */
static struct kfilter	*user_kfilters;		/* array */
static int		user_kfilterc;		/* current offset */
static int		user_kfiltermaxc;	/* max size so far */
static size_t		user_kfiltersz;		/* size of allocated memory */

/*
 * Global Locks.
 *
 * Lock order:
 *
 *	kqueue_filter_lock
 *	-> kn_kq->kq_fdp->fd_lock
 *	-> knote foplock (if taken)
 *	-> object lock (e.g., device driver lock, &c.)
 *	-> kn_kq->kq_lock
 *
 * Locking rules.  ==> indicates the lock is acquired by the backing
 * object, locks prior are acquired before calling filter ops:
 *
 *	f_attach: fdp->fd_lock -> knote foplock ->
 *	    (maybe) KERNEL_LOCK ==> backing object lock
 *
 *	f_detach: fdp->fd_lock -> knote foplock ->
 *	    (maybe) KERNEL_LOCK ==> backing object lock
 *
 *	f_event via kevent: fdp->fd_lock -> knote foplock ->
 *	    (maybe) KERNEL_LOCK ==> backing object lock
 *	    N.B. NOTE_SUBMIT will never be set in the "hint" argument
 *	    in this case.
 *
 *	f_event via knote (via backing object): Whatever caller guarantees.
 *	    Typically:
 *		f_event(NOTE_SUBMIT): caller has already acquired the
 *		    backing object's lock.
 *		f_event(!NOTE_SUBMIT): caller has not acquired the backing
 *		    object's lock, or has possibly acquired KERNEL_LOCK.
 *		    The backing object's lock may or may not be acquired
 *		    as-needed.
 *	    N.B. the knote foplock will **not** be acquired in this case.
 *	    The caller guarantees that klist_fini() will not be called
 *	    concurrently with knote().
 *
 *	f_touch: fdp->fd_lock -> kn_kq->kq_lock (spin lock)
 *	    N.B. knote foplock is **not** acquired in this case and
 *	    the caller must guarantee that klist_fini() will never
 *	    be called.  kevent_register() restricts filters that
 *	    provide f_touch to known-safe cases.
 *
 *	klist_fini(): Caller must guarantee that no more knotes can
 *	    be attached to the klist, and must **not** hold the backing
 *	    object's lock; klist_fini() itself will acquire the foplock
 *	    of each knote on the klist.
 *
 * Locking rules when detaching knotes:
 *
 * There are some situations where knote submission may require dropping
 * locks (see knote_proc_fork()).  In order to support this, it's possible
 * to mark a knote as being 'in-flux'.  Such a knote is guaranteed not to
 * be detached while it remains in-flux.  Because it will not be detached,
 * locks can be dropped so e.g. memory can be allocated, locks on other
 * data structures can be acquired, etc.  During this time, any attempt to
 * detach an in-flux knote must wait until the knote is no longer in-flux.
 * When this happens, the knote is marked for death (KN_WILLDETACH) and the
 * LWP that gets to finish the detach operation is recorded in the knote's
 * 'udata' field (which is no longer required for its original purpose once
 * a knote is so marked).  Code paths that lead to knote_detach() must
 * ensure that their LWP is the one tasked with its final demise after
 * waiting for the in-flux status of the knote to clear.  Note that once a
 * knote is marked KN_WILLDETACH, no code paths may put it into an in-flux
 * state.
 *
 * Once the special circumstances have been handled, the locks are re-
 * acquired in the proper order (object lock -> kq_lock), the knote taken
 * out of flux, and any waiters are notified.  Because waiters must have
 * also dropped *their* locks in order to safely block, they must re-
 * validate all of their assumptions; see knote_detach_quiesce().  See also
 * the kqueue_register() (EV_ADD, EV_DELETE) and kqueue_scan() (EV_ONESHOT)
 * cases.
 *
 * When kqueue_scan() encounters an in-flux knote, the situation is
 * treated like encountering another LWP's queue marker -- wait for the
 * flux to settle and carry on.
 *
 * LISTEN WELL: It is important to not hold knotes in flux for an
 * extended period of time!  In-flux knotes effectively block any
 * progress of the kqueue_scan() operation.  Any code paths that place
 * knotes in-flux should be careful to not block for indefinite periods
 * of time, such as for memory allocation (i.e. KM_NOSLEEP is OK, but
 * KM_SLEEP is not).
 */
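
/*
 * Condensed sketch of the in-flux pattern described above (illustrative
 * only; see kqueue_register(), kqueue_scan(), and knote_proc_fork() for
 * the real code paths):
 *
 *	mutex_spin_enter(&kq->kq_lock);
 *	if (kn_enter_flux(kn)) {
 *		mutex_spin_exit(&kq->kq_lock);
 *		...allocate memory (KM_NOSLEEP), take other locks...
 *		mutex_spin_enter(&kq->kq_lock);
 *		if (kn_leave_flux(kn))
 *			KQ_FLUX_WAKEUP(kq);
 *	}
 *	mutex_spin_exit(&kq->kq_lock);
 */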

static krwlock_t	kqueue_filter_lock;	/* lock on filter lists */

#define	KQ_FLUX_WAIT(kq)	(void)cv_wait(&kq->kq_cv, &kq->kq_lock)
#define	KQ_FLUX_WAKEUP(kq)	cv_broadcast(&kq->kq_cv)
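
/*
 * As with any condition variable, threads that must wait for a knote to
 * leave flux re-check the condition after waking, e.g. (sketch):
 *
 *	while (kn_in_flux(kn))
 *		KQ_FLUX_WAIT(kq);
 */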

static inline bool
kn_in_flux(struct knote *kn)
{
	KASSERT(mutex_owned(&kn->kn_kq->kq_lock));
	return KNOTE_TO_KIMPL(kn)->ki_influx != 0;
}
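
/*
 * Note: kn_enter_flux() refuses a knote already marked KN_WILLDETACH;
 * its detach has been claimed by some LWP, and putting it back into
 * flux would allow new work to race the knote's final demise.
 */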
static inline bool
kn_enter_flux(struct knote *kn)
{
	KASSERT(mutex_owned(&kn->kn_kq->kq_lock));

	if (kn->kn_status & KN_WILLDETACH) {
		return false;
	}

	struct knote_impl *ki = KNOTE_TO_KIMPL(kn);
	KASSERT(ki->ki_influx < UINT_MAX);
	ki->ki_influx++;

	return true;
}

static inline bool
kn_leave_flux(struct knote *kn)
{
	KASSERT(mutex_owned(&kn->kn_kq->kq_lock));

	struct knote_impl *ki = KNOTE_TO_KIMPL(kn);
	KASSERT(ki->ki_influx > 0);
	ki->ki_influx--;
	return ki->ki_influx == 0;
}

static void
kn_wait_flux(struct knote *kn, bool can_loop)
{
	struct knote_impl *ki = KNOTE_TO_KIMPL(kn);
	bool loop;

	KASSERT(mutex_owned(&kn->kn_kq->kq_lock));

	/*
	 * It may not be safe for us to touch the knote again after
	 * dropping the kq_lock.  The caller has let us know in
	 * 'can_loop'.
	 */
	for (loop = true; loop && ki->ki_influx != 0; loop = can_loop) {
		KQ_FLUX_WAIT(kn->kn_kq);
	}
}
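
/*
 * Illustrative sketch (not code from this file) of how a code path is
 * expected to use the flux protocol above when it must drop kq_lock
 * while still holding a reference to a knote.  KQ_FLUX_WAKEUP() is
 * assumed here to be the wakeup counterpart of the KQ_FLUX_WAIT() used
 * by kn_wait_flux():
 *
 *	mutex_spin_enter(&kq->kq_lock);
 *	if (!kn_enter_flux(kn)) {
 *		// Detach pending; the knote may vanish once we drop
 *		// the lock, so forget about it.
 *		mutex_spin_exit(&kq->kq_lock);
 *		return;
 *	}
 *	mutex_spin_exit(&kq->kq_lock);
 *
 *	// Safe to block, allocate, or take other locks here; the
 *	// knote is guaranteed not to be detached/deleted while
 *	// it is in-flux.
 *
 *	mutex_spin_enter(&kq->kq_lock);
 *	if (kn_leave_flux(kn)) {
 *		// Last one out wakes threads parked in kn_wait_flux().
 *		KQ_FLUX_WAKEUP(kq);
 *	}
 *	mutex_spin_exit(&kq->kq_lock);
 */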

#define	KNOTE_WILLDETACH(kn)						\
do {									\
	(kn)->kn_status |= KN_WILLDETACH;				\
	(kn)->kn_kevent.udata = curlwp;					\
} while (/*CONSTCOND*/0)

/*
 * Wait until the specified knote is in a quiescent state and
 * safe to detach.  Returns true if we potentially blocked (and
 * thus dropped our locks).
 */
static bool
knote_detach_quiesce(struct knote *kn)
{
	struct kqueue *kq = kn->kn_kq;
	filedesc_t *fdp = kq->kq_fdp;

	KASSERT(mutex_owned(&fdp->fd_lock));

	mutex_spin_enter(&kq->kq_lock);
	/*
	 * There are two cases where we might see KN_WILLDETACH here:
	 *
	 * 1. Someone else has already started detaching the knote but
	 *    had to wait for it to settle first.
	 *
	 * 2. We had to wait for it to settle, and had to come back
	 *    around after re-acquiring the locks.
	 *
	 * When KN_WILLDETACH is set, we also set the LWP that claimed
	 * the prize of finishing the detach in the 'udata' field of the
	 * knote (which will never be used again for its usual purpose
	 * once the note is in this state).  If it doesn't point to us,
	 * we must drop the locks and let them in to finish the job.
	 *
	 * Otherwise, once we have claimed the knote for ourselves, we
	 * can finish waiting for it to settle.  This is the only
	 * scenario where touching a detaching knote is safe after
	 * dropping the locks.
	 */
	if ((kn->kn_status & KN_WILLDETACH) != 0 &&
	    kn->kn_kevent.udata != curlwp) {
		/*
		 * N.B. it is NOT safe for us to touch the knote again
		 * after dropping the locks here.  The caller must go
		 * back around and re-validate everything.  However, if
		 * the knote is in-flux, we want to block to minimize
		 * busy-looping.
		 */
		mutex_exit(&fdp->fd_lock);
		if (kn_in_flux(kn)) {
			kn_wait_flux(kn, false);
			mutex_spin_exit(&kq->kq_lock);
			return true;
		}
		mutex_spin_exit(&kq->kq_lock);
		preempt_point();
		return true;
	}
	/*
	 * If we get here, we know that we will be claiming the
	 * detach responsibilities, or that we already have and
	 * this is the second attempt after re-validation.
	 */
	KASSERT((kn->kn_status & KN_WILLDETACH) == 0 ||
	    kn->kn_kevent.udata == curlwp);
	/*
	 * Similarly, if we get here, either we are just claiming it
	 * and may have to wait for it to settle, or if this is the
	 * second attempt after re-validation that no other code paths
	 * have put it in-flux.
	 */
	KASSERT((kn->kn_status & KN_WILLDETACH) == 0 ||
	    kn_in_flux(kn) == false);
	KNOTE_WILLDETACH(kn);
	if (kn_in_flux(kn)) {
		mutex_exit(&fdp->fd_lock);
		kn_wait_flux(kn, true);
		/*
		 * It is safe for us to touch the knote again after
		 * dropping the locks, but the caller must still
		 * re-validate everything because other aspects of
		 * the environment may have changed while we blocked.
		 */
		KASSERT(kn_in_flux(kn) == false);
		mutex_spin_exit(&kq->kq_lock);
		return true;
	}
	mutex_spin_exit(&kq->kq_lock);

	return false;
}
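
/*
 * Sketch of the calling pattern this contract implies (assumed; the
 * callers live elsewhere in this file): fd_lock is held on entry, and
 * a 'true' return means our locks were dropped, so the caller must
 * re-validate everything and retry:
 *
 *	again:
 *		mutex_enter(&fdp->fd_lock);
 *		// ... re-validate that the knote is still registered ...
 *		if (knote_detach_quiesce(kn)) {
 *			// We (potentially) blocked and dropped our
 *			// locks; start over.
 *			goto again;
 *		}
 *		// fd_lock is still held, we own the detach, and the
 *		// knote is quiescent: safe to actually detach it.
 */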

/*
 * Calls into the filterops need to be resilient against things which
 * destroy a klist, e.g. device detach, freeing a vnode, etc., to avoid
 * chasing garbage pointers (to data, or even potentially code in a
 * module about to be unloaded).  To that end, we acquire the
 * knote foplock before calling into the filter ops.  When a driver
 * (or anything else) is tearing down its klist, klist_fini() enumerates
 * each knote, acquires its foplock, and replaces the filterops with a
 * nop stub, allowing knote detach (when descriptors are closed) to safely
 * proceed.
 */

static int
filter_attach(struct knote *kn)
{
	int rv;

	KASSERT(knote_foplock_owned(kn));
	KASSERT(kn->kn_fop != NULL);
	KASSERT(kn->kn_fop->f_attach != NULL);

	/*
	 * N.B. that kn->kn_fop may change as the result of calling
	 * f_attach().  After f_attach() returns, kn->kn_fop may not
	 * be modified by code outside of klist_fini().
	 */
	if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
		rv = kn->kn_fop->f_attach(kn);
	} else {
		KERNEL_LOCK(1, NULL);
		rv = kn->kn_fop->f_attach(kn);
		KERNEL_UNLOCK_ONE(NULL);
	}

	return rv;
}

static void
filter_detach(struct knote *kn)
{
	KASSERT(knote_foplock_owned(kn));
	KASSERT(kn->kn_fop != NULL);
	KASSERT(kn->kn_fop->f_detach != NULL);

	if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
		kn->kn_fop->f_detach(kn);
	} else {
		KERNEL_LOCK(1, NULL);
		kn->kn_fop->f_detach(kn);
		KERNEL_UNLOCK_ONE(NULL);
	}
}

static int
filter_event(struct knote *kn, long hint, bool submitting)
{
	int rv;

	/* See knote(). */
	KASSERT(submitting || knote_foplock_owned(kn));
	KASSERT(kn->kn_fop != NULL);
	KASSERT(kn->kn_fop->f_event != NULL);

	if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
		rv = kn->kn_fop->f_event(kn, hint);
	} else {
		KERNEL_LOCK(1, NULL);
		rv = kn->kn_fop->f_event(kn, hint);
		KERNEL_UNLOCK_ONE(NULL);
	}

	return rv;
}

static int
filter_touch(struct knote *kn, struct kevent *kev, long type)
{
	/*
	 * XXX We cannot assert that the knote foplock is held here
	 * XXX because we cannot safely acquire it in all cases
	 * XXX where "touch" will be used in kqueue_scan().  We just
	 * XXX have to assume that f_touch will always be safe to call,
	 * XXX and kqueue_register() allows only the two known-safe
	 * XXX users of that op.
	 */

	KASSERT(kn->kn_fop != NULL);
	KASSERT(kn->kn_fop->f_touch != NULL);

	return kn->kn_fop->f_touch(kn, kev, type);
}
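
/*
 * For reference, a hypothetical filterops definition illustrating what
 * the dispatch in filter_attach/detach/event keys on: filters that set
 * FILTEROP_MPSAFE are called directly, while all others are bracketed
 * by KERNEL_LOCK()/KERNEL_UNLOCK_ONE().  The xyz names are placeholders,
 * not functions defined in this file:
 *
 *	static const struct filterops xyz_filtops = {
 *		.f_flags = FILTEROP_MPSAFE,
 *		.f_attach = filt_xyzattach,
 *		.f_detach = filt_xyzdetach,
 *		.f_event = filt_xyz,
 *	};
 */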

static kauth_listener_t	kqueue_listener;

static int
kqueue_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
    void *arg0, void *arg1, void *arg2, void *arg3)
{
	struct proc *p;
	int result;

	result = KAUTH_RESULT_DEFER;
	p = arg0;

	if (action != KAUTH_PROCESS_KEVENT_FILTER)
		return result;

	if ((kauth_cred_getuid(p->p_cred) != kauth_cred_getuid(cred) ||
	    ISSET(p->p_flag, PK_SUGID)))
		return result;

	result = KAUTH_RESULT_ALLOW;

	return result;
}

/*
 * Initialize the kqueue subsystem.
 */
void
kqueue_init(void)
{

	rw_init(&kqueue_filter_lock);

	kqueue_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
	    kqueue_listener_cb, NULL);
}

/*
 * Find kfilter entry by name, or NULL if not found.
 */
static struct kfilter *
kfilter_byname_sys(const char *name)
{
	int i;

	KASSERT(rw_lock_held(&kqueue_filter_lock));

	for (i = 0; sys_kfilters[i].name != NULL; i++) {
		if (strcmp(name, sys_kfilters[i].name) == 0)
			return &sys_kfilters[i];
	}
	return NULL;
}

static struct kfilter *
kfilter_byname_user(const char *name)
{
	int i;

	KASSERT(rw_lock_held(&kqueue_filter_lock));

	/* user filter slots have a NULL name if previously deregistered */
	for (i = 0; i < user_kfilterc; i++) {
		if (user_kfilters[i].name != NULL &&
		    strcmp(name, user_kfilters[i].name) == 0)
			return &user_kfilters[i];
	}
	return NULL;
}

static struct kfilter *
kfilter_byname(const char *name)
{
	struct kfilter *kfilter;

	KASSERT(rw_lock_held(&kqueue_filter_lock));

	if ((kfilter = kfilter_byname_sys(name)) != NULL)
		return kfilter;

	return kfilter_byname_user(name);
}

/*
 * Find kfilter entry by filter id, or NULL if not found.
 * Assumes entries are indexed in filter id order, for speed.
 */
static struct kfilter *
kfilter_byfilter(uint32_t filter)
{
	struct kfilter *kfilter;

	KASSERT(rw_lock_held(&kqueue_filter_lock));

	if (filter < EVFILT_SYSCOUNT)	/* it's a system filter */
		kfilter = &sys_kfilters[filter];
	else if (user_kfilters != NULL &&
	    filter < EVFILT_SYSCOUNT + user_kfilterc)
		/* it's a user filter */
		kfilter = &user_kfilters[filter - EVFILT_SYSCOUNT];
	else
		return (NULL);		/* out of range */
	KASSERT(kfilter->filter == filter);	/* sanity check! */
	return (kfilter);
}

/*
 * Register a new kfilter.  Stores the entry in user_kfilters.
 * Returns 0 if operation succeeded, or an appropriate errno(2) otherwise.
 * If retfilter != NULL, the new filterid is returned in it.
 */
int
kfilter_register(const char *name, const struct filterops *filtops,
    int *retfilter)
{
	struct kfilter *kfilter;
	size_t len;
	int i;

	if (name == NULL || name[0] == '\0' || filtops == NULL)
		return (EINVAL);	/* invalid args */

	rw_enter(&kqueue_filter_lock, RW_WRITER);
	if (kfilter_byname(name) != NULL) {
		rw_exit(&kqueue_filter_lock);
		return (EEXIST);	/* already exists */
	}
	if (user_kfilterc > 0xffffffff - EVFILT_SYSCOUNT) {
		rw_exit(&kqueue_filter_lock);
		return (EINVAL);	/* too many */
	}

	for (i = 0; i < user_kfilterc; i++) {
		kfilter = &user_kfilters[i];
		if (kfilter->name == NULL) {
			/* Previously deregistered slot.  Reuse. */
			goto reuse;
		}
	}

	/* check if need to grow user_kfilters */
	if (user_kfilterc + 1 > user_kfiltermaxc) {
		/* Grow in KFILTER_EXTENT chunks. */
		user_kfiltermaxc += KFILTER_EXTENT;
		len = user_kfiltermaxc * sizeof(*kfilter);
		kfilter = kmem_alloc(len, KM_SLEEP);
		memset((char *)kfilter + user_kfiltersz, 0, len - user_kfiltersz);
		if (user_kfilters != NULL) {
			memcpy(kfilter, user_kfilters, user_kfiltersz);
			kmem_free(user_kfilters, user_kfiltersz);
		}
		user_kfiltersz = len;
		user_kfilters = kfilter;
	}
	/* Adding new slot */
	kfilter = &user_kfilters[user_kfilterc++];
reuse:
	kfilter->name = kmem_strdupsize(name, &kfilter->namelen, KM_SLEEP);

	kfilter->filter = (kfilter - user_kfilters) + EVFILT_SYSCOUNT;

	kfilter->filtops = kmem_alloc(sizeof(*filtops), KM_SLEEP);
	memcpy(__UNCONST(kfilter->filtops), filtops, sizeof(*filtops));

	if (retfilter != NULL)
		*retfilter = kfilter->filter;
	rw_exit(&kqueue_filter_lock);

	return (0);
}
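
/*
 * Hypothetical caller (e.g. a driver or module, not code from this
 * file) registering a user kfilter; the returned id is what userland
 * would then place in the 'filter' field of a struct kevent:
 *
 *	int filterid, error;
 *
 *	error = kfilter_register("mydev_filter", &mydev_filtops,
 *	    &filterid);
 *	if (error != 0)
 *		return error;
 */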

/*
 * Unregister a kfilter previously registered with kfilter_register.
 * This retains the filter id, but clears the name and frees filtops (filter
 * operations), so that the number isn't reused during a boot.
 * Returns 0 if operation succeeded, or an appropriate errno(2) otherwise.
 */
int
kfilter_unregister(const char *name)
{
	struct kfilter *kfilter;

	if (name == NULL || name[0] == '\0')
		return (EINVAL);	/* invalid name */

	rw_enter(&kqueue_filter_lock, RW_WRITER);
	if (kfilter_byname_sys(name) != NULL) {
		rw_exit(&kqueue_filter_lock);
		return (EINVAL);	/* can't detach system filters */
	}

	kfilter = kfilter_byname_user(name);
	if (kfilter == NULL) {
		rw_exit(&kqueue_filter_lock);
		return (ENOENT);
	}
	if (kfilter->refcnt != 0) {
		rw_exit(&kqueue_filter_lock);
		return (EBUSY);
	}

	/* Cast away const (but we know it's safe). */
	kmem_free(__UNCONST(kfilter->name), kfilter->namelen);
	kfilter->name = NULL;	/* mark as `not implemented' */

	if (kfilter->filtops != NULL) {
		/* Cast away const (but we know it's safe). */
		kmem_free(__UNCONST(kfilter->filtops),
		    sizeof(*kfilter->filtops));
		kfilter->filtops = NULL;	/* mark as `not implemented' */
	}
	rw_exit(&kqueue_filter_lock);

	return (0);
}

/*
 * Filter attach method for EVFILT_READ and EVFILT_WRITE on normal file
 * descriptors.  Calls fileops kqfilter method for given file descriptor.
 */
static int
filt_fileattach(struct knote *kn)
{
	file_t *fp;

	fp = kn->kn_obj;

	return (*fp->f_ops->fo_kqfilter)(fp, kn);
}

/*
 * Filter detach method for EVFILT_READ on kqueue descriptor.
 */
static void
filt_kqdetach(struct knote *kn)
{
	struct kqueue *kq;

	kq = ((file_t *)kn->kn_obj)->f_kqueue;

	mutex_spin_enter(&kq->kq_lock);
	selremove_knote(&kq->kq_sel, kn);
	mutex_spin_exit(&kq->kq_lock);
}

/*
 * Filter event method for EVFILT_READ on kqueue descriptor.
 */
/*ARGSUSED*/
static int
filt_kqueue(struct knote *kn, long hint)
{
	struct kqueue *kq;
	int rv;

	kq = ((file_t *)kn->kn_obj)->f_kqueue;

	if (hint != NOTE_SUBMIT)
		mutex_spin_enter(&kq->kq_lock);
	kn->kn_data = KQ_COUNT(kq);
	rv = (kn->kn_data > 0);
	if (hint != NOTE_SUBMIT)
		mutex_spin_exit(&kq->kq_lock);

	return rv;
}
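
/*
 * Because of this event method, a kqueue descriptor is itself readable
 * whenever events are pending on it, so kqueues can be nested.  A
 * minimal user-space sketch using the documented kevent(2) interface:
 *
 *	int inner = kqueue();
 *	int outer = kqueue();
 *	struct kevent ev;
 *
 *	EV_SET(&ev, inner, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	if (kevent(outer, &ev, 1, NULL, 0, NULL) == -1)
 *		err(1, "kevent");
 *	// Once 'inner' has pending events, 'outer' reports it ready,
 *	// with ev.data set to the number of events pending on 'inner'.
 */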

/*
 * Filter attach method for EVFILT_PROC.
 */
static int
filt_procattach(struct knote *kn)
{
	struct proc *p;

	mutex_enter(&proc_lock);
	p = proc_find(kn->kn_id);
	if (p == NULL) {
		mutex_exit(&proc_lock);
		return ESRCH;
	}

	/*
	 * Fail if it's not owned by you, or the last exec gave us
	 * setuid/setgid privs (unless you're root).
	 */
	mutex_enter(p->p_lock);
	mutex_exit(&proc_lock);
	if (kauth_authorize_process(curlwp->l_cred,
	    KAUTH_PROCESS_KEVENT_FILTER, p, NULL, NULL, NULL) != 0) {
		mutex_exit(p->p_lock);
		return EACCES;
	}

	kn->kn_obj = p;
	kn->kn_flags |= EV_CLEAR;	/* automatically set */

	/*
	 * NOTE_CHILD is only ever generated internally; don't let it
	 * leak in from user-space.  See knote_proc_fork_track().
	 */
	kn->kn_sfflags &= ~NOTE_CHILD;

	klist_insert(&p->p_klist, kn);
	mutex_exit(p->p_lock);

	return 0;
}
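
/*
 * A user-space sketch of attaching this filter via the documented
 * kevent(2) interface ('kq' is a kqueue descriptor and 'pid' a process
 * the caller owns):
 *
 *	struct kevent ev;
 *
 *	EV_SET(&ev, pid, EVFILT_PROC, EV_ADD,
 *	    NOTE_EXIT | NOTE_FORK | NOTE_TRACK, 0, NULL);
 *	if (kevent(kq, &ev, 1, NULL, 0, NULL) == -1)
 *		err(1, "kevent");
 *	// EV_CLEAR is implied (set automatically above).  Children are
 *	// then reported via kernel-generated NOTE_CHILD events; as the
 *	// comment above notes, NOTE_CHILD cannot be requested directly.
 */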
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Filter detach method for EVFILT_PROC.
|
|
|
|
*
|
|
|
|
* The knote may be attached to a different process, which may exit,
|
|
|
|
* leaving nothing for the knote to be attached to. So when the process
|
|
|
|
* exits, the knote is marked as DETACHED and also flagged as ONESHOT so
|
|
|
|
* it will be deleted when read out. However, as part of the knote deletion,
|
|
|
|
* this routine is called, so a check is needed to avoid actually performing
|
|
|
|
* a detach, because the original process might not exist any more.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
filt_procdetach(struct knote *kn)
|
|
|
|
{
	struct kqueue *kq = kn->kn_kq;
	struct proc *p;

	/*
	 * We have to synchronize with knote_proc_exit(), but we
	 * are forced to acquire the locks in the wrong order here
	 * because we can't be sure kn->kn_obj is valid unless
	 * KN_DETACHED is not set.
	 */
 again:
	mutex_spin_enter(&kq->kq_lock);
	if ((kn->kn_status & KN_DETACHED) == 0) {
		p = kn->kn_obj;
		if (!mutex_tryenter(p->p_lock)) {
			mutex_spin_exit(&kq->kq_lock);
			preempt_point();
			goto again;
		}
		kn->kn_status |= KN_DETACHED;
		klist_remove(&p->p_klist, kn);
		mutex_exit(p->p_lock);
	}
	mutex_spin_exit(&kq->kq_lock);
}

/*
 * Filter event method for EVFILT_PROC.
 *
 * Due to some of the complexities of process locking, we have special
 * entry points for delivering knote submissions. filt_proc() is used
 * only to check for activation from kqueue_register() and kqueue_scan().
 */
static int
filt_proc(struct knote *kn, long hint)
{
	struct kqueue *kq = kn->kn_kq;
	uint32_t fflags;

	/*
	 * Because we share the same klist with signal knotes, just
	 * ensure that we're not being invoked for the proc-related
	 * submissions.
	 */
	KASSERT((hint & (NOTE_EXEC | NOTE_EXIT | NOTE_FORK)) == 0);

	mutex_spin_enter(&kq->kq_lock);
	fflags = kn->kn_fflags;
	mutex_spin_exit(&kq->kq_lock);

	return fflags != 0;
}
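
/*
 * Minimal userland sketch (not compiled here) of how a consumer might
 * exercise EVFILT_PROC with NOTE_TRACK via kevent(2). The function
 * name is illustrative, 'kq' is assumed to come from kqueue(2) and
 * 'child' from an ordinary fork(); error handling is elided.
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>

static void
watch_child_example(int kq, pid_t child)
{
	struct kevent ev;

	/*
	 * Watch the child for exit and fork; NOTE_TRACK extends the
	 * watch to any children it subsequently creates.
	 */
	EV_SET(&ev, child, EVFILT_PROC, EV_ADD | EV_ENABLE,
	    NOTE_EXIT | NOTE_FORK | NOTE_TRACK, 0, 0);
	(void)kevent(kq, &ev, 1, NULL, 0, NULL);

	for (;;) {
		if (kevent(kq, NULL, 0, &ev, 1, NULL) < 1)
			break;
		if (ev.fflags & NOTE_CHILD) {
			/* ev.ident: new child's pid; ev.data: parent's pid. */
		}
		if (ev.fflags & NOTE_EXIT) {
			/* ev.data: exit status of process ev.ident. */
		}
	}
}
#endif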

void
knote_proc_exec(struct proc *p)
{
	struct knote *kn, *tmpkn;
	struct kqueue *kq;
	uint32_t fflags;

mutex_enter(p->p_lock);
|
|
|
|
|
|
|
|
SLIST_FOREACH_SAFE(kn, &p->p_klist, kn_selnext, tmpkn) {
|
|
|
|
/* N.B. EVFILT_SIGNAL knotes are on this same list. */
|
|
|
|
if (kn->kn_fop == &sig_filtops) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
KASSERT(kn->kn_fop == &proc_filtops);
|
|
|
|
|
|
|
|
kq = kn->kn_kq;
|
2008-03-22 00:53:35 +03:00
|
|
|
mutex_spin_enter(&kq->kq_lock);
|
Changes to make EVFILT_PROC MP-safe:
Because the locking protocol around processes is somewhat complex
compared to other events that can be posted on kqueues, introduce
new functions for posting NOTE_EXEC, NOTE_EXIT, and NOTE_FORK,
rather than just using the generic knote() function. These functions
KASSERT() their locking expectations, and deal with other complexities
for each situation.
knote_proc_fork(), in particiular, needs to handle NOTE_TRACK, which
requires allocation of a new knote to attach to the child process. We
don't want to be allocating memory while holding the parent's p_lock.
Furthermore, we also have to attach the tracking note to the child
process, which means we have to acquire the child's p_lock.
So, to handle all this, we introduce some additional synchronization
infrastructure around the 'knote' structure:
- Add the ability to mark a knote as being in a state of flux. Knotes
in this state are guaranteed not to be detached/deleted, thus allowing
a code path drop other locks after putting a knote in this state.
- Code paths that wish to detach/delete a knote must first check if the
knote is in-flux. If so, they must wait for it to quiesce. Because
multiple threads of execution may attempt this concurrently, a mechanism
exists for a single LWP to claim the detach responsibility; all other
threads simply wait for the knote to disappear before they can make
further progress.
- When kqueue_scan() encounters an in-flux knote, it simply treats the
situation just like encountering another thread's queue marker -- wait
for the flux to settle and continue on.
(The "in-flux knote" idea was inspired by FreeBSD, but this works differently
from their implementation, as the two kqueue implementations have diverged
quite a bit.)
knote_proc_fork() uses this infrastructure to implement NOTE_TRACK like so:
- Attempt to put the original tracking knote into a state of flux; if this
fails (because the note has a detach pending), we skip all processing
(the original process has lost interest, and we simply won the race).
- Once the note is in-flux, drop the kq and forking process's locks, and
allocate 2 knotes: one to post the NOTE_CHILD event, and one to attach
a new NOTE_TRACK to the child process. Notably, we do NOT go through
kqueue_register() to do this, but rather do all of the work directly
and KASSERT() our assumptions; this allows us to directly control our
interaction with locks. All memory allocations here are performed with
KM_NOSLEEP, in order to prevent holding the original knote in-flux
indefinitely.
- Because the NOTE_TRACK use case adds knotes to kqueues through a
sort of back-door mechanism, we must serialize with the closing of
the destination kqueue's file descriptor, so steal another bit from
the kq_count field to notify other threads that a kqueue is on its
way out to prevent new knotes from being enqueued while the close
path detaches them.
In addition to fixing EVFILT_PROC's reliance on KERNEL_LOCK, this also
fixes a long-standing bug whereby a NOTE_CHILD event could be dropped
if the child process exited before the interested process received the
NOTE_CHILD event (the same knote would be used to deliver the NOTE_EXIT
event, and would clobber the NOTE_CHILD's 'data' field).
Add a bunch of comments to explain what's going on in various critical
sections, and sprinkle additional KASSERT()s to validate assumptions
in several more locations.
2021-10-10 21:07:51 +03:00
|
|
|
fflags = (kn->kn_fflags |= (kn->kn_sfflags & NOTE_EXEC));
|
|
|
|
if (fflags) {
|
2021-10-21 03:54:15 +03:00
|
|
|
knote_activate_locked(kn);
|
Changes to make EVFILT_PROC MP-safe:
Because the locking protocol around processes is somewhat complex
compared to other events that can be posted on kqueues, introduce
new functions for posting NOTE_EXEC, NOTE_EXIT, and NOTE_FORK,
rather than just using the generic knote() function. These functions
KASSERT() their locking expectations, and deal with other complexities
for each situation.
knote_proc_fork(), in particiular, needs to handle NOTE_TRACK, which
requires allocation of a new knote to attach to the child process. We
don't want to be allocating memory while holding the parent's p_lock.
Furthermore, we also have to attach the tracking note to the child
process, which means we have to acquire the child's p_lock.
So, to handle all this, we introduce some additional synchronization
infrastructure around the 'knote' structure:
- Add the ability to mark a knote as being in a state of flux. Knotes
in this state are guaranteed not to be detached/deleted, thus allowing
a code path drop other locks after putting a knote in this state.
- Code paths that wish to detach/delete a knote must first check if the
knote is in-flux. If so, they must wait for it to quiesce. Because
multiple threads of execution may attempt this concurrently, a mechanism
exists for a single LWP to claim the detach responsibility; all other
threads simply wait for the knote to disappear before they can make
further progress.
- When kqueue_scan() encounters an in-flux knote, it simply treats the
situation just like encountering another thread's queue marker -- wait
for the flux to settle and continue on.
(The "in-flux knote" idea was inspired by FreeBSD, but this works differently
from their implementation, as the two kqueue implementations have diverged
quite a bit.)
knote_proc_fork() uses this infrastructure to implement NOTE_TRACK like so:
- Attempt to put the original tracking knote into a state of flux; if this
fails (because the note has a detach pending), we skip all processing
(the original process has lost interest, and we simply won the race).
- Once the note is in-flux, drop the kq and forking process's locks, and
allocate 2 knotes: one to post the NOTE_CHILD event, and one to attach
a new NOTE_TRACK to the child process. Notably, we do NOT go through
kqueue_register() to do this, but rather do all of the work directly
and KASSERT() our assumptions; this allows us to directly control our
interaction with locks. All memory allocations here are performed with
KM_NOSLEEP, in order to prevent holding the original knote in-flux
indefinitely.
- Because the NOTE_TRACK use case adds knotes to kqueues through a
sort of back-door mechanism, we must serialize with the closing of
the destination kqueue's file descriptor, so steal another bit from
the kq_count field to notify other threads that a kqueue is on its
way out to prevent new knotes from being enqueued while the close
path detaches them.
In addition to fixing EVFILT_PROC's reliance on KERNEL_LOCK, this also
fixes a long-standing bug whereby a NOTE_CHILD event could be dropped
if the child process exited before the interested process received the
NOTE_CHILD event (the same knote would be used to deliver the NOTE_EXIT
event, and would clobber the NOTE_CHILD's 'data' field).
Add a bunch of comments to explain what's going on in various critical
sections, and sprinkle additional KASSERT()s to validate assumptions
in several more locations.
2021-10-10 21:07:51 +03:00
|
|
|
}
|
2021-10-21 03:54:15 +03:00
|
|
|
mutex_spin_exit(&kq->kq_lock);
|
Changes to make EVFILT_PROC MP-safe:
Because the locking protocol around processes is somewhat complex
compared to other events that can be posted on kqueues, introduce
new functions for posting NOTE_EXEC, NOTE_EXIT, and NOTE_FORK,
rather than just using the generic knote() function. These functions
KASSERT() their locking expectations, and deal with other complexities
for each situation.
knote_proc_fork(), in particiular, needs to handle NOTE_TRACK, which
requires allocation of a new knote to attach to the child process. We
don't want to be allocating memory while holding the parent's p_lock.
Furthermore, we also have to attach the tracking note to the child
process, which means we have to acquire the child's p_lock.
So, to handle all this, we introduce some additional synchronization
infrastructure around the 'knote' structure:
- Add the ability to mark a knote as being in a state of flux. Knotes
in this state are guaranteed not to be detached/deleted, thus allowing
a code path drop other locks after putting a knote in this state.
- Code paths that wish to detach/delete a knote must first check if the
knote is in-flux. If so, they must wait for it to quiesce. Because
multiple threads of execution may attempt this concurrently, a mechanism
exists for a single LWP to claim the detach responsibility; all other
threads simply wait for the knote to disappear before they can make
further progress.
- When kqueue_scan() encounters an in-flux knote, it simply treats the
situation just like encountering another thread's queue marker -- wait
for the flux to settle and continue on.
(The "in-flux knote" idea was inspired by FreeBSD, but this works differently
from their implementation, as the two kqueue implementations have diverged
quite a bit.)
knote_proc_fork() uses this infrastructure to implement NOTE_TRACK like so:
- Attempt to put the original tracking knote into a state of flux; if this
fails (because the note has a detach pending), we skip all processing
(the original process has lost interest, and we simply won the race).
- Once the note is in-flux, drop the kq and forking process's locks, and
allocate 2 knotes: one to post the NOTE_CHILD event, and one to attach
a new NOTE_TRACK to the child process. Notably, we do NOT go through
kqueue_register() to do this, but rather do all of the work directly
and KASSERT() our assumptions; this allows us to directly control our
interaction with locks. All memory allocations here are performed with
KM_NOSLEEP, in order to prevent holding the original knote in-flux
indefinitely.
- Because the NOTE_TRACK use case adds knotes to kqueues through a
sort of back-door mechanism, we must serialize with the closing of
the destination kqueue's file descriptor, so steal another bit from
the kq_count field to notify other threads that a kqueue is on its
way out to prevent new knotes from being enqueued while the close
path detaches them.
In addition to fixing EVFILT_PROC's reliance on KERNEL_LOCK, this also
fixes a long-standing bug whereby a NOTE_CHILD event could be dropped
if the child process exited before the interested process received the
NOTE_CHILD event (the same knote would be used to deliver the NOTE_EXIT
event, and would clobber the NOTE_CHILD's 'data' field).
Add a bunch of comments to explain what's going on in various critical
sections, and sprinkle additional KASSERT()s to validate assumptions
in several more locations.
2021-10-10 21:07:51 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
mutex_exit(p->p_lock);
|
|
|
|
}
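
/*
 * Call-site sketch, for illustration only (the real call site lives
 * elsewhere in the kernel): the exec path is expected to invoke this
 * hook directly, with no locks held, since knote_proc_exec() takes
 * p->p_lock itself.
 */
#if 0
	knote_proc_exec(p);
#endif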

static int __noinline
knote_proc_fork_track(struct proc *p1, struct proc *p2, struct knote *okn)
{
	struct kqueue *kq = okn->kn_kq;

	KASSERT(mutex_owned(&kq->kq_lock));
	KASSERT(mutex_owned(p1->p_lock));

	/*
	 * We're going to put this knote into flux while we drop
	 * the locks and create and attach a new knote to track the
	 * child. If we are not able to enter flux, then this knote
	 * is about to go away, so skip the notification.
	 */
	if (!kn_enter_flux(okn)) {
		return 0;
	}

	mutex_spin_exit(&kq->kq_lock);
	mutex_exit(p1->p_lock);

	/*
	 * We actually have to register *two* new knotes:
	 *
	 * ==> One for the NOTE_CHILD notification. This is a forced
	 *     ONESHOT note.
	 *
	 * ==> One to actually track the child process as it subsequently
	 *     forks, execs, and, ultimately, exits.
	 *
	 * If we only register a single knote, then it's possible for
	 * the NOTE_CHILD and NOTE_EXIT to be collapsed into a single
	 * notification if the child exits before the tracking process
	 * has received the NOTE_CHILD notification, which applications
	 * aren't expecting (the event's 'data' field would be clobbered,
	 * for example).
	 *
	 * To do this, what we have here is an **extremely** stripped-down
	 * version of kqueue_register() that has the following properties:
	 *
	 * ==> Does not block to allocate memory. If we are unable
	 *     to allocate memory, we return ENOMEM.
	 *
	 * ==> Does not search for existing knotes; we know there
	 *     are not any because this is a new process that isn't
	 *     even visible to other processes yet.
	 *
	 * ==> Assumes that the knhash for our kq's descriptor table
	 *     already exists (after all, we're already tracking
	 *     processes with knotes if we got here).
	 *
	 * ==> Directly attaches the new tracking knote to the child
	 *     process.
	 *
	 * The whole point is to do the minimum amount of work while the
	 * knote is held in-flux, and to avoid doing extra work in general
	 * (we already have the new child process; why bother looking it
	 * up again?).
	 */
|
|
|
|
filedesc_t *fdp = kq->kq_fdp;
|
|
|
|
struct knote *knchild, *kntrack;
|
|
|
|
int error = 0;
|
|
|
|
|
2022-07-13 06:23:07 +03:00
|
|
|
knchild = knote_alloc(false);
|
|
|
|
kntrack = knote_alloc(false);
|
Changes to make EVFILT_PROC MP-safe:
Because the locking protocol around processes is somewhat complex
compared to other events that can be posted on kqueues, introduce
new functions for posting NOTE_EXEC, NOTE_EXIT, and NOTE_FORK,
rather than just using the generic knote() function. These functions
KASSERT() their locking expectations, and deal with other complexities
for each situation.
knote_proc_fork(), in particiular, needs to handle NOTE_TRACK, which
requires allocation of a new knote to attach to the child process. We
don't want to be allocating memory while holding the parent's p_lock.
Furthermore, we also have to attach the tracking note to the child
process, which means we have to acquire the child's p_lock.
So, to handle all this, we introduce some additional synchronization
infrastructure around the 'knote' structure:
- Add the ability to mark a knote as being in a state of flux. Knotes
in this state are guaranteed not to be detached/deleted, thus allowing
a code path drop other locks after putting a knote in this state.
- Code paths that wish to detach/delete a knote must first check if the
knote is in-flux. If so, they must wait for it to quiesce. Because
multiple threads of execution may attempt this concurrently, a mechanism
exists for a single LWP to claim the detach responsibility; all other
threads simply wait for the knote to disappear before they can make
further progress.
- When kqueue_scan() encounters an in-flux knote, it simply treats the
situation just like encountering another thread's queue marker -- wait
for the flux to settle and continue on.
(The "in-flux knote" idea was inspired by FreeBSD, but this works differently
from their implementation, as the two kqueue implementations have diverged
quite a bit.)
knote_proc_fork() uses this infrastructure to implement NOTE_TRACK like so:
- Attempt to put the original tracking knote into a state of flux; if this
fails (because the note has a detach pending), we skip all processing
(the original process has lost interest, and we simply won the race).
- Once the note is in-flux, drop the kq and forking process's locks, and
allocate 2 knotes: one to post the NOTE_CHILD event, and one to attach
a new NOTE_TRACK to the child process. Notably, we do NOT go through
kqueue_register() to do this, but rather do all of the work directly
and KASSERT() our assumptions; this allows us to directly control our
interaction with locks. All memory allocations here are performed with
KM_NOSLEEP, in order to prevent holding the original knote in-flux
indefinitely.
- Because the NOTE_TRACK use case adds knotes to kqueues through a
sort of back-door mechanism, we must serialize with the closing of
the destination kqueue's file descriptor, so steal another bit from
the kq_count field to notify other threads that a kqueue is on its
way out to prevent new knotes from being enqueued while the close
path detaches them.
In addition to fixing EVFILT_PROC's reliance on KERNEL_LOCK, this also
fixes a long-standing bug whereby a NOTE_CHILD event could be dropped
if the child process exited before the interested process received the
NOTE_CHILD event (the same knote would be used to deliver the NOTE_EXIT
event, and would clobber the NOTE_CHILD's 'data' field).
Add a bunch of comments to explain what's going on in various critical
sections, and sprinkle additional KASSERT()s to validate assumptions
in several more locations.
2021-10-10 21:07:51 +03:00
|
|
|
if (__predict_false(knchild == NULL || kntrack == NULL)) {
|
|
|
|
error = ENOMEM;
|
|
|
|
goto out;
|
2002-10-23 13:10:23 +04:00
|
|
|
}
|
|
|
|
|
Changes to make EVFILT_PROC MP-safe:
Because the locking protocol around processes is somewhat complex
compared to other events that can be posted on kqueues, introduce
new functions for posting NOTE_EXEC, NOTE_EXIT, and NOTE_FORK,
rather than just using the generic knote() function. These functions
KASSERT() their locking expectations, and deal with other complexities
for each situation.
knote_proc_fork(), in particiular, needs to handle NOTE_TRACK, which
requires allocation of a new knote to attach to the child process. We
don't want to be allocating memory while holding the parent's p_lock.
Furthermore, we also have to attach the tracking note to the child
process, which means we have to acquire the child's p_lock.
So, to handle all this, we introduce some additional synchronization
infrastructure around the 'knote' structure:
- Add the ability to mark a knote as being in a state of flux. Knotes
in this state are guaranteed not to be detached/deleted, thus allowing
a code path drop other locks after putting a knote in this state.
- Code paths that wish to detach/delete a knote must first check if the
knote is in-flux. If so, they must wait for it to quiesce. Because
multiple threads of execution may attempt this concurrently, a mechanism
exists for a single LWP to claim the detach responsibility; all other
threads simply wait for the knote to disappear before they can make
further progress.
- When kqueue_scan() encounters an in-flux knote, it simply treats the
situation just like encountering another thread's queue marker -- wait
for the flux to settle and continue on.
(The "in-flux knote" idea was inspired by FreeBSD, but this works differently
from their implementation, as the two kqueue implementations have diverged
quite a bit.)
knote_proc_fork() uses this infrastructure to implement NOTE_TRACK like so:
- Attempt to put the original tracking knote into a state of flux; if this
fails (because the note has a detach pending), we skip all processing
(the original process has lost interest, and we simply won the race).
- Once the note is in-flux, drop the kq and forking process's locks, and
allocate 2 knotes: one to post the NOTE_CHILD event, and one to attach
a new NOTE_TRACK to the child process. Notably, we do NOT go through
kqueue_register() to do this, but rather do all of the work directly
and KASSERT() our assumptions; this allows us to directly control our
interaction with locks. All memory allocations here are performed with
KM_NOSLEEP, in order to prevent holding the original knote in-flux
indefinitely.
- Because the NOTE_TRACK use case adds knotes to kqueues through a
sort of back-door mechanism, we must serialize with the closing of
the destination kqueue's file descriptor, so steal another bit from
the kq_count field to notify other threads that a kqueue is on its
way out to prevent new knotes from being enqueued while the close
path detaches them.
In addition to fixing EVFILT_PROC's reliance on KERNEL_LOCK, this also
fixes a long-standing bug whereby a NOTE_CHILD event could be dropped
if the child process exited before the interested process received the
NOTE_CHILD event (the same knote would be used to deliver the NOTE_EXIT
event, and would clobber the NOTE_CHILD's 'data' field).
Add a bunch of comments to explain what's going on in various critical
sections, and sprinkle additional KASSERT()s to validate assumptions
in several more locations.
2021-10-10 21:07:51 +03:00
|
|
|
kntrack->kn_obj = p2;
|
|
|
|
kntrack->kn_id = p2->p_pid;
|
|
|
|
kntrack->kn_kq = kq;
|
|
|
|
kntrack->kn_fop = okn->kn_fop;
|
|
|
|
kntrack->kn_kfilter = okn->kn_kfilter;
|
|
|
|
kntrack->kn_sfflags = okn->kn_sfflags;
|
|
|
|
kntrack->kn_sdata = p1->p_pid;
|
|
|
|
|
|
|
|
kntrack->kn_kevent.ident = p2->p_pid;
|
|
|
|
kntrack->kn_kevent.filter = okn->kn_filter;
|
|
|
|
kntrack->kn_kevent.flags =
|
|
|
|
okn->kn_flags | EV_ADD | EV_ENABLE | EV_CLEAR;
|
|
|
|
kntrack->kn_kevent.fflags = 0;
|
|
|
|
kntrack->kn_kevent.data = 0;
|
|
|
|
kntrack->kn_kevent.udata = okn->kn_kevent.udata; /* preserve udata */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The child note does not need to be attached to the
|
|
|
|
* new proc's klist at all.
|
|
|
|
*/
|
|
|
|
*knchild = *kntrack;
|
|
|
|
knchild->kn_status = KN_DETACHED;
|
|
|
|
knchild->kn_sfflags = 0;
|
|
|
|
knchild->kn_kevent.flags |= EV_ONESHOT;
|
|
|
|
knchild->kn_kevent.fflags = NOTE_CHILD;
|
|
|
|
knchild->kn_kevent.data = p1->p_pid; /* parent */
|
|
|
|
|
|
|
|
mutex_enter(&fdp->fd_lock);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We need to check to see if the kq is closing, and skip
|
|
|
|
* attaching the knote if so. Normally, this isn't necessary
|
|
|
|
* when coming in the front door because the file descriptor
|
|
|
|
* layer will synchronize this.
|
|
|
|
*
|
|
|
|
* It's safe to test KQ_CLOSING without taking the kq_lock
|
|
|
|
* here because that flag is only ever set when the fd_lock
|
|
|
|
* is also held.
|
|
|
|
*/
|
|
|
|
if (__predict_false(kq->kq_count & KQ_CLOSING)) {
|
|
|
|
mutex_exit(&fdp->fd_lock);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We do the "insert into FD table" and "attach to klist" steps
|
|
|
|
* in the opposite order of kqueue_register() here to avoid
|
|
|
|
* having to take p2->p_lock twice. But this is OK because we
|
|
|
|
* hold fd_lock across the entire operation.
|
|
|
|
*/
|
|
|
|
|
|
|
|
mutex_enter(p2->p_lock);
|
|
|
|
error = kauth_authorize_process(curlwp->l_cred,
|
|
|
|
KAUTH_PROCESS_KEVENT_FILTER, p2, NULL, NULL, NULL);
|
|
|
|
if (__predict_false(error != 0)) {
|
|
|
|
mutex_exit(p2->p_lock);
|
|
|
|
mutex_exit(&fdp->fd_lock);
|
|
|
|
error = EACCES;
|
|
|
|
goto out;
|
|
|
|
}
|
2022-02-12 18:51:28 +03:00
|
|
|
klist_insert(&p2->p_klist, kntrack);
|
Changes to make EVFILT_PROC MP-safe:
Because the locking protocol around processes is somewhat complex
compared to other events that can be posted on kqueues, introduce
new functions for posting NOTE_EXEC, NOTE_EXIT, and NOTE_FORK,
rather than just using the generic knote() function. These functions
KASSERT() their locking expectations, and deal with other complexities
for each situation.
knote_proc_fork(), in particiular, needs to handle NOTE_TRACK, which
requires allocation of a new knote to attach to the child process. We
don't want to be allocating memory while holding the parent's p_lock.
Furthermore, we also have to attach the tracking note to the child
process, which means we have to acquire the child's p_lock.
So, to handle all this, we introduce some additional synchronization
infrastructure around the 'knote' structure:
- Add the ability to mark a knote as being in a state of flux. Knotes
in this state are guaranteed not to be detached/deleted, thus allowing
a code path drop other locks after putting a knote in this state.
- Code paths that wish to detach/delete a knote must first check if the
knote is in-flux. If so, they must wait for it to quiesce. Because
multiple threads of execution may attempt this concurrently, a mechanism
exists for a single LWP to claim the detach responsibility; all other
threads simply wait for the knote to disappear before they can make
further progress.
- When kqueue_scan() encounters an in-flux knote, it simply treats the
situation just like encountering another thread's queue marker -- wait
for the flux to settle and continue on.
(The "in-flux knote" idea was inspired by FreeBSD, but this works differently
from their implementation, as the two kqueue implementations have diverged
quite a bit.)
knote_proc_fork() uses this infrastructure to implement NOTE_TRACK like so:
- Attempt to put the original tracking knote into a state of flux; if this
fails (because the note has a detach pending), we skip all processing
(the original process has lost interest, and we simply won the race).
- Once the note is in-flux, drop the kq and forking process's locks, and
allocate 2 knotes: one to post the NOTE_CHILD event, and one to attach
a new NOTE_TRACK to the child process. Notably, we do NOT go through
kqueue_register() to do this, but rather do all of the work directly
and KASSERT() our assumptions; this allows us to directly control our
interaction with locks. All memory allocations here are performed with
KM_NOSLEEP, in order to prevent holding the original knote in-flux
indefinitely.
- Because the NOTE_TRACK use case adds knotes to kqueues through a
sort of back-door mechanism, we must serialize with the closing of
the destination kqueue's file descriptor, so steal another bit from
the kq_count field to notify other threads that a kqueue is on its
way out to prevent new knotes from being enqueued while the close
path detaches them.
In addition to fixing EVFILT_PROC's reliance on KERNEL_LOCK, this also
fixes a long-standing bug whereby a NOTE_CHILD event could be dropped
if the child process exited before the interested process received the
NOTE_CHILD event (the same knote would be used to deliver the NOTE_EXIT
event, and would clobber the NOTE_CHILD's 'data' field).
Add a bunch of comments to explain what's going on in various critical
sections, and sprinkle additional KASSERT()s to validate assumptions
in several more locations.
2021-10-10 21:07:51 +03:00
|
|
|
mutex_exit(p2->p_lock);
|
|
|
|
|
|
|
|
KASSERT(fdp->fd_knhashmask != 0);
|
|
|
|
KASSERT(fdp->fd_knhash != NULL);
|
|
|
|
struct klist *list = &fdp->fd_knhash[KN_HASH(kntrack->kn_id,
|
|
|
|
fdp->fd_knhashmask)];
|
|
|
|
SLIST_INSERT_HEAD(list, kntrack, kn_link);
|
|
|
|
SLIST_INSERT_HEAD(list, knchild, kn_link);
|
|
|
|
|
|
|
|
/* This adds references for knchild *and* kntrack. */
|
|
|
|
atomic_add_int(&kntrack->kn_kfilter->refcnt, 2);
|
|
|
|
|
|
|
|
knote_activate(knchild);
|
|
|
|
|
|
|
|
kntrack = NULL;
|
|
|
|
knchild = NULL;
|
|
|
|
|
|
|
|
mutex_exit(&fdp->fd_lock);
|
|
|
|
|
|
|
|
out:
|
|
|
|
if (__predict_false(knchild != NULL)) {
|
2022-07-13 06:23:07 +03:00
|
|
|
knote_free(knchild);
|
Changes to make EVFILT_PROC MP-safe:
Because the locking protocol around processes is somewhat complex
compared to other events that can be posted on kqueues, introduce
new functions for posting NOTE_EXEC, NOTE_EXIT, and NOTE_FORK,
rather than just using the generic knote() function. These functions
KASSERT() their locking expectations, and deal with other complexities
for each situation.
knote_proc_fork(), in particiular, needs to handle NOTE_TRACK, which
requires allocation of a new knote to attach to the child process. We
don't want to be allocating memory while holding the parent's p_lock.
Furthermore, we also have to attach the tracking note to the child
process, which means we have to acquire the child's p_lock.
So, to handle all this, we introduce some additional synchronization
infrastructure around the 'knote' structure:
- Add the ability to mark a knote as being in a state of flux. Knotes
in this state are guaranteed not to be detached/deleted, thus allowing
a code path drop other locks after putting a knote in this state.
- Code paths that wish to detach/delete a knote must first check if the
knote is in-flux. If so, they must wait for it to quiesce. Because
multiple threads of execution may attempt this concurrently, a mechanism
exists for a single LWP to claim the detach responsibility; all other
threads simply wait for the knote to disappear before they can make
further progress.
- When kqueue_scan() encounters an in-flux knote, it simply treats the
situation just like encountering another thread's queue marker -- wait
for the flux to settle and continue on.
(The "in-flux knote" idea was inspired by FreeBSD, but this works differently
from their implementation, as the two kqueue implementations have diverged
quite a bit.)
knote_proc_fork() uses this infrastructure to implement NOTE_TRACK like so:
- Attempt to put the original tracking knote into a state of flux; if this
fails (because the note has a detach pending), we skip all processing
(the original process has lost interest, and we simply won the race).
- Once the note is in-flux, drop the kq and forking process's locks, and
allocate 2 knotes: one to post the NOTE_CHILD event, and one to attach
a new NOTE_TRACK to the child process. Notably, we do NOT go through
kqueue_register() to do this, but rather do all of the work directly
and KASSERT() our assumptions; this allows us to directly control our
interaction with locks. All memory allocations here are performed with
KM_NOSLEEP, in order to prevent holding the original knote in-flux
indefinitely.
- Because the NOTE_TRACK use case adds knotes to kqueues through a
sort of back-door mechanism, we must serialize with the closing of
the destination kqueue's file descriptor, so steal another bit from
the kq_count field to notify other threads that a kqueue is on its
way out to prevent new knotes from being enqueued while the close
path detaches them.
In addition to fixing EVFILT_PROC's reliance on KERNEL_LOCK, this also
fixes a long-standing bug whereby a NOTE_CHILD event could be dropped
if the child process exited before the interested process received the
NOTE_CHILD event (the same knote would be used to deliver the NOTE_EXIT
event, and would clobber the NOTE_CHILD's 'data' field).
Add a bunch of comments to explain what's going on in various critical
sections, and sprinkle additional KASSERT()s to validate assumptions
in several more locations.
2021-10-10 21:07:51 +03:00
|
|
|
}
|
|
|
|
if (__predict_false(kntrack != NULL)) {
|
2022-07-13 06:23:07 +03:00
|
|
|
knote_free(kntrack);
|
Changes to make EVFILT_PROC MP-safe:
Because the locking protocol around processes is somewhat complex
compared to other events that can be posted on kqueues, introduce
new functions for posting NOTE_EXEC, NOTE_EXIT, and NOTE_FORK,
rather than just using the generic knote() function. These functions
KASSERT() their locking expectations, and deal with other complexities
for each situation.
knote_proc_fork(), in particiular, needs to handle NOTE_TRACK, which
requires allocation of a new knote to attach to the child process. We
don't want to be allocating memory while holding the parent's p_lock.
Furthermore, we also have to attach the tracking note to the child
process, which means we have to acquire the child's p_lock.
So, to handle all this, we introduce some additional synchronization
infrastructure around the 'knote' structure:
- Add the ability to mark a knote as being in a state of flux. Knotes
in this state are guaranteed not to be detached/deleted, thus allowing
a code path drop other locks after putting a knote in this state.
- Code paths that wish to detach/delete a knote must first check if the
knote is in-flux. If so, they must wait for it to quiesce. Because
multiple threads of execution may attempt this concurrently, a mechanism
exists for a single LWP to claim the detach responsibility; all other
threads simply wait for the knote to disappear before they can make
further progress.
- When kqueue_scan() encounters an in-flux knote, it simply treats the
situation just like encountering another thread's queue marker -- wait
for the flux to settle and continue on.
(The "in-flux knote" idea was inspired by FreeBSD, but this works differently
from their implementation, as the two kqueue implementations have diverged
quite a bit.)
knote_proc_fork() uses this infrastructure to implement NOTE_TRACK like so:
- Attempt to put the original tracking knote into a state of flux; if this
fails (because the note has a detach pending), we skip all processing
(the original process has lost interest, and we simply won the race).
- Once the note is in-flux, drop the kq and forking process's locks, and
allocate 2 knotes: one to post the NOTE_CHILD event, and one to attach
a new NOTE_TRACK to the child process. Notably, we do NOT go through
kqueue_register() to do this, but rather do all of the work directly
and KASSERT() our assumptions; this allows us to directly control our
interaction with locks. All memory allocations here are performed with
KM_NOSLEEP, in order to prevent holding the original knote in-flux
indefinitely.
- Because the NOTE_TRACK use case adds knotes to kqueues through a
sort of back-door mechanism, we must serialize with the closing of
the destination kqueue's file descriptor, so steal another bit from
the kq_count field to notify other threads that a kqueue is on its
way out to prevent new knotes from being enqueued while the close
path detaches them.
In addition to fixing EVFILT_PROC's reliance on KERNEL_LOCK, this also
fixes a long-standing bug whereby a NOTE_CHILD event could be dropped
if the child process exited before the interested process received the
NOTE_CHILD event (the same knote would be used to deliver the NOTE_EXIT
event, and would clobber the NOTE_CHILD's 'data' field).
Add a bunch of comments to explain what's going on in various critical
sections, and sprinkle additional KASSERT()s to validate assumptions
in several more locations.
2021-10-10 21:07:51 +03:00
|
|
|
}
|
|
|
|
mutex_enter(p1->p_lock);
|
2008-03-22 00:53:35 +03:00
|
|
|
mutex_spin_enter(&kq->kq_lock);
|
Changes to make EVFILT_PROC MP-safe:
Because the locking protocol around processes is somewhat complex
compared to other events that can be posted on kqueues, introduce
new functions for posting NOTE_EXEC, NOTE_EXIT, and NOTE_FORK,
rather than just using the generic knote() function. These functions
KASSERT() their locking expectations, and deal with other complexities
for each situation.
knote_proc_fork(), in particiular, needs to handle NOTE_TRACK, which
requires allocation of a new knote to attach to the child process. We
don't want to be allocating memory while holding the parent's p_lock.
Furthermore, we also have to attach the tracking note to the child
process, which means we have to acquire the child's p_lock.
So, to handle all this, we introduce some additional synchronization
infrastructure around the 'knote' structure:
- Add the ability to mark a knote as being in a state of flux. Knotes
in this state are guaranteed not to be detached/deleted, thus allowing
a code path drop other locks after putting a knote in this state.
- Code paths that wish to detach/delete a knote must first check if the
knote is in-flux. If so, they must wait for it to quiesce. Because
multiple threads of execution may attempt this concurrently, a mechanism
exists for a single LWP to claim the detach responsibility; all other
threads simply wait for the knote to disappear before they can make
further progress.
- When kqueue_scan() encounters an in-flux knote, it simply treats the
situation just like encountering another thread's queue marker -- wait
for the flux to settle and continue on.
(The "in-flux knote" idea was inspired by FreeBSD, but this works differently
from their implementation, as the two kqueue implementations have diverged
quite a bit.)
knote_proc_fork() uses this infrastructure to implement NOTE_TRACK like so:
- Attempt to put the original tracking knote into a state of flux; if this
fails (because the note has a detach pending), we skip all processing
(the original process has lost interest, and we simply won the race).
- Once the note is in-flux, drop the kq and forking process's locks, and
allocate 2 knotes: one to post the NOTE_CHILD event, and one to attach
a new NOTE_TRACK to the child process. Notably, we do NOT go through
kqueue_register() to do this, but rather do all of the work directly
and KASSERT() our assumptions; this allows us to directly control our
interaction with locks. All memory allocations here are performed with
KM_NOSLEEP, in order to prevent holding the original knote in-flux
indefinitely.
- Because the NOTE_TRACK use case adds knotes to kqueues through a
sort of back-door mechanism, we must serialize with the closing of
the destination kqueue's file descriptor, so steal another bit from
the kq_count field to notify other threads that a kqueue is on its
way out to prevent new knotes from being enqueued while the close
path detaches them.
In addition to fixing EVFILT_PROC's reliance on KERNEL_LOCK, this also
fixes a long-standing bug whereby a NOTE_CHILD event could be dropped
if the child process exited before the interested process received the
NOTE_CHILD event (the same knote would be used to deliver the NOTE_EXIT
event, and would clobber the NOTE_CHILD's 'data' field).
Add a bunch of comments to explain what's going on in various critical
sections, and sprinkle additional KASSERT()s to validate assumptions
in several more locations.
2021-10-10 21:07:51 +03:00
|
|
|
|
|
|
|
if (kn_leave_flux(okn)) {
|
|
|
|
KQ_FLUX_WAKEUP(kq);
|
|
|
|
}
|
|
|
|
|
|
|
|
return error;
|
|
|
|
}
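
/*
 * Illustrative sketch (not part of this file): how a userland program
 * might exercise the NOTE_TRACK path above.  Registering EVFILT_PROC
 * with NOTE_TRACK on a parent pid causes knote_proc_fork_track() to
 * deliver a NOTE_CHILD event for each child the parent forks, with the
 * child pid in ev.ident and the parent pid in ev.data.  The function
 * and variable names below are hypothetical.
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <err.h>
#include <stdio.h>

static void
track_forks(pid_t parent)
{
	struct kevent kev, ev;
	int kq = kqueue();

	if (kq == -1)
		err(1, "kqueue");

	/* Watch the parent; NOTE_TRACK extends the watch to children. */
	EV_SET(&kev, parent, EVFILT_PROC, EV_ADD | EV_ENABLE,
	    NOTE_FORK | NOTE_TRACK, 0, 0);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent: register");

	for (;;) {
		if (kevent(kq, NULL, 0, &ev, 1, NULL) == -1)
			err(1, "kevent: wait");
		if (ev.fflags & NOTE_CHILD) {
			/* ev.ident is the child; ev.data is the parent. */
			printf("child %ld of parent %ld\n",
			    (long)ev.ident, (long)ev.data);
		}
		if (ev.fflags & NOTE_TRACKERR)
			warnx("NOTE_TRACK attach failed (out of memory)");
	}
}
#endif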

void
knote_proc_fork(struct proc *p1, struct proc *p2)
{
	struct knote *kn;
	struct kqueue *kq;
	uint32_t fflags;

	mutex_enter(p1->p_lock);

	/*
	 * N.B. We DO NOT use SLIST_FOREACH_SAFE() here because we
	 * don't want to pre-fetch the next knote; in the event we
	 * have to drop p_lock, we will have put the knote in-flux,
	 * meaning that no one will be able to detach it until we
	 * have taken the knote out of flux.  However, that does
	 * NOT stop someone else from detaching the next note in the
	 * list while we have it unlocked.  Thus, we want to fetch
	 * the next note in the list only after we have re-acquired
	 * the lock, and using SLIST_FOREACH() will satisfy that.
	 */
	SLIST_FOREACH(kn, &p1->p_klist, kn_selnext) {
		/* N.B. EVFILT_SIGNAL knotes are on this same list. */
		if (kn->kn_fop == &sig_filtops) {
			continue;
		}
		KASSERT(kn->kn_fop == &proc_filtops);

		kq = kn->kn_kq;
		mutex_spin_enter(&kq->kq_lock);
		kn->kn_fflags |= (kn->kn_sfflags & NOTE_FORK);
		if (__predict_false(kn->kn_sfflags & NOTE_TRACK)) {
			/*
			 * This will drop kq_lock and p_lock and
			 * re-acquire them before it returns.
			 */
			if (knote_proc_fork_track(p1, p2, kn)) {
				kn->kn_fflags |= NOTE_TRACKERR;
			}
			KASSERT(mutex_owned(p1->p_lock));
			KASSERT(mutex_owned(&kq->kq_lock));
		}
		fflags = kn->kn_fflags;
		if (fflags) {
			knote_activate_locked(kn);
		}
		mutex_spin_exit(&kq->kq_lock);
	}

	mutex_exit(p1->p_lock);
}

void
knote_proc_exit(struct proc *p)
{
	struct knote *kn;
	struct kqueue *kq;

	KASSERT(mutex_owned(p->p_lock));

	while (!SLIST_EMPTY(&p->p_klist)) {
		kn = SLIST_FIRST(&p->p_klist);
		kq = kn->kn_kq;

		KASSERT(kn->kn_obj == p);

		mutex_spin_enter(&kq->kq_lock);
		kn->kn_data = P_WAITSTATUS(p);
		/*
		 * Mark as ONESHOT, so that the knote is g/c'ed
		 * when read.
		 */
		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
		kn->kn_fflags |= kn->kn_sfflags & NOTE_EXIT;

		/*
		 * Detach the knote from the process and mark it as such.
		 * N.B. EVFILT_SIGNAL knotes are also on p_klist, but by
		 * the time we get here, all open file descriptors for this
		 * process have been released, meaning that signal knotes
		 * will have already been detached.
		 *
		 * We need to synchronize this with filt_procdetach().
		 */
		KASSERT(kn->kn_fop == &proc_filtops);
		if ((kn->kn_status & KN_DETACHED) == 0) {
			kn->kn_status |= KN_DETACHED;
			SLIST_REMOVE_HEAD(&p->p_klist, kn_selnext);
		}

		/*
		 * Always activate the knote for NOTE_EXIT regardless
		 * of whether or not the listener cares about it.
		 * This matches historical behavior.
		 */
		knote_activate_locked(kn);
		mutex_spin_exit(&kq->kq_lock);
	}
}
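
/*
 * Illustrative sketch (not part of this file): a userland consumer of
 * the NOTE_EXIT path above.  knote_proc_exit() stores the wait(2)-style
 * status in kn_data, so it arrives in ev.data along with
 * EV_EOF | EV_ONESHOT.  The function name below is hypothetical.
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <sys/wait.h>
#include <err.h>
#include <stdio.h>

static void
wait_for_exit(pid_t pid)
{
	struct kevent kev, ev;
	int kq = kqueue();

	if (kq == -1)
		err(1, "kqueue");

	EV_SET(&kev, pid, EVFILT_PROC, EV_ADD, NOTE_EXIT, 0, 0);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent: register");
	if (kevent(kq, NULL, 0, &ev, 1, NULL) == -1)
		err(1, "kevent: wait");

	/* ev.data carries the status that wait(2) would have returned. */
	if (WIFEXITED((int)ev.data))
		printf("pid %ld exited with status %d\n",
		    (long)pid, WEXITSTATUS((int)ev.data));
}
#endif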

#define	FILT_TIMER_NOSCHED	((uintptr_t)-1)

static int
filt_timercompute(struct kevent *kev, uintptr_t *tticksp)
{
	struct timespec ts;
	uintptr_t tticks;

	if (kev->fflags & ~(NOTE_TIMER_UNITMASK | NOTE_ABSTIME)) {
		return EINVAL;
	}

	/*
	 * Convert the event 'data' to a timespec, then convert the
	 * timespec to callout ticks.
	 */
	switch (kev->fflags & NOTE_TIMER_UNITMASK) {
	case NOTE_SECONDS:
		ts.tv_sec = kev->data;
		ts.tv_nsec = 0;
		break;

	case NOTE_MSECONDS:		/* == historical value 0 */
		ts.tv_sec = kev->data / 1000;
		ts.tv_nsec = (kev->data % 1000) * 1000000;
		break;

	case NOTE_USECONDS:
		ts.tv_sec = kev->data / 1000000;
		ts.tv_nsec = (kev->data % 1000000) * 1000;
		break;

	case NOTE_NSECONDS:
		ts.tv_sec = kev->data / 1000000000;
		ts.tv_nsec = kev->data % 1000000000;
		break;

	default:
		return EINVAL;
	}

	if (kev->fflags & NOTE_ABSTIME) {
		struct timespec deadline = ts;

		/*
		 * Get current time.
		 *
		 * XXX This is CLOCK_REALTIME.  There is no way to
		 * XXX specify CLOCK_MONOTONIC.
		 */
		nanotime(&ts);

		/* Absolute timers do not repeat. */
		kev->data = FILT_TIMER_NOSCHED;

		/* If we're past the deadline, then the event will fire. */
		if (timespeccmp(&deadline, &ts, <=)) {
			tticks = FILT_TIMER_NOSCHED;
			goto out;
		}

		/* Calculate how much time is left. */
		timespecsub(&deadline, &ts, &ts);
	} else {
		/* EV_CLEAR automatically set for relative timers. */
		kev->flags |= EV_CLEAR;
	}

	tticks = tstohz(&ts);

	/* if the supplied value is under our resolution, use 1 tick */
	if (tticks == 0) {
		if (kev->data == 0)
			return EINVAL;
		tticks = 1;
	} else if (tticks > INT_MAX) {
		return EINVAL;
	}

	if ((kev->flags & EV_ONESHOT) != 0) {
		/* Timer does not repeat. */
		kev->data = FILT_TIMER_NOSCHED;
	} else {
		KASSERT((uintptr_t)tticks != FILT_TIMER_NOSCHED);
		kev->data = tticks;
	}

 out:
	*tticksp = tticks;

	return 0;
}
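
/*
 * Illustrative sketch (not part of this file): the kevent(2) requests
 * that filt_timercompute() parses.  fflags selects the unit for 'data'
 * (NOTE_MSECONDS is the historical default of 0), and NOTE_ABSTIME
 * turns 'data' into a CLOCK_REALTIME deadline instead of a repeating
 * interval.  The function name below is hypothetical.
 */
#if 0
#include <sys/event.h>
#include <time.h>

static void
arm_timers(int kq)
{
	struct kevent kev[2];
	struct timespec now;

	/* Timer 1: fires every 500 ms (fflags 0 == NOTE_MSECONDS). */
	EV_SET(&kev[0], 1, EVFILT_TIMER, EV_ADD, 0, 500, 0);

	/* Timer 2: one shot, 5 seconds from now on the realtime clock. */
	clock_gettime(CLOCK_REALTIME, &now);
	EV_SET(&kev[1], 2, EVFILT_TIMER, EV_ADD,
	    NOTE_SECONDS | NOTE_ABSTIME, now.tv_sec + 5, 0);

	(void)kevent(kq, kev, 2, NULL, 0, NULL);
}
#endif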

static void
filt_timerexpire(void *knx)
{
	struct knote *kn = knx;
	struct kqueue *kq = kn->kn_kq;

	mutex_spin_enter(&kq->kq_lock);
	kn->kn_data++;
	knote_activate_locked(kn);
	if (kn->kn_sdata != FILT_TIMER_NOSCHED) {
		KASSERT(kn->kn_sdata > 0);
		KASSERT(kn->kn_sdata <= INT_MAX);
		callout_schedule((callout_t *)kn->kn_hook,
		    (int)kn->kn_sdata);
	}
	mutex_spin_exit(&kq->kq_lock);
}

static inline void
filt_timerstart(struct knote *kn, uintptr_t tticks)
{
	callout_t *calloutp = kn->kn_hook;

	KASSERT(mutex_owned(&kn->kn_kq->kq_lock));
	KASSERT(!callout_pending(calloutp));

	if (__predict_false(tticks == FILT_TIMER_NOSCHED)) {
		kn->kn_data = 1;
	} else {
		KASSERT(tticks <= INT_MAX);
		callout_reset(calloutp, (int)tticks, filt_timerexpire, kn);
	}
}

static int
filt_timerattach(struct knote *kn)
{
	callout_t *calloutp;
	struct kqueue *kq;
	uintptr_t tticks;
	int error;

	struct kevent kev = {
		.flags = kn->kn_flags,
		.fflags = kn->kn_sfflags,
		.data = kn->kn_sdata,
	};

	error = filt_timercompute(&kev, &tticks);
	if (error) {
		return error;
	}

	if (atomic_inc_uint_nv(&kq_ncallouts) >= kq_calloutmax ||
	    (calloutp = kmem_alloc(sizeof(*calloutp), KM_NOSLEEP)) == NULL) {
		atomic_dec_uint(&kq_ncallouts);
		return ENOMEM;
	}
	callout_init(calloutp, CALLOUT_MPSAFE);

	kq = kn->kn_kq;
	mutex_spin_enter(&kq->kq_lock);

	kn->kn_sdata = kev.data;
	kn->kn_flags = kev.flags;
	KASSERT(kn->kn_sfflags == kev.fflags);
	kn->kn_hook = calloutp;

	filt_timerstart(kn, tticks);

	mutex_spin_exit(&kq->kq_lock);

	return (0);
}

static void
filt_timerdetach(struct knote *kn)
{
	callout_t *calloutp;
	struct kqueue *kq = kn->kn_kq;

	/* prevent rescheduling when we expire */
	mutex_spin_enter(&kq->kq_lock);
	kn->kn_sdata = FILT_TIMER_NOSCHED;
	mutex_spin_exit(&kq->kq_lock);

	calloutp = (callout_t *)kn->kn_hook;

	/*
	 * Attempt to stop the callout.  This will block if it's
	 * already running.
	 */
	callout_halt(calloutp, NULL);

	callout_destroy(calloutp);
	kmem_free(calloutp, sizeof(*calloutp));
	atomic_dec_uint(&kq_ncallouts);
}

static int
filt_timertouch(struct knote *kn, struct kevent *kev, long type)
{
	struct kqueue *kq = kn->kn_kq;
	callout_t *calloutp;
	uintptr_t tticks;
	int error;

	KASSERT(mutex_owned(&kq->kq_lock));

	switch (type) {
	case EVENT_REGISTER:
		/* Only relevant for EV_ADD. */
		if ((kev->flags & EV_ADD) == 0) {
			return 0;
		}

		/*
		 * Stop the timer, under the assumption that if
		 * an application is re-configuring the timer,
		 * they no longer care about the old one.  We
		 * can safely drop the kq_lock while we wait
		 * because fdp->fd_lock will be held throughout,
		 * ensuring that no one can sneak in with an
		 * EV_DELETE or close the kq.
		 */
		KASSERT(mutex_owned(&kq->kq_fdp->fd_lock));

		calloutp = kn->kn_hook;
		callout_halt(calloutp, &kq->kq_lock);
		KASSERT(mutex_owned(&kq->kq_lock));
		knote_deactivate_locked(kn);
		kn->kn_data = 0;

		error = filt_timercompute(kev, &tticks);
		if (error) {
			return error;
		}
		kn->kn_sdata = kev->data;
		kn->kn_flags = kev->flags;
		kn->kn_sfflags = kev->fflags;
		filt_timerstart(kn, tticks);
		break;

	case EVENT_PROCESS:
		*kev = kn->kn_kevent;
		break;

	default:
		panic("%s: invalid type (%ld)", __func__, type);
	}

	return 0;
}
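
/*
 * Illustrative sketch (not part of this file): the re-configuration
 * case handled by filt_timertouch() above.  Re-issuing EV_ADD with the
 * same ident does not create a second timer; the EVENT_REGISTER path
 * halts the pending callout, clears the accumulated count, and re-arms
 * with the new period.  The function name below is hypothetical.
 */
#if 0
#include <sys/event.h>

static void
rearm_timer(int kq)
{
	struct kevent kev;

	/* Initially: fire every 100 ms. */
	EV_SET(&kev, 1, EVFILT_TIMER, EV_ADD, 0, 100, 0);
	(void)kevent(kq, &kev, 1, NULL, 0, NULL);

	/* Later: same ident, new period; handled by EVENT_REGISTER. */
	EV_SET(&kev, 1, EVFILT_TIMER, EV_ADD, 0, 1000, 0);
	(void)kevent(kq, &kev, 1, NULL, 0, NULL);
}
#endif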
|
|
|
|
|
2003-02-04 12:02:04 +03:00
|
|
|
static int
|
2006-11-01 13:17:58 +03:00
|
|
|
filt_timer(struct knote *kn, long hint)
|
2003-02-04 12:02:04 +03:00
|
|
|
{
|
2021-10-21 03:54:15 +03:00
|
|
|
struct kqueue *kq = kn->kn_kq;
|
2008-03-22 00:53:35 +03:00
|
|
|
int rv;
|
|
|
|
|
2021-10-21 03:54:15 +03:00
|
|
|
mutex_spin_enter(&kq->kq_lock);
|
2008-03-22 00:53:35 +03:00
|
|
|
rv = (kn->kn_data != 0);
|
2021-10-21 03:54:15 +03:00
|
|
|
mutex_spin_exit(&kq->kq_lock);
|
2008-03-22 00:53:35 +03:00
|
|
|
|
2018-01-09 06:31:12 +03:00
|
|
|
return rv;
|
|
|
|
}
|
|
|
|
|
2020-10-31 04:08:31 +03:00
|
|
|
static int
|
|
|
|
filt_userattach(struct knote *kn)
|
|
|
|
{
|
|
|
|
struct kqueue *kq = kn->kn_kq;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* EVFILT_USER knotes are not attached to anything in the kernel.
|
|
|
|
*/
|
|
|
|
mutex_spin_enter(&kq->kq_lock);
|
|
|
|
kn->kn_hook = NULL;
|
|
|
|
if (kn->kn_fflags & NOTE_TRIGGER)
|
|
|
|
kn->kn_hookid = 1;
|
|
|
|
else
|
|
|
|
kn->kn_hookid = 0;
|
|
|
|
mutex_spin_exit(&kq->kq_lock);
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
filt_userdetach(struct knote *kn)
|
|
|
|
{
|
|
|
|
|
|
|
|
/*
|
|
|
|
* EVFILT_USER knotes are not attached to anything in the kernel.
|
|
|
|
*/
|
|
|
|
}

static int
filt_user(struct knote *kn, long hint)
{
	struct kqueue *kq = kn->kn_kq;
	int hookid;

	mutex_spin_enter(&kq->kq_lock);
	hookid = kn->kn_hookid;
	mutex_spin_exit(&kq->kq_lock);

	return hookid;
}
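
/*
 * Illustrative sketch (assumed userland usage, not code from this file):
 * one thread registers a user event and waits in kevent(); another
 * wakes it by setting NOTE_TRIGGER, which lands in filt_usertouch()
 * below, sets kn_hookid, and so makes filt_user() report the knote
 * as active:
 *
 *	EV_SET(&kev, 1, EVFILT_USER, EV_ADD | EV_CLEAR, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);	(register)
 *	...
 *	EV_SET(&kev, 1, EVFILT_USER, 0, NOTE_TRIGGER, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);	(trigger, any thread)
 */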

static int
filt_usertouch(struct knote *kn, struct kevent *kev, long type)
{
	int ffctrl;

	KASSERT(mutex_owned(&kn->kn_kq->kq_lock));

	switch (type) {
	case EVENT_REGISTER:
		if (kev->fflags & NOTE_TRIGGER)
			kn->kn_hookid = 1;

		ffctrl = kev->fflags & NOTE_FFCTRLMASK;
		kev->fflags &= NOTE_FFLAGSMASK;
		switch (ffctrl) {
		case NOTE_FFNOP:
			break;

		case NOTE_FFAND:
			kn->kn_sfflags &= kev->fflags;
			break;

		case NOTE_FFOR:
			kn->kn_sfflags |= kev->fflags;
			break;

		case NOTE_FFCOPY:
			kn->kn_sfflags = kev->fflags;
			break;

		default:
			/* XXX Return error? */
			break;
		}
		kn->kn_sdata = kev->data;
		if (kev->flags & EV_CLEAR) {
			kn->kn_hookid = 0;
			kn->kn_data = 0;
			kn->kn_fflags = 0;
		}
		break;

	case EVENT_PROCESS:
		*kev = kn->kn_kevent;
		kev->fflags = kn->kn_sfflags;
		kev->data = kn->kn_sdata;
		if (kn->kn_flags & EV_CLEAR) {
			kn->kn_hookid = 0;
			kn->kn_data = 0;
			kn->kn_fflags = 0;
		}
		break;

	default:
		panic("filt_usertouch() - invalid type (%ld)", type);
		break;
	}

	return 0;
}
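
/*
 * Descriptive note (summarizing the switch above): the control bits of
 * fflags select how the user-supplied flag bits are folded into the
 * saved kn_sfflags:
 *
 *	NOTE_FFAND	kn_sfflags &= fflags
 *	NOTE_FFOR	kn_sfflags |= fflags
 *	NOTE_FFCOPY	kn_sfflags = fflags
 *	NOTE_FFNOP	kn_sfflags unchanged
 *
 * so, for example, NOTE_TRIGGER | NOTE_FFOR | 0x1 both fires the event
 * and ORs 0x1 into the flags reported back at EVENT_PROCESS time (the
 * NOTE_TRIGGER bit itself is stripped by the NOTE_FFLAGSMASK masking).
 */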

/*
 * filt_seltrue:
 *
 *	This filter "event" routine simulates seltrue().
 */
int
filt_seltrue(struct knote *kn, long hint)
{

	/*
	 * We don't know how much data can be read/written,
	 * but we know that it *can* be.  This is about as
	 * good as select/poll can do.
	 */
	kn->kn_data = 0;
	return (1);
}

/*
 * This provides a full kqfilter entry for device switch tables, with
 * the same effect as a filter that uses filt_seltrue() as its filter
 * method.
 */
static void
filt_seltruedetach(struct knote *kn)
{
	/* Nothing to do */
}

const struct filterops seltrue_filtops = {
	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach = NULL,
	.f_detach = filt_seltruedetach,
	.f_event = filt_seltrue,
};

int
seltrue_kqfilter(dev_t dev, struct knote *kn)
{
	switch (kn->kn_filter) {
	case EVFILT_READ:
	case EVFILT_WRITE:
		kn->kn_fop = &seltrue_filtops;
		break;
	default:
		return (EINVAL);
	}

	/* Nothing more to do */
	return (0);
}
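
/*
 * Illustrative sketch (hypothetical driver, not code from this file): a
 * character device that is always ready for I/O can point its device
 * switch entry straight at seltrue_kqfilter():
 *
 *	const struct cdevsw mydev_cdevsw = {
 *		.d_open = mydev_open,
 *		.d_read = mydev_read,
 *		.d_poll = seltrue,
 *		.d_kqfilter = seltrue_kqfilter,
 *		...
 *	};
 *
 * "mydev" is a made-up name; the point is that EVFILT_READ/EVFILT_WRITE
 * on such a device always report ready, mirroring seltrue().
 */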

/*
 * kqueue(2) system call.
 */
static int
kqueue1(struct lwp *l, int flags, register_t *retval)
{
	struct kqueue *kq;
	file_t *fp;
	int fd, error;

	if ((error = fd_allocfile(&fp, &fd)) != 0)
		return error;
	fp->f_flag = FREAD | FWRITE | (flags & (FNONBLOCK|FNOSIGPIPE));
	fp->f_type = DTYPE_KQUEUE;
	fp->f_ops = &kqueueops;
	kq = kmem_zalloc(sizeof(*kq), KM_SLEEP);
	mutex_init(&kq->kq_lock, MUTEX_DEFAULT, IPL_SCHED);
	cv_init(&kq->kq_cv, "kqueue");
	selinit(&kq->kq_sel);
	TAILQ_INIT(&kq->kq_head);
	fp->f_kqueue = kq;
	*retval = fd;
	kq->kq_fdp = curlwp->l_fd;
	fd_set_exclose(l, fd, (flags & O_CLOEXEC) != 0);
	fd_affix(curproc, fp, fd);
	return error;
}

/*
 * kqueue(2) system call.
 */
int
sys_kqueue(struct lwp *l, const void *v, register_t *retval)
{
	return kqueue1(l, 0, retval);
}

int
sys_kqueue1(struct lwp *l, const struct sys_kqueue1_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int) flags;
	} */
	return kqueue1(l, SCARG(uap, flags), retval);
}
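
/*
 * Illustrative sketch (userland, not code from this file): kqueue1(2)
 * lets the caller apply descriptor flags atomically at creation time,
 * instead of a separate fcntl() after kqueue():
 *
 *	int kq = kqueue1(O_CLOEXEC | O_NONBLOCK);
 *
 * which flows through sys_kqueue1() above into the fd_set_exclose()
 * and FNONBLOCK handling in kqueue1().
 */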

/*
 * kevent(2) system call.
 */
int
kevent_fetch_changes(void *ctx, const struct kevent *changelist,
    struct kevent *changes, size_t index, int n)
{

	return copyin(changelist + index, changes, n * sizeof(*changes));
}

int
kevent_put_events(void *ctx, struct kevent *events,
    struct kevent *eventlist, size_t index, int n)
{

	return copyout(events, eventlist + index, n * sizeof(*events));
}

static const struct kevent_ops kevent_native_ops = {
	.keo_private = NULL,
	.keo_fetch_timeout = copyin,
	.keo_fetch_changes = kevent_fetch_changes,
	.keo_put_events = kevent_put_events,
};
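
/*
 * Sketch (an assumption for exposition, not code from this file): the
 * kevent_ops vector exists so that a caller with a different user-level
 * kevent layout, e.g. a compat layer, can supply its own marshalling
 * routines while reusing kevent1():
 *
 *	static const struct kevent_ops hypothetical_compat_ops = {
 *		.keo_private = NULL,
 *		.keo_fetch_timeout = compat_fetch_timeout,
 *		.keo_fetch_changes = compat_fetch_changes,
 *		.keo_put_events = compat_put_events,
 *	};
 *
 * where the three hypothetical compat_* routines convert between the
 * foreign and native struct kevent formats around copyin()/copyout().
 */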

int
sys___kevent100(struct lwp *l, const struct sys___kevent100_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int) fd;
		syscallarg(const struct kevent *) changelist;
		syscallarg(size_t) nchanges;
		syscallarg(struct kevent *) eventlist;
		syscallarg(size_t) nevents;
		syscallarg(const struct timespec *) timeout;
	} */

	return kevent1(retval, SCARG(uap, fd), SCARG(uap, changelist),
	    SCARG(uap, nchanges), SCARG(uap, eventlist), SCARG(uap, nevents),
	    SCARG(uap, timeout), &kevent_native_ops);
}

int
kevent1(register_t *retval, int fd,
    const struct kevent *changelist, size_t nchanges,
    struct kevent *eventlist, size_t nevents,
    const struct timespec *timeout,
    const struct kevent_ops *keops)
{
	struct kevent *kevp;
	struct kqueue *kq;
	struct timespec ts;
	size_t i, n, ichange;
	int nerrors, error;
	struct kevent kevbuf[KQ_NEVENTS];	/* approx 300 bytes on 64-bit */
	file_t *fp;

	/* check that we're dealing with a kq */
	fp = fd_getfile(fd);
	if (fp == NULL)
		return (EBADF);

	if (fp->f_type != DTYPE_KQUEUE) {
		fd_putfile(fd);
		return (EBADF);
	}

	if (timeout != NULL) {
		error = (*keops->keo_fetch_timeout)(timeout, &ts, sizeof(ts));
		if (error)
			goto done;
		timeout = &ts;
	}

	kq = fp->f_kqueue;
	nerrors = 0;
	ichange = 0;

	/* traverse list of events to register */
	while (nchanges > 0) {
		n = MIN(nchanges, __arraycount(kevbuf));
		error = (*keops->keo_fetch_changes)(keops->keo_private,
		    changelist, kevbuf, ichange, n);
		if (error)
			goto done;
		for (i = 0; i < n; i++) {
			kevp = &kevbuf[i];
			kevp->flags &= ~EV_SYSFLAGS;
			/* register each knote */
			error = kqueue_register(kq, kevp);
			if (!error && !(kevp->flags & EV_RECEIPT))
				continue;
			if (nevents == 0)
				goto done;
			kevp->flags = EV_ERROR;
			kevp->data = error;
			error = (*keops->keo_put_events)
			    (keops->keo_private, kevp,
			    eventlist, nerrors, 1);
			if (error)
				goto done;
			nevents--;
			nerrors++;
		}
		nchanges -= n;	/* update the results */
		ichange += n;
	}
	if (nerrors) {
		*retval = nerrors;
		error = 0;
		goto done;
	}

	/* actually scan through the events */
	error = kqueue_scan(fp, nevents, eventlist, timeout, retval, keops,
	    kevbuf, __arraycount(kevbuf));
 done:
	fd_putfile(fd);
	return (error);
}
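
/*
 * Illustrative sketch (userland, not code from this file): the
 * EV_RECEIPT handling in the registration loop above lets a caller ask
 * for per-change status without draining pending events:
 *
 *	EV_SET(&kev, fd, EVFILT_READ, EV_ADD | EV_RECEIPT, 0, 0, NULL);
 *	n = kevent(kq, &kev, 1, &kev, 1, NULL);
 *
 * On return the change slot comes back with EV_ERROR set in flags and
 * the error number (0 on success) in data; because nerrors is non-zero,
 * kevent1() returns before kqueue_scan(), so no queued events are
 * consumed by the call.
 */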

/*
 * Register a given kevent kev onto the kqueue
 */
static int
kqueue_register(struct kqueue *kq, struct kevent *kev)
{
	struct kfilter *kfilter;
	filedesc_t *fdp;
	file_t *fp;
	fdfile_t *ff;
	struct knote *kn, *newkn;
	struct klist *list;
	int error, fd, rv;

	fdp = kq->kq_fdp;
	fp = NULL;
	kn = NULL;
	error = 0;
	fd = 0;

	newkn = knote_alloc(true);

	rw_enter(&kqueue_filter_lock, RW_READER);
	kfilter = kfilter_byfilter(kev->filter);
	if (kfilter == NULL || kfilter->filtops == NULL) {
		/* filter not found nor implemented */
		rw_exit(&kqueue_filter_lock);
		knote_free(newkn);
		return (EINVAL);
	}

	/* search if knote already exists */
	if (kfilter->filtops->f_flags & FILTEROP_ISFD) {
		/* monitoring a file descriptor */
		/* validate descriptor */
		if (kev->ident > INT_MAX
		    || (fp = fd_getfile(fd = kev->ident)) == NULL) {
			rw_exit(&kqueue_filter_lock);
			knote_free(newkn);
			return EBADF;
		}
		mutex_enter(&fdp->fd_lock);
		ff = fdp->fd_dt->dt_ff[fd];
		if (ff->ff_refcnt & FR_CLOSING) {
			error = EBADF;
			goto doneunlock;
		}
		if (fd <= fdp->fd_lastkqfile) {
			SLIST_FOREACH(kn, &ff->ff_knlist, kn_link) {
				if (kq == kn->kn_kq &&
				    kev->filter == kn->kn_filter)
					break;
			}
		}
	} else {
		/*
		 * not monitoring a file descriptor, so
		 * lookup knotes in internal hash table
		 */
		mutex_enter(&fdp->fd_lock);
		if (fdp->fd_knhashmask != 0) {
			list = &fdp->fd_knhash[
			    KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)];
			SLIST_FOREACH(kn, list, kn_link) {
				if (kev->ident == kn->kn_id &&
				    kq == kn->kn_kq &&
				    kev->filter == kn->kn_filter)
					break;
			}
		}
	}

	/* It's safe to test KQ_CLOSING while holding only the fd_lock. */
	KASSERT(mutex_owned(&fdp->fd_lock));
	KASSERT((kq->kq_count & KQ_CLOSING) == 0);

	/*
	 * kn now contains the matching knote, or NULL if no match
	 */
	if (kn == NULL) {
		if (kev->flags & EV_ADD) {
			/* create new knote */
			kn = newkn;
			newkn = NULL;
			kn->kn_obj = fp;
			kn->kn_id = kev->ident;
			kn->kn_kq = kq;
			kn->kn_fop = kfilter->filtops;
			kn->kn_kfilter = kfilter;
			kn->kn_sfflags = kev->fflags;
			kn->kn_sdata = kev->data;
			kev->fflags = 0;
			kev->data = 0;
			kn->kn_kevent = *kev;

			KASSERT(kn->kn_fop != NULL);
			/*
			 * XXX Allow only known-safe users of f_touch.
			 * XXX See filter_touch() for details.
			 */
			if (kn->kn_fop->f_touch != NULL &&
			    kn->kn_fop != &timer_filtops &&
			    kn->kn_fop != &user_filtops) {
				error = ENOTSUP;
				goto fail_ev_add;
			}

			/*
			 * apply reference count to knote structure, and
			 * do not release it at the end of this routine.
			 */
			fp = NULL;

			if (!(kn->kn_fop->f_flags & FILTEROP_ISFD)) {
				/*
				 * If knote is not on an fd, store on
				 * internal hash table.
				 */
				if (fdp->fd_knhashmask == 0) {
					/* XXXAD can block with fd_lock held */
					fdp->fd_knhash = hashinit(KN_HASHSIZE,
					    HASH_LIST, true,
					    &fdp->fd_knhashmask);
				}
				list = &fdp->fd_knhash[KN_HASH(kn->kn_id,
				    fdp->fd_knhashmask)];
			} else {
				/* Otherwise, knote is on an fd. */
				list = (struct klist *)
				    &fdp->fd_dt->dt_ff[kn->kn_id]->ff_knlist;
				if ((int)kn->kn_id > fdp->fd_lastkqfile)
					fdp->fd_lastkqfile = kn->kn_id;
			}
			SLIST_INSERT_HEAD(list, kn, kn_link);

			/*
			 * N.B. kn->kn_fop may change as the result
			 * of filter_attach()!
			 */
			knote_foplock_enter(kn);
			error = filter_attach(kn);
			if (error != 0) {
#ifdef DEBUG
				struct proc *p = curlwp->l_proc;
				const file_t *ft = kn->kn_obj;
				printf("%s: %s[%d]: event type %d not "
				    "supported for file type %d/%s "
				    "(error %d)\n", __func__,
				    p->p_comm, p->p_pid,
				    kn->kn_filter, ft ? ft->f_type : -1,
				    ft ? ft->f_ops->fo_name : "?", error);
#endif

 fail_ev_add:
				/*
				 * N.B. no need to check for this note to
				 * be in-flux, since it was never visible
				 * to the monitored object.
				 *
				 * knote_detach() drops fdp->fd_lock
				 */
				knote_foplock_exit(kn);
				mutex_enter(&kq->kq_lock);
				KNOTE_WILLDETACH(kn);
				KASSERT(kn_in_flux(kn) == false);
				mutex_exit(&kq->kq_lock);
				knote_detach(kn, fdp, false);
				goto done;
			}
			atomic_inc_uint(&kfilter->refcnt);
			goto done_ev_add;
		} else {
			/* No matching knote and the EV_ADD flag is not set. */
			error = ENOENT;
			goto doneunlock;
		}
	}

	if (kev->flags & EV_DELETE) {
		/*
		 * Let the world know that this knote is about to go
		 * away, and wait for it to settle if it's currently
		 * in-flux.
		 */
		mutex_spin_enter(&kq->kq_lock);
		if (kn->kn_status & KN_WILLDETACH) {
			/*
			 * This knote is already on its way out,
			 * so just be done.
			 */
			mutex_spin_exit(&kq->kq_lock);
			goto doneunlock;
		}
		KNOTE_WILLDETACH(kn);
		if (kn_in_flux(kn)) {
			mutex_exit(&fdp->fd_lock);
			/*
			 * It's safe for us to conclusively wait for
			 * this knote to settle because we know we'll
			 * be completing the detach.
			 */
			kn_wait_flux(kn, true);
			KASSERT(kn_in_flux(kn) == false);
			mutex_spin_exit(&kq->kq_lock);
			mutex_enter(&fdp->fd_lock);
		} else {
			mutex_spin_exit(&kq->kq_lock);
		}

		/* knote_detach() drops fdp->fd_lock */
		knote_detach(kn, fdp, true);
		goto done;
	}

	/*
	 * The user may change some filter values after the
	 * initial EV_ADD, but doing so will not reset any
	 * filters which have already been triggered.
	 */
	knote_foplock_enter(kn);
	kn->kn_kevent.udata = kev->udata;
	KASSERT(kn->kn_fop != NULL);
	if (!(kn->kn_fop->f_flags & FILTEROP_ISFD) &&
	    kn->kn_fop->f_touch != NULL) {
		mutex_spin_enter(&kq->kq_lock);
		error = filter_touch(kn, kev, EVENT_REGISTER);
		mutex_spin_exit(&kq->kq_lock);
		if (__predict_false(error != 0)) {
			/* Never a new knote (which would consume newkn). */
			KASSERT(newkn != NULL);
			knote_foplock_exit(kn);
			goto doneunlock;
		}
	} else {
		kn->kn_sfflags = kev->fflags;
		kn->kn_sdata = kev->data;
	}

	/*
	 * We can get here if we are trying to attach
	 * an event to a file descriptor that does not
	 * support events, and the attach routine is
	 * broken and does not return an error.
	 */
 done_ev_add:
	rv = filter_event(kn, 0, false);
	if (rv)
		knote_activate(kn);

	knote_foplock_exit(kn);

	/* disable knote */
	if ((kev->flags & EV_DISABLE)) {
		mutex_spin_enter(&kq->kq_lock);
		if ((kn->kn_status & KN_DISABLED) == 0)
			kn->kn_status |= KN_DISABLED;
		mutex_spin_exit(&kq->kq_lock);
	}

	/* enable knote */
	if ((kev->flags & EV_ENABLE)) {
		knote_enqueue(kn);
	}
 doneunlock:
	mutex_exit(&fdp->fd_lock);
 done:
	rw_exit(&kqueue_filter_lock);
	if (newkn != NULL)
		knote_free(newkn);
	if (fp != NULL)
		fd_putfile(fd);
	return (error);
}
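
/*
 * Descriptive note (summarizing the code above, not new policy): the
 * lock ordering in kqueue_register() is
 *
 *	kqueue_filter_lock (reader) -> fdp->fd_lock -> kq->kq_lock
 *
 * with fdp->fd_lock held across the existence check and the attach so
 * that an EV_DELETE or a close of the kqueue cannot slip in between.
 */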

#define KN_FMT(buf, kn) \
    (snprintb((buf), sizeof(buf), __KN_FLAG_BITS, (kn)->kn_status), buf)

#if defined(DDB)
void
kqueue_printit(struct kqueue *kq, bool full, void (*pr)(const char *, ...))
{
	const struct knote *kn;
	u_int count;
	int nmarker;
	char buf[128];

	count = 0;
	nmarker = 0;

	(*pr)("kqueue %p (restart=%d count=%u):\n", kq,
	    !!(kq->kq_count & KQ_RESTART), KQ_COUNT(kq));
	(*pr)(" Queued knotes:\n");
	TAILQ_FOREACH(kn, &kq->kq_head, kn_tqe) {
		if (kn->kn_status & KN_MARKER) {
			nmarker++;
		} else {
			count++;
		}
		(*pr)(" knote %p: kq=%p status=%s\n",
		    kn, kn->kn_kq, KN_FMT(buf, kn));
		(*pr)(" id=0x%lx (%lu) filter=%d\n",
		    (u_long)kn->kn_id, (u_long)kn->kn_id, kn->kn_filter);
		if (kn->kn_kq != kq) {
			(*pr)(" !!! kn->kn_kq != kq\n");
		}
	}
	if (count != KQ_COUNT(kq)) {
		(*pr)(" !!! count(%u) != KQ_COUNT(%u)\n",
		    count, KQ_COUNT(kq));
	}
}
#endif /* DDB */

#if defined(DEBUG)
static void
kqueue_check(const char *func, size_t line, const struct kqueue *kq)
{
	const struct knote *kn;
	u_int count;
	int nmarker;
	char buf[128];

	KASSERT(mutex_owned(&kq->kq_lock));

	count = 0;
	nmarker = 0;
	TAILQ_FOREACH(kn, &kq->kq_head, kn_tqe) {
		if ((kn->kn_status & (KN_MARKER | KN_QUEUED)) == 0) {
			panic("%s,%zu: kq=%p kn=%p !(MARKER|QUEUED) %s",
			    func, line, kq, kn, KN_FMT(buf, kn));
		}
		if ((kn->kn_status & KN_MARKER) == 0) {
			if (kn->kn_kq != kq) {
				panic("%s,%zu: kq=%p kn(%p) != kn->kq(%p): %s",
				    func, line, kq, kn, kn->kn_kq,
				    KN_FMT(buf, kn));
			}
			if ((kn->kn_status & KN_ACTIVE) == 0) {
				panic("%s,%zu: kq=%p kn=%p: !ACTIVE %s",
				    func, line, kq, kn, KN_FMT(buf, kn));
			}
			count++;
			if (count > KQ_COUNT(kq)) {
				panic("%s,%zu: kq=%p kq->kq_count(%u) != "
				    "count(%d), nmarker=%d",
				    func, line, kq, KQ_COUNT(kq), count,
				    nmarker);
			}
		} else {
			nmarker++;
		}
	}
}
#define kq_check(a) kqueue_check(__func__, __LINE__, (a))
#else /* defined(DEBUG) */
#define kq_check(a) /* nothing */
#endif /* defined(DEBUG) */

static void
kqueue_restart(file_t *fp)
{
	struct kqueue *kq = fp->f_kqueue;
	KASSERT(kq != NULL);

	mutex_spin_enter(&kq->kq_lock);
	kq->kq_count |= KQ_RESTART;
	cv_broadcast(&kq->kq_cv);
	mutex_spin_exit(&kq->kq_lock);
}
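
/*
 * Descriptive note (an interpretation of the code above): KQ_RESTART is
 * a flag bit stolen from kq_count; setting it and broadcasting kq_cv
 * nudges any thread sleeping in kqueue_scan() so that it can notice the
 * restart request (typically because a file descriptor it depends on is
 * being closed) and back out rather than keep sleeping.
 */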

static int
kqueue_fpathconf(struct file *fp, int name, register_t *retval)
{

	return EINVAL;
}

/*
 * Scan through the list of events on fp (for a maximum of maxevents),
 * returning the results in ulistp.  Timeout is determined by tsp; if
 * NULL, wait indefinitely; if 0 valued, perform a poll; otherwise wait
 * as appropriate.
 */
static int
kqueue_scan(file_t *fp, size_t maxevents, struct kevent *ulistp,
    const struct timespec *tsp, register_t *retval,
    const struct kevent_ops *keops, struct kevent *kevbuf,
    size_t kevcnt)
{
	struct kqueue *kq;
	struct kevent *kevp;
	struct timespec ats, sleepts;
	struct knote *kn, *marker;
	struct knote_impl morker;
	size_t count, nkev, nevents;
	int timeout, error, touch, rv, influx;
	filedesc_t *fdp;

	fdp = curlwp->l_fd;
	kq = fp->f_kqueue;
	count = maxevents;
	nkev = nevents = error = 0;
	if (count == 0) {
		*retval = 0;
		return 0;
	}

	if (tsp) {			/* timeout supplied */
		ats = *tsp;
		if (inittimeleft(&ats, &sleepts) == -1) {
			*retval = maxevents;
			return EINVAL;
		}
		timeout = tstohz(&ats);
		if (timeout <= 0)
			timeout = -1;	/* do poll */
	} else {
		/* no timeout, wait forever */
		timeout = 0;
	}
|
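
	/*
	 * Set up a private marker knote on the stack.  Queued behind the
	 * existing knotes, it marks where this scan should stop; other
	 * threads treat a foreign marker as a scan in progress and wait.
	 */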
	memset(&morker, 0, sizeof(morker));
	marker = &morker.ki_knote;
	marker->kn_kq = kq;
	marker->kn_status = KN_MARKER;
	mutex_spin_enter(&kq->kq_lock);
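
	/*
	 * The scan re-enters here, with kq_lock held, either after a
	 * wakeup with sleep time remaining or after a pass over the
	 * queue yielded no events.
	 */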
retry:
	kevp = kevbuf;
	if (KQ_COUNT(kq) == 0) {
		if (timeout >= 0) {
			error = cv_timedwait_sig(&kq->kq_cv,
			    &kq->kq_lock, timeout);
			if (error == 0) {
				if (KQ_COUNT(kq) == 0 &&
				    (kq->kq_count & KQ_RESTART)) {
					/* return to clear file reference */
					error = ERESTART;
				} else if (tsp == NULL || (timeout =
				    gettimeleft(&ats, &sleepts)) > 0) {
					goto retry;
				}
			} else {
				/* don't restart after signals... */
				if (error == ERESTART)
					error = EINTR;
				if (error == EWOULDBLOCK)
					error = 0;
			}
		}
		mutex_spin_exit(&kq->kq_lock);
		goto done;
	}

	/* mark end of knote list */
	TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe);
	influx = 0;

	/*
	 * Acquire the fdp->fd_lock interlock to avoid races with
	 * file creation/destruction from other threads.
	 */
	mutex_spin_exit(&kq->kq_lock);
relock:
	mutex_enter(&fdp->fd_lock);
	mutex_spin_enter(&kq->kq_lock);

	while (count != 0) {
		/*
		 * Get next knote. We are guaranteed this will never
		 * be NULL because of the marker we inserted above.
		 */
		kn = TAILQ_FIRST(&kq->kq_head);

		bool kn_is_other_marker =
		    (kn->kn_status & KN_MARKER) != 0 && kn != marker;
		bool kn_is_detaching = (kn->kn_status & KN_WILLDETACH) != 0;
		bool kn_is_in_flux = kn_in_flux(kn);

		/*
		 * If we found a marker that's not ours, or this knote
		 * is in a state of flux, then wait for everything to
		 * settle down and go around again.
		 */
		if (kn_is_other_marker || kn_is_detaching || kn_is_in_flux) {
			if (influx) {
				influx = 0;
				KQ_FLUX_WAKEUP(kq);
			}
			mutex_exit(&fdp->fd_lock);
			if (kn_is_other_marker || kn_is_in_flux) {
				KQ_FLUX_WAIT(kq);
				mutex_spin_exit(&kq->kq_lock);
			} else {
				/*
				 * Detaching but not in-flux? Someone is
				 * actively trying to finish the job; just
				 * go around and try again.
				 */
				KASSERT(kn_is_detaching);
				mutex_spin_exit(&kq->kq_lock);
				preempt_point();
			}
			goto relock;
		}

		TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
		if (kn == marker) {
			/* it's our marker, stop */
			KQ_FLUX_WAKEUP(kq);
			if (count == maxevents) {
				mutex_exit(&fdp->fd_lock);
				goto retry;
			}
			break;
		}
		KASSERT((kn->kn_status & KN_BUSY) == 0);

		kq_check(kq);
		kn->kn_status &= ~KN_QUEUED;
		kn->kn_status |= KN_BUSY;
		kq_check(kq);
		if (kn->kn_status & KN_DISABLED) {
			kn->kn_status &= ~KN_BUSY;
			kq->kq_count--;
			/* don't want disabled events */
			continue;
		}
		if ((kn->kn_flags & EV_ONESHOT) == 0) {
			mutex_spin_exit(&kq->kq_lock);
			KASSERT(mutex_owned(&fdp->fd_lock));
Fix a problem whereby detaching a device that has open kevent
registrations can result in a use-after-free: When a device detaches,
it calls seldestroy(), which calls knote_fini(), and when that
returns, the softc that contained the selinfo and klist is freed.
However, any knotes that were registered still linger on with the
kq descriptor they were associated with, and when the file
descriptors close, those knotes will be f_detach'd, which will
call into the driver instance that no longer exists.
Address this problem by adding a "foplock" mutex to the knote.
This foplock must be held when calling into filter_attach(),
filter_detach(), and filter_event() (XXX not filter_touch();
see code for details). Now, in klist_fini(), for each knote
that is on the klist that's about to be blown away, acquire
the foplock, replace the knote's filterops with a do-nothing
stub, and release the foplock.
The end result is that:
==> The foplock ensures that calls into filter_*() will get EITHER
the real backing object's filterops OR the nop stubs.
==> Holding the foplock across the filter_*() calls ensures that
klist_fini() will not complete until there are no callers inside
the filterops that are about to be blown away.
			knote_foplock_enter(kn);
			rv = filter_event(kn, 0, false);
			knote_foplock_exit(kn);
			mutex_spin_enter(&kq->kq_lock);
			/* Re-poll if note was re-enqueued. */
			if ((kn->kn_status & KN_QUEUED) != 0) {
				kn->kn_status &= ~KN_BUSY;
				/* Re-enqueue raised kq_count, lower it again */
				kq->kq_count--;
				influx = 1;
				continue;
			}
			if (rv == 0) {
				/*
				 * non-ONESHOT event that hasn't triggered
				 * again, so it will remain de-queued.
				 */
				kn->kn_status &= ~(KN_ACTIVE|KN_BUSY);
				kq->kq_count--;
				influx = 1;
				continue;
			}
		} else {
			/*
			 * Must NOT drop kq_lock until we can do
			 * the KNOTE_WILLDETACH() below.
			 */
		}

		KASSERT(kn->kn_fop != NULL);
		touch = (!(kn->kn_fop->f_flags & FILTEROP_ISFD) &&
		    kn->kn_fop->f_touch != NULL);
		/* XXXAD should be got from f_event if !oneshot. */
		KASSERT((kn->kn_status & KN_WILLDETACH) == 0);
		if (touch) {
			(void)filter_touch(kn, kevp, EVENT_PROCESS);
		} else {
			*kevp = kn->kn_kevent;
		}
		kevp++;
		nkev++;
		influx = 1;
		if (kn->kn_flags & EV_ONESHOT) {
			/* delete ONESHOT events after retrieval */
			KNOTE_WILLDETACH(kn);
			kn->kn_status &= ~KN_BUSY;
			kq->kq_count--;
			KASSERT(kn_in_flux(kn) == false);
			KASSERT((kn->kn_status & KN_WILLDETACH) != 0);
			KASSERT(kn->kn_kevent.udata == curlwp);
			mutex_spin_exit(&kq->kq_lock);
			knote_detach(kn, fdp, true);
			mutex_enter(&fdp->fd_lock);
			mutex_spin_enter(&kq->kq_lock);
		} else if (kn->kn_flags & EV_CLEAR) {
			/* clear state after retrieval */
			kn->kn_data = 0;
			kn->kn_fflags = 0;
			/*
			 * Manually clear knotes that weren't
			 * 'touch'ed.
			 */
			if (touch == 0) {
				kn->kn_data = 0;
				kn->kn_fflags = 0;
			}
			kn->kn_status &= ~(KN_ACTIVE|KN_BUSY);
			kq->kq_count--;
		} else if (kn->kn_flags & EV_DISPATCH) {
			kn->kn_status |= KN_DISABLED;
			kn->kn_status &= ~(KN_ACTIVE|KN_BUSY);
			kq->kq_count--;
		} else {
			/* add event back on list */
			kq_check(kq);
			kn->kn_status |= KN_QUEUED;
			kn->kn_status &= ~KN_BUSY;
			TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
			kq_check(kq);
		}
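
		/*
		 * Both locks are dropped around the copyout below, so
		 * clear our in-flux accounting and wake any threads
		 * waiting on our knotes before leaving the queue
		 * unprotected.
		 */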
		if (nkev == kevcnt) {
			/* do copyouts in kevcnt chunks */
			influx = 0;
			KQ_FLUX_WAKEUP(kq);
			mutex_spin_exit(&kq->kq_lock);
			mutex_exit(&fdp->fd_lock);
			error = (*keops->keo_put_events)
			    (keops->keo_private,
			    kevbuf, ulistp, nevents, nkev);
			mutex_enter(&fdp->fd_lock);
			mutex_spin_enter(&kq->kq_lock);
			nevents += nkev;
			nkev = 0;
			kevp = kevbuf;
		}
		count--;
		if (error != 0 || count == 0) {
			/* remove marker */
			TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe);
			break;
		}
	}
	KQ_FLUX_WAKEUP(kq);
	mutex_spin_exit(&kq->kq_lock);
	mutex_exit(&fdp->fd_lock);

done:
	if (nkev != 0) {
		/* copyout remaining events */
		error = (*keops->keo_put_events)(keops->keo_private,
		    kevbuf, ulistp, nevents, nkev);
	}
	*retval = maxevents - count;

	return error;
}

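/*
 * A minimal sketch (not part of this file) of a keo_put_events
 * callback, mirroring the contract kqueue_scan() relies on above:
 * copy 'n' events from the kernel buffer into the user-supplied
 * event list starting at slot 'index'.  The name example_put_events
 * is hypothetical and the exact prototype is assumed from the call
 * sites above.
 */
static int
example_put_events(void *ctx, struct kevent *events,
    struct kevent *eventlist, size_t index, size_t n)
{

	/* 'eventlist' is a user-space pointer, hence copyout(). */
	return copyout(events, eventlist + index,
	    n * sizeof(struct kevent));
}
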

/*
 * fileops ioctl method for a kqueue descriptor.
 *
 * Two ioctls are currently supported. They both use struct kfilter_mapping:
 *	KFILTER_BYNAME		find filter for name. len is ignored.
 *	KFILTER_BYFILTER	find name for filter, and return result in
 *				name, which is of size len.
 */
/*ARGSUSED*/
static int
kqueue_ioctl(file_t *fp, u_long com, void *data)
{
	struct kfilter_mapping *km;
	const struct kfilter *kfilter;
	char *name;
	int error;

	km = data;
	error = 0;
	name = kmem_alloc(KFILTER_MAXNAME, KM_SLEEP);

	switch (com) {
	case KFILTER_BYFILTER:	/* convert filter -> name */
		rw_enter(&kqueue_filter_lock, RW_READER);
		kfilter = kfilter_byfilter(km->filter);
		if (kfilter != NULL) {
			strlcpy(name, kfilter->name, KFILTER_MAXNAME);
			rw_exit(&kqueue_filter_lock);
			error = copyoutstr(name, km->name, km->len, NULL);
		} else {
			rw_exit(&kqueue_filter_lock);
			error = ENOENT;
		}
		break;

	case KFILTER_BYNAME:	/* convert name -> filter */
		error = copyinstr(km->name, name, KFILTER_MAXNAME, NULL);
		if (error) {
			break;
		}
		rw_enter(&kqueue_filter_lock, RW_READER);
		kfilter = kfilter_byname(name);
		if (kfilter != NULL)
			km->filter = kfilter->filter;
		else
			error = ENOENT;
		rw_exit(&kqueue_filter_lock);
		break;

	default:
		error = ENOTTY;
		break;

	}
	kmem_free(name, KFILTER_MAXNAME);
	return (error);
}

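/*
 * Illustrative userland sketch (not part of this file), assuming the
 * struct kfilter_mapping layout from <sys/event.h>: resolve a filter
 * number from its name via the kqueue descriptor.
 *
 *	struct kfilter_mapping km;
 *	km.name = "EVFILT_READ";
 *	km.len = 0;			len is ignored for BYNAME
 *	if (ioctl(kqfd, KFILTER_BYNAME, &km) == 0)
 *		printf("filter = %d\n", km.filter);
 */
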
/*
 * fileops fcntl method for a kqueue descriptor.
 */
static int
kqueue_fcntl(file_t *fp, u_int com, void *data)
{

	return (ENOTTY);
}

/*
 * fileops poll method for a kqueue descriptor.
 * Determine if kqueue has events pending.
 */
static int
kqueue_poll(file_t *fp, int events)
{
	struct kqueue *kq;
	int revents;

	kq = fp->f_kqueue;

	revents = 0;
	if (events & (POLLIN | POLLRDNORM)) {
		mutex_spin_enter(&kq->kq_lock);
		if (KQ_COUNT(kq) != 0) {
			revents |= events & (POLLIN | POLLRDNORM);
		} else {
			selrecord(curlwp, &kq->kq_sel);
		}
		kq_check(kq);
		mutex_spin_exit(&kq->kq_lock);
	}

	return revents;
}

/*
 * fileops stat method for a kqueue descriptor.
 * Returns dummy info, with st_size being number of events pending.
 */
static int
kqueue_stat(file_t *fp, struct stat *st)
{
	struct kqueue *kq;

	kq = fp->f_kqueue;

	memset(st, 0, sizeof(*st));
	st->st_size = KQ_COUNT(kq);
	st->st_blksize = sizeof(struct kevent);
	st->st_mode = S_IFIFO | S_IRUSR | S_IWUSR;
	st->st_blocks = 1;
	st->st_uid = kauth_cred_geteuid(fp->f_cred);
	st->st_gid = kauth_cred_getegid(fp->f_cred);

	return 0;
}

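/*
 * Detach every knote on 'list' that belongs to this kqueue.  Called
 * with fdp->fd_lock held; that lock may be dropped and re-taken while
 * an individual knote is quiesced and detached, so the list is
 * rescanned from the head each time.
 */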
static void
kqueue_doclose(struct kqueue *kq, struct klist *list, int fd)
{
	struct knote *kn;
	filedesc_t *fdp;

	fdp = kq->kq_fdp;

	KASSERT(mutex_owned(&fdp->fd_lock));

again:
	for (kn = SLIST_FIRST(list); kn != NULL;) {
		if (kq != kn->kn_kq) {
			kn = SLIST_NEXT(kn, kn_link);
			continue;
		}
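		/*
		 * Wait for any in-flux state on this knote to settle;
		 * if the quiesce path had to drop fd_lock, re-take it
		 * and rescan the list from the top.
		 */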
		if (knote_detach_quiesce(kn)) {
			mutex_enter(&fdp->fd_lock);
			goto again;
		}
		knote_detach(kn, fdp, true);
		mutex_enter(&fdp->fd_lock);
		kn = SLIST_FIRST(list);
	}
}

/*
 * fileops close method for a kqueue descriptor.
 */
static int
kqueue_close(file_t *fp)
{
	struct kqueue *kq;
	filedesc_t *fdp;
	fdfile_t *ff;
	int i;

	kq = fp->f_kqueue;
	fp->f_kqueue = NULL;
	fp->f_type = 0;
	fdp = curlwp->l_fd;

2021-10-10 21:07:51 +03:00
|
|
|
KASSERT(kq->kq_fdp == fdp);
|
|
|
|
|
2008-03-22 00:53:35 +03:00
|
|
|
mutex_enter(&fdp->fd_lock);

	/*
	 * We're going to drop the fd_lock multiple times while
	 * we detach knotes.  During this time, attempts to register
	 * knotes via the back door (e.g. knote_proc_fork_track())
	 * need to fail, lest they sneak in to attach a knote after
	 * we've already drained the list it's destined for.
	 *
	 * We must acquire kq_lock here to set KQ_CLOSING (to serialize
	 * with other code paths that modify kq_count without holding
	 * the fd_lock), but once this bit is set, it's only safe to
	 * test it while holding the fd_lock, and holding kq_lock while
	 * doing so is not necessary.
	 */
	mutex_enter(&kq->kq_lock);
	kq->kq_count |= KQ_CLOSING;
	mutex_exit(&kq->kq_lock);
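
	/*
	 * Illustrative sketch (not code from this file) of the check a
	 * back-door registration path is expected to make under the
	 * rules above -- KQ_CLOSING is tested only with fd_lock held:
	 *
	 *	mutex_enter(&fdp->fd_lock);
	 *	if (kq->kq_count & KQ_CLOSING) {
	 *		mutex_exit(&fdp->fd_lock);
	 *		return;		(the kqueue is being closed)
	 *	}
	 */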

	for (i = 0; i <= fdp->fd_lastkqfile; i++) {
		if ((ff = fdp->fd_dt->dt_ff[i]) == NULL)
			continue;
		kqueue_doclose(kq, (struct klist *)&ff->ff_knlist, i);
	}
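	/* Drain knotes whose identifiers are not file descriptors. */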
	if (fdp->fd_knhashmask != 0) {
		for (i = 0; i < fdp->fd_knhashmask + 1; i++) {
			kqueue_doclose(kq, &fdp->fd_knhash[i], -1);
		}
	}

	mutex_exit(&fdp->fd_lock);

#if defined(DEBUG)
	mutex_enter(&kq->kq_lock);
	kq_check(kq);
	mutex_exit(&kq->kq_lock);
#endif /* DEBUG */
	KASSERT(TAILQ_EMPTY(&kq->kq_head));
	KASSERT(KQ_COUNT(kq) == 0);
	mutex_destroy(&kq->kq_lock);
	cv_destroy(&kq->kq_cv);
	seldestroy(&kq->kq_sel);
	kmem_free(kq, sizeof(*kq));

	return (0);
}

/*
 * struct fileops kqfilter method for a kqueue descriptor.
 * Event triggered when monitored kqueue changes.
 */
static int
kqueue_kqfilter(file_t *fp, struct knote *kn)
{
	struct kqueue *kq;

	kq = ((file_t *)kn->kn_obj)->f_kqueue;

	KASSERT(fp == kn->kn_obj);

	if (kn->kn_filter != EVFILT_READ)
		return EINVAL;

	kn->kn_fop = &kqread_filtops;
	mutex_enter(&kq->kq_lock);
	selrecord_knote(&kq->kq_sel, kn);
	mutex_exit(&kq->kq_lock);

	return 0;
}
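
/*
 * Illustrative userland sketch (not part of this file): because a
 * kqueue descriptor accepts EVFILT_READ, one kqueue can monitor
 * another; the outer kqueue reports the inner one readable while
 * events are pending on it.  Error handling omitted.
 *
 *	int inner = kqueue(), outer = kqueue();
 *	struct kevent kev;
 *	EV_SET(&kev, inner, EVFILT_READ, EV_ADD, 0, 0, 0);
 *	kevent(outer, &kev, 1, NULL, 0, NULL);
 */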

/*
 * Walk down a list of knotes, activating them if their event has
 * triggered.  The caller's object lock (e.g. device driver lock)
 * must be held.
 */
void
knote(struct klist *list, long hint)
{
	struct knote *kn, *tmpkn;

	SLIST_FOREACH_SAFE(kn, list, kn_selnext, tmpkn) {
		/*
		 * We assume here that the backing object's lock is
		 * already held if we're traversing the klist, and
		 * so acquiring the knote foplock would create a
		 * deadlock scenario.  But we also know that the klist
		 * won't disappear on us while we're here, so not
		 * acquiring it is safe.
		 */
		if (filter_event(kn, hint, true)) {
			knote_activate(kn);
		}
	}
}
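
/*
 * Illustrative driver-side sketch (names hypothetical): the event is
 * posted while the driver's own object lock is held, as required
 * above:
 *
 *	mutex_enter(&sc->sc_lock);
 *	sc->sc_state |= MYDEV_READY;
 *	knote(&sc->sc_klist, 0);
 *	mutex_exit(&sc->sc_lock);
 */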

/*
 * Remove all knotes referencing a specified fd.
 */
void
knote_fdclose(int fd)
{
	struct klist *list;
	struct knote *kn;
	filedesc_t *fdp;

again:
	fdp = curlwp->l_fd;
	mutex_enter(&fdp->fd_lock);
	list = (struct klist *)&fdp->fd_dt->dt_ff[fd]->ff_knlist;
	while ((kn = SLIST_FIRST(list)) != NULL) {
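		/*
		 * knote_detach_quiesce() returns true if it had to drop
		 * fd_lock while waiting for an in-flux knote to settle;
		 * the list may have changed meanwhile, so start over.
		 */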
		if (knote_detach_quiesce(kn)) {
			goto again;
		}
		knote_detach(kn, fdp, true);
		mutex_enter(&fdp->fd_lock);
	}
	mutex_exit(&fdp->fd_lock);
}

/*
 * Drop a knote.  Called with fdp->fd_lock held, which is dropped
 * before returning.
 */
static void
knote_detach(struct knote *kn, filedesc_t *fdp, bool dofop)
{
	struct klist *list;
	struct kqueue *kq;

	kq = kn->kn_kq;

	KASSERT((kn->kn_status & KN_MARKER) == 0);
	KASSERT((kn->kn_status & KN_WILLDETACH) != 0);
	KASSERT(kn->kn_fop != NULL);
	KASSERT(mutex_owned(&fdp->fd_lock));

	/* Remove from monitored object. */
	if (dofop) {
		knote_foplock_enter(kn);
		filter_detach(kn);
		knote_foplock_exit(kn);
	}

	/* Remove from descriptor table. */
	if (kn->kn_fop->f_flags & FILTEROP_ISFD)
		list = (struct klist *)&fdp->fd_dt->dt_ff[kn->kn_id]->ff_knlist;
	else
		list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];

	SLIST_REMOVE(list, kn, knote, kn_link);

	/* Remove from kqueue. */
again:
	mutex_spin_enter(&kq->kq_lock);
	KASSERT(kn_in_flux(kn) == false);
	if ((kn->kn_status & KN_QUEUED) != 0) {
		kq_check(kq);
		KASSERT(KQ_COUNT(kq) != 0);
		kq->kq_count--;
		TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
		kn->kn_status &= ~KN_QUEUED;
		kq_check(kq);
	} else if (kn->kn_status & KN_BUSY) {
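		/* Another thread still has the knote busy; retry. */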
		mutex_spin_exit(&kq->kq_lock);
		goto again;
	}
	mutex_spin_exit(&kq->kq_lock);

	mutex_exit(&fdp->fd_lock);
	if (kn->kn_fop->f_flags & FILTEROP_ISFD)
		fd_putfile(kn->kn_id);
	atomic_dec_uint(&kn->kn_kfilter->refcnt);
	knote_free(kn);
}

/*
 * Queue new event for knote.
 */
static void
knote_enqueue(struct knote *kn)
{
	struct kqueue *kq;

	KASSERT((kn->kn_status & KN_MARKER) == 0);

	kq = kn->kn_kq;

	mutex_spin_enter(&kq->kq_lock);
	if (__predict_false(kn->kn_status & KN_WILLDETACH)) {
		/* Don't bother enqueueing a dying knote. */
		goto out;
	}
	if ((kn->kn_status & KN_DISABLED) != 0) {
		kn->kn_status &= ~KN_DISABLED;
	}
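	/* Enqueue only if the knote is active but not already queued. */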
	if ((kn->kn_status & (KN_ACTIVE | KN_QUEUED)) == KN_ACTIVE) {
		kq_check(kq);
		kn->kn_status |= KN_QUEUED;
		TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
		KASSERT(KQ_COUNT(kq) < KQ_MAXCOUNT);
		kq->kq_count++;
		kq_check(kq);
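		/* Wake anyone waiting for events on this kqueue. */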
		cv_broadcast(&kq->kq_cv);
		selnotify(&kq->kq_sel, 0, NOTE_SUBMIT);
	}
out:
	mutex_spin_exit(&kq->kq_lock);
}
|
2008-03-22 00:53:35 +03:00
|
|
|
/*
|
|
|
|
* Queue new event for knote.
|
|
|
|
*/
|
|
|
|
static void
|
2021-10-21 03:54:15 +03:00
|
|
|
knote_activate_locked(struct knote *kn)
|
2008-03-22 00:53:35 +03:00
|
|
|
{
|
|
|
|
struct kqueue *kq;
|
|
|
|
|
|
|
|
KASSERT((kn->kn_status & KN_MARKER) == 0);
|
|
|
|
|
|
|
|
kq = kn->kn_kq;
|
|
|
|
|
	if (__predict_false(kn->kn_status & KN_WILLDETACH)) {
		/* Don't bother enqueueing a dying knote. */
		return;
	}
	kn->kn_status |= KN_ACTIVE;
	if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) {
		kq_check(kq);
		kn->kn_status |= KN_QUEUED;
		TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
		KASSERT(KQ_COUNT(kq) < KQ_MAXCOUNT);
		kq->kq_count++;
		kq_check(kq);
		cv_broadcast(&kq->kq_cv);
		selnotify(&kq->kq_sel, 0, NOTE_SUBMIT);
	}
}

static void
knote_activate(struct knote *kn)
{
	struct kqueue *kq = kn->kn_kq;

	mutex_spin_enter(&kq->kq_lock);
	knote_activate_locked(kn);
	mutex_spin_exit(&kq->kq_lock);
}
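
/*
 * Note on the pairing above: the unlocked wrapper takes kq_lock and
 * delegates to the _locked variant, so callers that already hold the
 * spin mutex can activate a knote without recursing on the lock.  A
 * minimal sketch of the idiom (foo/foo_locked and struct foo are
 * hypothetical, not part of this file):
 *
 *	static void
 *	foo_locked(struct foo *f)
 *	{
 *		KASSERT(mutex_owned(&f->f_lock));
 *		f->f_count++;
 *	}
 *
 *	static void
 *	foo(struct foo *f)
 *	{
 *		mutex_enter(&f->f_lock);
 *		foo_locked(f);
 *		mutex_exit(&f->f_lock);
 *	}
 */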

static void
knote_deactivate_locked(struct knote *kn)
{
	struct kqueue *kq = kn->kn_kq;

	if (kn->kn_status & KN_QUEUED) {
		kq_check(kq);
		kn->kn_status &= ~KN_QUEUED;
		TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
		KASSERT(KQ_COUNT(kq) > 0);
		kq->kq_count--;
		kq_check(kq);
	}
	kn->kn_status &= ~KN_ACTIVE;
}

/*
 * Set EV_EOF on the specified knote.  Also allows additional
 * EV_* flags to be set (e.g. EV_ONESHOT).
 */
void
knote_set_eof(struct knote *kn, uint32_t flags)
{
	struct kqueue *kq = kn->kn_kq;

	mutex_spin_enter(&kq->kq_lock);
	kn->kn_flags |= EV_EOF | flags;
	mutex_spin_exit(&kq->kq_lock);
}

/*
 * Clear EV_EOF on the specified knote.
 */
void
knote_clear_eof(struct knote *kn)
{
	struct kqueue *kq = kn->kn_kq;

	mutex_spin_enter(&kq->kq_lock);
	kn->kn_flags &= ~EV_EOF;
	mutex_spin_exit(&kq->kq_lock);
}
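
/*
 * Illustrative sketch (hypothetical driver, not part of this file):
 * a backing object typically calls knote_set_eof() from its filter
 * when it hits a persistent end-of-file condition such as a hangup;
 * a matching re-open path might call knote_clear_eof() to re-arm the
 * knote.  The foo_softc fields below are assumptions:
 *
 *	static int
 *	filt_fooread(struct knote *kn, long hint)
 *	{
 *		struct foo_softc *sc = kn->kn_hook;
 *
 *		if (sc->sc_gone) {
 *			knote_set_eof(kn, EV_ONESHOT);
 *			return 1;
 *		}
 *		kn->kn_data = sc->sc_bytes_avail;
 *		return kn->kn_data > 0;
 *	}
 */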

/*
 * Initialize a klist.
 */
void
klist_init(struct klist *list)
{
	SLIST_INIT(list);
}

/*
 * Finalize a klist.
 */
void
klist_fini(struct klist *list)
{
	struct knote *kn;

	/*
	 * Neuter all existing knotes on the klist because the list is
	 * being destroyed.  Without this, knotes registered against a
	 * device that has detached would linger on their kqueues and,
	 * when their file descriptors were eventually closed, call
	 * back into driver code that no longer exists.  The caller
	 * has guaranteed that no additional knotes will be added to
	 * the list, that the backing object's locks are not held
	 * (otherwise there is a locking order issue with acquiring
	 * the knote foplock), and that we can traverse the list
	 * safely in this state.
	 */
	SLIST_FOREACH(kn, list, kn_selnext) {
		knote_foplock_enter(kn);
		KASSERT(kn->kn_fop != NULL);
		if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
			kn->kn_fop = &nop_fd_filtops;
		} else {
			kn->kn_fop = &nop_filtops;
		}
		knote_foplock_exit(kn);
	}
}
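
/*
 * Illustrative sketch (hypothetical driver, not part of this file):
 * klist_fini() is what makes it safe to free storage that lingering
 * knotes may still reference, e.g. in a detach path run before the
 * softc itself is freed.  foo_detach/foo_softc are assumptions:
 *
 *	static int
 *	foo_detach(device_t self, int flags)
 *	{
 *		struct foo_softc *sc = device_private(self);
 *
 *		klist_fini(&sc->sc_klist);
 *		mutex_destroy(&sc->sc_lock);
 *		return 0;
 *	}
 */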

/*
 * Insert a knote into a klist.
 */
void
klist_insert(struct klist *list, struct knote *kn)
{
	SLIST_INSERT_HEAD(list, kn, kn_selnext);
}

/*
 * Remove a knote from a klist.  Returns true if the last
 * knote was removed and the list is now empty.
 */
bool
klist_remove(struct klist *list, struct knote *kn)
{
	SLIST_REMOVE(list, kn, knote, kn_selnext);
	return SLIST_EMPTY(list);
}
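
/*
 * Illustrative sketch (hypothetical driver, not part of this file):
 * klist_insert() and klist_remove() are typically the bodies of a
 * filter's attach and detach hooks, run under the backing object's
 * own lock.  filt_foodetach/foo_softc are assumptions:
 *
 *	static void
 *	filt_foodetach(struct knote *kn)
 *	{
 *		struct foo_softc *sc = kn->kn_hook;
 *
 *		mutex_enter(&sc->sc_lock);
 *		klist_remove(&sc->sc_klist, kn);
 *		mutex_exit(&sc->sc_lock);
 *	}
 */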