Deal with a couple of problems with threads being awoken early due to

timeouts or cancellation where:

- The restarting thread calls _lwp_exit() before another thread gets around
  to waking it with _lwp_unpark(), leading to ESRCH (observed by joerg@).
  (I may have removed a similar check mistakenly over the weekend.)

- The restarting thread considers itself gone off the sleep queue but
  at the same time another thread is part way through waking it, and hasn't
  fully completed that operation yet by setting thread->pt_mutexwait = 0.
  I think that could have potentially lead to the list of waiters getting
  messed up given the right circumstances.
This commit is contained in:
ad 2020-06-03 22:10:24 +00:00
parent 5754e74670
commit 051faad4aa
3 changed files with 60 additions and 24 deletions

View File

@ -1,4 +1,4 @@
/* $NetBSD: pthread.c,v 1.172 2020/06/02 00:29:53 joerg Exp $ */
/* $NetBSD: pthread.c,v 1.173 2020/06/03 22:10:24 ad Exp $ */
/*-
* Copyright (c) 2001, 2002, 2003, 2006, 2007, 2008, 2020
@ -31,7 +31,7 @@
*/
#include <sys/cdefs.h>
__RCSID("$NetBSD: pthread.c,v 1.172 2020/06/02 00:29:53 joerg Exp $");
__RCSID("$NetBSD: pthread.c,v 1.173 2020/06/03 22:10:24 ad Exp $");
#define __EXPOSE_STACK 1
@ -599,9 +599,12 @@ pthread_resume_np(pthread_t thread)
}
/*
* In case the thread is exiting at an inopportune time leaving waiters not
* awoken (because cancelled, for instance) make sure we have no waiters
* left.
* Wake all deferred waiters hanging off self.
*
* It's possible for threads to have called _lwp_exit() before we wake them,
* because of cancellation and timeout, so ESRCH is tolerated here. If a
* thread exits and its LID is reused, and the a thread receives an wakeup
* meant for the previous incarnation of the LID, no harm will be done.
*/
void
pthread__clear_waiters(pthread_t self)
@ -620,7 +623,7 @@ pthread__clear_waiters(pthread_t self)
rv = _lwp_unpark(self->pt_waiters[0], NULL);
self->pt_waiters[0] = 0;
self->pt_nwaiters = 0;
if (rv != 0) {
if (rv != 0 && errno != ESRCH) {
pthread__errorfunc(__FILE__, __LINE__, __func__,
"_lwp_unpark failed: %d", errno);
}
@ -629,7 +632,7 @@ pthread__clear_waiters(pthread_t self)
rv = _lwp_unpark_all(self->pt_waiters, self->pt_nwaiters, NULL);
self->pt_waiters[0] = 0;
self->pt_nwaiters = 0;
if (rv != 0) {
if (rv != 0 && errno != ESRCH) {
pthread__errorfunc(__FILE__, __LINE__, __func__,
"_lwp_unpark_all failed: %d", errno);
}
@ -1195,6 +1198,7 @@ pthread__park(pthread_t self, pthread_mutex_t *lock,
switch (rv = errno) {
case EINTR:
case EALREADY:
case ESRCH:
rv = 0;
break;
case ETIMEDOUT:

View File

@ -1,4 +1,4 @@
/* $NetBSD: pthread_cond.c,v 1.70 2020/06/01 11:44:59 ad Exp $ */
/* $NetBSD: pthread_cond.c,v 1.71 2020/06/03 22:10:24 ad Exp $ */
/*-
* Copyright (c) 2001, 2006, 2007, 2008, 2020 The NetBSD Foundation, Inc.
@ -30,7 +30,7 @@
*/
#include <sys/cdefs.h>
__RCSID("$NetBSD: pthread_cond.c,v 1.70 2020/06/01 11:44:59 ad Exp $");
__RCSID("$NetBSD: pthread_cond.c,v 1.71 2020/06/03 22:10:24 ad Exp $");
#include <stdlib.h>
#include <errno.h>
@ -112,7 +112,7 @@ pthread_cond_timedwait(pthread_cond_t *cond, pthread_mutex_t *mutex,
const struct timespec *abstime)
{
pthread_t self, next, waiters;
int retval;
int retval, cancel;
clockid_t clkid = pthread_cond_getclock(cond);
if (__predict_false(__uselibcstub))
@ -126,6 +126,7 @@ pthread_cond_timedwait(pthread_cond_t *cond, pthread_mutex_t *mutex,
mutex->ptm_owner != NULL);
self = pthread__self();
pthread__assert(!self->pt_condwait);
if (__predict_false(self->pt_cancel)) {
pthread__cancelled();
@ -165,24 +166,42 @@ pthread_cond_timedwait(pthread_cond_t *cond, pthread_mutex_t *mutex,
retval = errno;
}
}
} while (self->pt_condwait && !self->pt_cancel && !retval);
cancel = self->pt_cancel;
} while (self->pt_condwait && !cancel && !retval);
/*
* If we have cancelled then exit. POSIX dictates that
* the mutex must be held when we action the cancellation.
* If this thread absorbed a wakeup from pthread_cond_signal() and
* cannot take the wakeup, we must ensure that another thread does.
*
* If we absorbed a pthread_cond_signal() and cannot take
* the wakeup, we must ensure that another thread does.
* And if awoken early, we may still be on the waiter list and must
* remove self.
*
* If awoke early, we may still be on the waiter list and
* must remove ourself.
* In all cases do the wakeup without the mutex held otherwise:
*
* - wakeup could be deferred until mutex release
* - it would be mixing up two sets of waitpoints
*/
if (__predict_false(self->pt_condwait | cancel | retval)) {
pthread_cond_broadcast(cond);
/*
* Might have raced with another thread to do the wakeup.
* In any case there will be a wakeup for sure. Eat it and
* wait for pt_condwait to clear.
*/
do {
(void)_lwp_park(CLOCK_REALTIME, TIMER_ABSTIME, NULL,
0, NULL, NULL);
} while (self->pt_condwait);
}
/*
* If cancelled then exit. POSIX dictates that the mutex must be
* held if this happens.
*/
pthread_mutex_lock(mutex);
if (__predict_false(self->pt_condwait | self->pt_cancel | retval)) {
pthread_cond_broadcast(cond);
if (self->pt_cancel) {
pthread__cancelled();
}
if (cancel) {
pthread__cancelled();
}
return retval;

View File

@ -1,4 +1,4 @@
/* $NetBSD: pthread_mutex.c,v 1.78 2020/06/01 11:44:59 ad Exp $ */
/* $NetBSD: pthread_mutex.c,v 1.79 2020/06/03 22:10:24 ad Exp $ */
/*-
* Copyright (c) 2001, 2003, 2006, 2007, 2008, 2020 The NetBSD Foundation, Inc.
@ -47,7 +47,7 @@
*/
#include <sys/cdefs.h>
__RCSID("$NetBSD: pthread_mutex.c,v 1.78 2020/06/01 11:44:59 ad Exp $");
__RCSID("$NetBSD: pthread_mutex.c,v 1.79 2020/06/03 22:10:24 ad Exp $");
#include <sys/types.h>
#include <sys/lwpctl.h>
@ -278,6 +278,7 @@ pthread__mutex_lock_slow(pthread_mutex_t *ptm, const struct timespec *ts)
serrno = errno;
pthread__assert(!self->pt_willpark);
pthread__assert(!self->pt_mutexwait);
/* Recursive or errorcheck? */
if (MUTEX_OWNER(owner) == (uintptr_t)self) {
@ -369,6 +370,18 @@ pthread__mutex_lock_slow(pthread_mutex_t *ptm, const struct timespec *ts)
if (error < 0 && errno == ETIMEDOUT) {
/* Remove self from waiters list */
pthread__mutex_wakeup(self, ptm);
/*
* Might have raced with another thread to
* do the wakeup. In any case there will be
* a wakeup for sure. Eat it and wait for
* pt_mutexwait to clear.
*/
do {
(void)_lwp_park(CLOCK_REALTIME,
TIMER_ABSTIME, NULL, 0, NULL, NULL);
} while (self->pt_mutexwait);
/* Priority protect */
if (MUTEX_PROTECT(owner))
(void)_sched_protect(-1);