396f9f4598
Reconmap used to have one pointer for every reconstruction unit. This does not scale well in the land of 1TB disks, where some 100MB+ of "status pointers" are required for typical configurations. Convert the reconstruction code to use a "sliding status window" which will scale nicely regardless of the number of stripes/reconstruction units in the RAID set. Convert the main reconstruction loop to rebuild the array in chunks rather than in one big lump. As part of these changes, introduce a function to kick any waiters on the head separation callback list, and use that in the main reconstruction event queue to wake up the waiters if things have stalled. (I believe this may fix a race condition that could occur at least at the very end of a disk during reconstruction under heavy IO load.) Thanks to Brian Buhrow for all his help, support, and patience in testing these changes.
227 lines
6.4 KiB
C
227 lines
6.4 KiB
C
/* $NetBSD: rf_revent.c,v 1.25 2008/05/19 19:49:55 oster Exp $ */
|
|
/*
|
|
* Copyright (c) 1995 Carnegie-Mellon University.
|
|
* All rights reserved.
|
|
*
|
|
* Author:
|
|
*
|
|
* Permission to use, copy, modify and distribute this software and
|
|
* its documentation is hereby granted, provided that both the copyright
|
|
* notice and this permission notice appear in all copies of the
|
|
* software, derivative works or modified versions, and any portions
|
|
* thereof, and that both notices appear in supporting documentation.
|
|
*
|
|
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
|
|
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
|
|
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
|
|
*
|
|
* Carnegie Mellon requests users of this software to return to
|
|
*
|
|
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
|
|
* School of Computer Science
|
|
* Carnegie Mellon University
|
|
* Pittsburgh PA 15213-3890
|
|
*
|
|
* any improvements or extensions that they make and grant Carnegie the
|
|
* rights to redistribute these changes.
|
|
*/
|
|
/*
|
|
* revent.c -- reconstruction event handling code
|
|
*/
|
|
|
|
#include <sys/cdefs.h>
|
|
__KERNEL_RCSID(0, "$NetBSD: rf_revent.c,v 1.25 2008/05/19 19:49:55 oster Exp $");
|
|
|
|
#include <sys/errno.h>
|
|
|
|
#include "rf_raid.h"
|
|
#include "rf_revent.h"
|
|
#include "rf_etimer.h"
|
|
#include "rf_general.h"
|
|
#include "rf_desc.h"
|
|
#include "rf_shutdown.h"
|
|
|
|
#define RF_MAX_FREE_REVENT 128
|
|
#define RF_MIN_FREE_REVENT 32
|
|
#define RF_EVENTQ_WAIT 5000
|
|
|
|
#include <sys/proc.h>
|
|
#include <sys/kernel.h>
|
|
|
|
static void rf_ShutdownReconEvent(void *);
|
|
|
|
static RF_ReconEvent_t *
|
|
GetReconEventDesc(RF_RowCol_t col, void *arg, RF_Revent_t type);
|
|
|
|
static void rf_ShutdownReconEvent(void *ignored)
|
|
{
|
|
pool_destroy(&rf_pools.revent);
|
|
}
|
|
|
|
int
|
|
rf_ConfigureReconEvent(RF_ShutdownList_t **listp)
|
|
{
|
|
|
|
rf_pool_init(&rf_pools.revent, sizeof(RF_ReconEvent_t),
|
|
"rf_revent_pl", RF_MIN_FREE_REVENT, RF_MAX_FREE_REVENT);
|
|
rf_ShutdownCreate(listp, rf_ShutdownReconEvent, NULL);
|
|
|
|
return (0);
|
|
}
|
|
|
|
/* returns the next reconstruction event, blocking the calling thread
|
|
* until one becomes available. will now return null if it is blocked
|
|
* or will return an event if it is not */
|
|
|
|
RF_ReconEvent_t *
|
|
rf_GetNextReconEvent(RF_RaidReconDesc_t *reconDesc)
|
|
{
|
|
RF_Raid_t *raidPtr = reconDesc->raidPtr;
|
|
RF_ReconCtrl_t *rctrl = raidPtr->reconControl;
|
|
RF_ReconEvent_t *event;
|
|
int stall_count;
|
|
|
|
RF_LOCK_MUTEX(rctrl->eq_mutex);
|
|
/* q null and count==0 must be equivalent conditions */
|
|
RF_ASSERT((rctrl->eventQueue == NULL) == (rctrl->eq_count == 0));
|
|
|
|
/* mpsleep timeout value: secs = timo_val/hz. 'ticks' here is
|
|
defined as cycle-counter ticks, not softclock ticks */
|
|
|
|
#define MAX_RECON_EXEC_USECS (100 * 1000) /* 100 ms */
|
|
#define RECON_DELAY_MS 25
|
|
#define RECON_TIMO ((RECON_DELAY_MS * hz) / 1000)
|
|
|
|
/* we are not pre-emptible in the kernel, but we don't want to run
|
|
* forever. If we run w/o blocking for more than MAX_RECON_EXEC_TICKS
|
|
* ticks of the cycle counter, delay for RECON_DELAY before
|
|
* continuing. this may murder us with context switches, so we may
|
|
* need to increase both the MAX...TICKS and the RECON_DELAY_MS. */
|
|
if (reconDesc->reconExecTimerRunning) {
|
|
int status;
|
|
|
|
RF_ETIMER_STOP(reconDesc->recon_exec_timer);
|
|
RF_ETIMER_EVAL(reconDesc->recon_exec_timer);
|
|
reconDesc->reconExecTicks +=
|
|
RF_ETIMER_VAL_US(reconDesc->recon_exec_timer);
|
|
if (reconDesc->reconExecTicks > reconDesc->maxReconExecTicks)
|
|
reconDesc->maxReconExecTicks =
|
|
reconDesc->reconExecTicks;
|
|
if (reconDesc->reconExecTicks >= MAX_RECON_EXEC_USECS) {
|
|
/* we've been running too long. delay for
|
|
* RECON_DELAY_MS */
|
|
#if RF_RECON_STATS > 0
|
|
reconDesc->numReconExecDelays++;
|
|
#endif /* RF_RECON_STATS > 0 */
|
|
|
|
status = ltsleep(&reconDesc->reconExecTicks, PRIBIO,
|
|
"recon delay", RECON_TIMO,
|
|
&rctrl->eq_mutex);
|
|
RF_ASSERT(status == EWOULDBLOCK);
|
|
reconDesc->reconExecTicks = 0;
|
|
}
|
|
}
|
|
|
|
stall_count = 0;
|
|
while (!rctrl->eventQueue) {
|
|
#if RF_RECON_STATS > 0
|
|
reconDesc->numReconEventWaits++;
|
|
#endif /* RF_RECON_STATS > 0 */
|
|
|
|
ltsleep(&(rctrl)->eventQueue, PRIBIO, "raidframe eventq",
|
|
RF_EVENTQ_WAIT, &((rctrl)->eq_mutex));
|
|
|
|
stall_count++;
|
|
|
|
if ((stall_count > 10) &&
|
|
rctrl->headSepCBList) {
|
|
/* There is work to do on the callback list, and
|
|
we've waited long enough... */
|
|
rf_WakeupHeadSepCBWaiters(raidPtr);
|
|
stall_count = 0;
|
|
}
|
|
reconDesc->reconExecTicks = 0; /* we've just waited */
|
|
}
|
|
|
|
reconDesc->reconExecTimerRunning = 1;
|
|
if (RF_ETIMER_VAL_US(reconDesc->recon_exec_timer)!=0) {
|
|
/* it moved!! reset the timer. */
|
|
RF_ETIMER_START(reconDesc->recon_exec_timer);
|
|
}
|
|
event = rctrl->eventQueue;
|
|
rctrl->eventQueue = event->next;
|
|
event->next = NULL;
|
|
rctrl->eq_count--;
|
|
|
|
/* q null and count==0 must be equivalent conditions */
|
|
RF_ASSERT((rctrl->eventQueue == NULL) == (rctrl->eq_count == 0));
|
|
RF_UNLOCK_MUTEX(rctrl->eq_mutex);
|
|
return (event);
|
|
}
|
|
/* enqueues a reconstruction event on the indicated queue */
|
|
void
|
|
rf_CauseReconEvent(RF_Raid_t *raidPtr, RF_RowCol_t col, void *arg,
|
|
RF_Revent_t type)
|
|
{
|
|
RF_ReconCtrl_t *rctrl = raidPtr->reconControl;
|
|
RF_ReconEvent_t *event = GetReconEventDesc(col, arg, type);
|
|
|
|
if (type == RF_REVENT_BUFCLEAR) {
|
|
RF_ASSERT(col != rctrl->fcol);
|
|
}
|
|
RF_ASSERT(col >= 0 && col <= raidPtr->numCol);
|
|
RF_LOCK_MUTEX(rctrl->eq_mutex);
|
|
/* q null and count==0 must be equivalent conditions */
|
|
RF_ASSERT((rctrl->eventQueue == NULL) == (rctrl->eq_count == 0));
|
|
event->next = rctrl->eventQueue;
|
|
rctrl->eventQueue = event;
|
|
rctrl->eq_count++;
|
|
RF_UNLOCK_MUTEX(rctrl->eq_mutex);
|
|
|
|
wakeup(&(rctrl)->eventQueue);
|
|
}
|
|
/* allocates and initializes a recon event descriptor */
|
|
static RF_ReconEvent_t *
|
|
GetReconEventDesc(RF_RowCol_t col, void *arg, RF_Revent_t type)
|
|
{
|
|
RF_ReconEvent_t *t;
|
|
|
|
t = pool_get(&rf_pools.revent, PR_WAITOK);
|
|
t->col = col;
|
|
t->arg = arg;
|
|
t->type = type;
|
|
t->next = NULL;
|
|
return (t);
|
|
}
|
|
|
|
/*
|
|
rf_DrainReconEventQueue() -- used in the event of a reconstruction
|
|
problem, this function simply drains all pending events from the
|
|
reconstruct event queue.
|
|
*/
|
|
|
|
void
|
|
rf_DrainReconEventQueue(RF_RaidReconDesc_t *reconDesc)
|
|
{
|
|
RF_ReconCtrl_t *rctrl = reconDesc->raidPtr->reconControl;
|
|
RF_ReconEvent_t *event;
|
|
|
|
RF_LOCK_MUTEX(rctrl->eq_mutex);
|
|
while (rctrl->eventQueue!=NULL) {
|
|
|
|
event = rctrl->eventQueue;
|
|
rctrl->eventQueue = event->next;
|
|
event->next = NULL;
|
|
rctrl->eq_count--;
|
|
/* dump it */
|
|
rf_FreeReconEventDesc(event);
|
|
}
|
|
RF_UNLOCK_MUTEX(rctrl->eq_mutex);
|
|
}
|
|
|
|
void
|
|
rf_FreeReconEventDesc(RF_ReconEvent_t *event)
|
|
{
|
|
pool_put(&rf_pools.revent, event);
|
|
}
|