On an idea from Thor (tls@), do not fail a component if doing so would
render the RAID set completely dead.  Instead, we retry the IO a
maximum of RF_RETRY_THRESHOLD times (currently '5'), and then just
return an IO error if the IO still fails.  This should reduce the damage
caused by having multiple disks appear to fail when the culprit is
really something else (power, controllers, etc.).
This commit is contained in:
oster 2004-11-16 16:45:51 +00:00
parent 297b6b1873
commit d7e754c41d
5 changed files with 35 additions and 14 deletions

View File

@ -1,4 +1,4 @@
/* $NetBSD: rf_desc.h,v 1.14 2004/06/02 22:58:28 drochner Exp $ */
/* $NetBSD: rf_desc.h,v 1.15 2004/11/16 16:45:51 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
@ -78,6 +78,7 @@ struct RF_RaidAccessDesc_s {
* RAID operation has gotten */
const RF_AccessState_t *states; /* array of states to be run */
int status; /* pass/fail status of the last operation */
int numRetries; /* number of times this IO has been attempted */
RF_DagList_t *dagList; /* list of dag lists, one list per stripe */
RF_VoidPointerListElem_t *iobufs; /* iobufs that need to be cleaned
up at the end of this IO */

View File

@ -1,4 +1,4 @@
/* $NetBSD: rf_driver.c,v 1.104 2004/06/29 17:09:01 oster Exp $ */
/* $NetBSD: rf_driver.c,v 1.105 2004/11/16 16:45:51 oster Exp $ */
/*-
* Copyright (c) 1999 The NetBSD Foundation, Inc.
* All rights reserved.
@ -73,7 +73,7 @@
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_driver.c,v 1.104 2004/06/29 17:09:01 oster Exp $");
__KERNEL_RCSID(0, "$NetBSD: rf_driver.c,v 1.105 2004/11/16 16:45:51 oster Exp $");
#include "opt_raid_diagnostic.h"
@ -573,6 +573,7 @@ rf_AllocRaidAccDesc(RF_Raid_t *raidPtr, RF_IoType_t type,
desc->dagList = NULL;
desc->status = 0;
desc->numRetries = 0;
#if RF_ACC_TRACE > 0
memset((char *) &desc->tracerec, 0, sizeof(RF_AccTraceEntry_t));
#endif

View File

@ -1,4 +1,4 @@
/* $NetBSD: rf_driver.h,v 1.12 2004/06/02 22:58:30 drochner Exp $ */
/* $NetBSD: rf_driver.h,v 1.13 2004/11/16 16:45:52 oster Exp $ */
/*
* rf_driver.h
*/
@ -37,6 +37,10 @@
#include "rf_threadstuff.h"
#include "rf_netbsd.h"
#ifndef RF_RETRY_THRESHOLD
#define RF_RETRY_THRESHOLD 5
#endif
RF_DECLARE_EXTERN_MUTEX(rf_printf_mutex)
int rf_BootRaidframe(void);
int rf_UnbootRaidframe(void);

View File

@ -1,4 +1,4 @@
/* $NetBSD: rf_netbsdkintf.c,v 1.182 2004/10/28 07:07:44 yamt Exp $ */
/* $NetBSD: rf_netbsdkintf.c,v 1.183 2004/11/16 16:45:51 oster Exp $ */
/*-
* Copyright (c) 1996, 1997, 1998 The NetBSD Foundation, Inc.
* All rights reserved.
@ -146,7 +146,7 @@
***********************************************************/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.182 2004/10/28 07:07:44 yamt Exp $");
__KERNEL_RCSID(0, "$NetBSD: rf_netbsdkintf.c,v 1.183 2004/11/16 16:45:51 oster Exp $");
#include <sys/param.h>
#include <sys/errno.h>
@ -1947,8 +1947,11 @@ KernelWakeupFunc(struct buf *vbp)
if (bp->b_flags & B_ERROR) {
/* Mark the disk as dead */
/* but only mark it once... */
if (queue->raidPtr->Disks[queue->col].status ==
rf_ds_optimal) {
/* and only if it wouldn't leave this RAID set
completely broken */
if ((queue->raidPtr->Disks[queue->col].status ==
rf_ds_optimal) && (queue->raidPtr->numFailures <
queue->raidPtr->Layout.map->faultsTolerated)) {
printf("raid%d: IO Error. Marking %s as failed.\n",
queue->raidPtr->raidid,
queue->raidPtr->Disks[queue->col].devname);

View File

@ -1,4 +1,4 @@
/* $NetBSD: rf_states.c,v 1.35 2004/03/23 13:09:18 oster Exp $ */
/* $NetBSD: rf_states.c,v 1.36 2004/11/16 16:45:52 oster Exp $ */
/*
* Copyright (c) 1995 Carnegie-Mellon University.
* All rights reserved.
@ -27,7 +27,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rf_states.c,v 1.35 2004/03/23 13:09:18 oster Exp $");
__KERNEL_RCSID(0, "$NetBSD: rf_states.c,v 1.36 2004/11/16 16:45:52 oster Exp $");
#include <sys/errno.h>
@ -512,13 +512,18 @@ rf_State_CreateDAG(RF_RaidAccessDesc_t *desc)
desc->status = 0; /* good status */
if (selectStatus) {
if (selectStatus || (desc->numRetries > RF_RETRY_THRESHOLD)) {
/* failed to create a dag */
/* this happens when there are too many faults or incomplete
* dag libraries */
printf("raid%d: failed to create a dag. "
"Too many component failures.\n",
desc->raidPtr->raidid);
if (selectStatus) {
printf("raid%d: failed to create a dag. "
"Too many component failures.\n",
desc->raidPtr->raidid);
} else {
printf("raid%d: IO failed after %d retries.\n",
desc->raidPtr->raidid, RF_RETRY_THRESHOLD);
}
desc->status = 1; /* bad status */
/* skip straight to rf_State_Cleanup() */
@ -624,6 +629,13 @@ rf_State_ProcessDAG(RF_RaidAccessDesc_t *desc)
rf_FreeDAGList(temp);
}
rf_MarkFailuresInASMList(raidPtr, asmh);
/* note the retry so that we'll bail in
rf_State_CreateDAG() once we've retried
the IO RF_RETRY_THRESHOLD times */
desc->numRetries++;
/* back up to rf_State_CreateDAG */
desc->state = desc->state - 2;
return RF_FALSE;