38a3987b69
Carnegie Mellon University. Full RAID implementation, including levels 0, 1, 4, 5, 6, parity logging, and a few other goodies. Ported to NetBSD by Greg Oster.
881 lines
28 KiB
C
/* $NetBSD: rf_raid1.c,v 1.1 1998/11/13 04:20:33 oster Exp $ */
/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: William V. Courtright II
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*****************************************************************************
 *
 * rf_raid1.c -- implements RAID Level 1
 *
 *****************************************************************************/
/* :
 * Log: rf_raid1.c,v
 * Revision 1.46 1996/11/05 21:10:40 jimz
 * failed pda generalization
 *
 * Revision 1.45 1996/07/31 16:56:18 jimz
 * dataBytesPerStripe, sectorsPerDisk init arch-indep.
 *
 * Revision 1.44 1996/07/30 03:06:43 jimz
 * get rid of extra rf_threadid.h include
 *
 * Revision 1.43 1996/07/27 23:36:08 jimz
 * Solaris port of simulator
 *
 * Revision 1.42 1996/07/22 19:52:16 jimz
 * switched node params to RF_DagParam_t, a union of
 * a 64-bit int and a void *, for better portability
 * attempted hpux port, but failed partway through for
 * lack of a single C compiler capable of compiling all
 * source files
 *
 * Revision 1.41 1996/07/18 22:57:14 jimz
 * port simulator to AIX
 *
 * Revision 1.40 1996/07/17 14:31:19 jimz
 * minor cleanup for readability
 *
 * Revision 1.39 1996/07/15 17:22:18 jimz
 * nit-pick code cleanup
 * resolve stdlib problems on DEC OSF
 *
 * Revision 1.38 1996/07/15 02:56:31 jimz
 * fixed dag selection to deal with failed + recon to spare disks
 * enhanced recon, parity check debugging
 *
 * Revision 1.37 1996/07/13 00:00:59 jimz
 * sanitized generalized reconstruction architecture
 * cleaned up head sep, rbuf problems
 *
 * Revision 1.36 1996/07/11 19:08:00 jimz
 * generalize reconstruction mechanism
 * allow raid1 reconstructs via copyback (done with array
 * quiesced, not online, therefore not disk-directed)
 *
 * Revision 1.35 1996/07/10 23:01:24 jimz
 * Better commenting of VerifyParity (for posterity)
 *
 * Revision 1.34 1996/07/10 22:29:45 jimz
 * VerifyParityRAID1: corrected return values for stripes in degraded mode
 *
 * Revision 1.33 1996/07/10 16:05:39 jimz
 * fixed a couple minor bugs in VerifyParityRAID1
 * added code to correct bad RAID1 parity
 *
 * Revision 1.32 1996/06/20 18:47:04 jimz
 * fix up verification bugs
 *
 * Revision 1.31 1996/06/20 15:38:59 jimz
 * added parity verification
 * can't correct bad parity yet, but can return pass/fail
 *
 * Revision 1.30 1996/06/19 22:23:01 jimz
 * parity verification is now a layout-configurable thing
 * not all layouts currently support it (correctly, anyway)
 *
 * Revision 1.29 1996/06/11 08:54:27 jimz
 * improved error-checking at configuration time
 *
 * Revision 1.28 1996/06/10 18:25:24 wvcii
 * fixed bug in rf_IdentifyStripeRAID1 - added array initialization
 *
 * Revision 1.27 1996/06/10 11:55:47 jimz
 * Straightened out some per-array/not-per-array distinctions, fixed
 * a couple bugs related to confusion. Added shutdown lists. Removed
 * layout shutdown function (now subsumed by shutdown lists).
 *
 * Revision 1.26 1996/06/07 22:26:27 jimz
 * type-ify which_ru (RF_ReconUnitNum_t)
 *
 * Revision 1.25 1996/06/07 21:33:04 jimz
 * begin using consistent types for sector numbers,
 * stripe numbers, row+col numbers, recon unit numbers
 *
 * Revision 1.24 1996/06/06 17:29:43 jimz
 * use CreateMirrorIdleReadDAG for mirrored read
 *
 * Revision 1.23 1996/06/03 23:28:26 jimz
 * more bugfixes
 * check in tree to sync for IPDS runs with current bugfixes
 * there still may be a problem with threads in the script test
 * getting I/Os stuck- not trivially reproducible (runs ~50 times
 * in a row without getting stuck)
 *
 * Revision 1.22 1996/06/02 17:31:48 jimz
 * Moved a lot of global stuff into array structure, where it belongs.
 * Fixed up paritylogging, pss modules in this manner. Some general
 * code cleanup. Removed lots of dead code, some dead files.
 *
 * Revision 1.21 1996/05/31 22:26:54 jimz
 * fix a lot of mapping problems, memory allocation problems
 * found some weird lock issues, fixed 'em
 * more code cleanup
 *
 * Revision 1.20 1996/05/30 23:22:16 jimz
 * bugfixes of serialization, timing problems
 * more cleanup
 *
 * Revision 1.19 1996/05/30 11:29:41 jimz
 * Numerous bug fixes. Stripe lock release code disagreed with the taking code
 * about when stripes should be locked (I made it consistent: no parity, no lock)
 * There was a lot of extra serialization of I/Os which I've removed- a lot of
 * it was to calculate values for the cache code, which is no longer with us.
 * More types, function, macro cleanup. Added code to properly quiesce the array
 * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
 * before. Fixed memory allocation, freeing bugs.
 *
 * Revision 1.18 1996/05/27 18:56:37 jimz
 * more code cleanup
 * better typing
 * compiles in all 3 environments
 *
 * Revision 1.17 1996/05/24 22:17:04 jimz
 * continue code + namespace cleanup
 * typed a bunch of flags
 *
 * Revision 1.16 1996/05/24 04:28:55 jimz
 * release cleanup ckpt
 *
 * Revision 1.15 1996/05/24 01:59:45 jimz
 * another checkpoint in code cleanup for release
 * time to sync kernel tree
 *
 * Revision 1.14 1996/05/18 19:51:34 jimz
 * major code cleanup- fix syntax, make some types consistent,
 * add prototypes, clean out dead code, et cetera
 *
 * Revision 1.13 1996/05/03 19:36:22 wvcii
 * moved dag creation routines to dag library
 *
 * Revision 1.12 1996/02/23 01:38:16 amiri
 * removed chained declustering special case in SelectIdleDisk
 *
 * Revision 1.11 1996/02/22 16:47:18 amiri
 * disabled shortest queue optimization for chained declustering
 *
 * Revision 1.10 1995/12/12 18:10:06 jimz
 * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
 * fix 80-column brain damage in comments
 *
 * Revision 1.9 1995/12/04 19:21:28 wvcii
 * modified SelectIdleDisk to take a mirror node as a parameter and
 * conditionally swap params 0 (data pda) and 4 (mirror pda).
 * modified CreateRaidOneReadDAG so that it creates the DAG itself
 * as opposed to reusing code in CreateNonredundantDAG.
 *
 * Revision 1.8 1995/11/30 16:07:45 wvcii
 * added copyright info
 *
 * Revision 1.7 1995/11/16 14:46:18 wvcii
 * fixed bugs in mapping and degraded dag creation, added comments
 *
 * Revision 1.6 1995/11/14 22:29:16 wvcii
 * fixed bugs in dag creation
 *
 * Revision 1.5 1995/11/07 15:23:33 wvcii
 * changed RAID1DagSelect prototype
 * function no longer generates numHdrSucc, numTermAnt
 * changed dag creation routines:
 *   term node generated during dag creation
 *   encoded commit nodes, barrier, antecedent types
 *
 * Revision 1.4 1995/10/10 19:09:21 wvcii
 * write dag now handles non-aligned accesses
 *
 * Revision 1.3 1995/10/05 02:32:56 jimz
 * ifdef'd out queue locking for load balancing
 *
 * Revision 1.2 1995/10/04 07:04:40 wvcii
 * reads are now scheduled according to disk queue length.
 * queue length is the sum of number of ios queued in raidframe as well as those at the disk.
 * reads are sent to the disk with the shortest queue.
 * testing against user disks successful, sim & kernel untested.
 *
 * Revision 1.1 1995/10/04 03:53:23 wvcii
 * Initial revision
 *
 *
 */
#include "rf_raid.h"
|
|
#include "rf_raid1.h"
|
|
#include "rf_dag.h"
|
|
#include "rf_dagffrd.h"
|
|
#include "rf_dagffwr.h"
|
|
#include "rf_dagdegrd.h"
|
|
#include "rf_dagutils.h"
|
|
#include "rf_dagfuncs.h"
|
|
#include "rf_threadid.h"
|
|
#include "rf_diskqueue.h"
|
|
#include "rf_general.h"
|
|
#include "rf_utils.h"
|
|
#include "rf_parityscan.h"
|
|
#include "rf_mcpair.h"
|
|
#include "rf_layout.h"
|
|
#include "rf_map.h"
|
|
#include "rf_engine.h"
|
|
#include "rf_reconbuffer.h"
|
|
#include "rf_sys.h"
|
|
|
|
typedef struct RF_Raid1ConfigInfo_s {
  RF_RowCol_t **stripeIdentifier;
} RF_Raid1ConfigInfo_t;

/* start of day code specific to RAID level 1 */
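/*
 * rf_ConfigureRAID1 builds the layout-specific state for a RAID level 1
 * array: the stripeIdentifier table of mirror pairs (columns 2i and 2i+1)
 * used by rf_IdentifyStripeRAID1, plus the array geometry derived from
 * the number of pairs.
 */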
int rf_ConfigureRAID1(
  RF_ShutdownList_t **listp,
  RF_Raid_t *raidPtr,
  RF_Config_t *cfgPtr)
{
  RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
  RF_Raid1ConfigInfo_t *info;
  RF_RowCol_t i;

  /* create a RAID level 1 configuration structure */
  RF_MallocAndAdd(info, sizeof(RF_Raid1ConfigInfo_t), (RF_Raid1ConfigInfo_t *), raidPtr->cleanupList);
  if (info == NULL)
    return(ENOMEM);
  layoutPtr->layoutSpecificInfo = (void *) info;

  /* ... and fill it in. */
  info->stripeIdentifier = rf_make_2d_array(raidPtr->numCol / 2, 2, raidPtr->cleanupList);
  if (info->stripeIdentifier == NULL)
    return(ENOMEM);
  for (i = 0; i < (raidPtr->numCol / 2); i++) {
    info->stripeIdentifier[i][0] = (2 * i);
    info->stripeIdentifier[i][1] = (2 * i) + 1;
  }

  RF_ASSERT(raidPtr->numRow == 1);

  /* this implementation of RAID level 1 uses one row of numCol disks and allows multiple (numCol / 2)
   * stripes per row. A stripe consists of a single data unit and a single parity (mirror) unit.
   * stripe id = raidAddr / stripeUnitSize
   */
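  /*
   * Example: with numCol == 4 there are two mirror pairs, columns (0,1)
   * and (2,3). Stripe unit 0 lives on pair 0, unit 1 on pair 1, unit 2
   * back on pair 0, and so on, which is where the (numCol / 2) factors
   * below come from.
   */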
  raidPtr->totalSectors = layoutPtr->stripeUnitsPerDisk * (raidPtr->numCol / 2) * layoutPtr->sectorsPerStripeUnit;
  layoutPtr->numStripe = layoutPtr->stripeUnitsPerDisk * (raidPtr->numCol / 2);
  layoutPtr->dataSectorsPerStripe = layoutPtr->sectorsPerStripeUnit;
  layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit << raidPtr->logBytesPerSector;
  layoutPtr->numDataCol = 1;
  layoutPtr->numParityCol = 1;
  return(0);
}


/* returns the physical disk location of the primary copy in the mirror pair */
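/*
 * Stripe units are assigned round-robin to the mirror pairs: unit SUID
 * lives on pair (SUID % (numCol / 2)), at an offset of
 * (SUID / (numCol / 2)) stripe units into each disk of that pair.
 * The primary (data) copy sits on the even column of the pair.
 */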
void rf_MapSectorRAID1(
  RF_Raid_t *raidPtr,
  RF_RaidAddr_t raidSector,
  RF_RowCol_t *row,
  RF_RowCol_t *col,
  RF_SectorNum_t *diskSector,
  int remap)
{
  RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit;
  RF_RowCol_t mirrorPair = SUID % (raidPtr->numCol / 2);

  *row = 0;
  *col = 2 * mirrorPair;
  *diskSector = ((SUID / (raidPtr->numCol / 2)) * raidPtr->Layout.sectorsPerStripeUnit) + (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
}


/* Map Parity
 *
 * returns the physical disk location of the secondary copy in the mirror
 * pair
 */
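/* Same mapping as rf_MapSectorRAID1, except the mirror copy sits on the odd
 * column of the pair. */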
void rf_MapParityRAID1(
  RF_Raid_t *raidPtr,
  RF_RaidAddr_t raidSector,
  RF_RowCol_t *row,
  RF_RowCol_t *col,
  RF_SectorNum_t *diskSector,
  int remap)
{
  RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit;
  RF_RowCol_t mirrorPair = SUID % (raidPtr->numCol / 2);

  *row = 0;
  *col = (2 * mirrorPair) + 1;

  *diskSector = ((SUID / (raidPtr->numCol / 2)) * raidPtr->Layout.sectorsPerStripeUnit) + (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
}


/* IdentifyStripeRAID1
 *
 * returns a list of disks for a given redundancy group
 */
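/* The two-entry column list comes from the stripeIdentifier table built in
 * rf_ConfigureRAID1: entry k holds columns 2k and 2k+1. */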
void rf_IdentifyStripeRAID1(
  RF_Raid_t *raidPtr,
  RF_RaidAddr_t addr,
  RF_RowCol_t **diskids,
  RF_RowCol_t *outRow)
{
  RF_StripeNum_t stripeID = rf_RaidAddressToStripeID(&raidPtr->Layout, addr);
  RF_Raid1ConfigInfo_t *info = raidPtr->Layout.layoutSpecificInfo;
  RF_ASSERT(stripeID >= 0);
  RF_ASSERT(addr >= 0);
  *outRow = 0;
  *diskids = info->stripeIdentifier[stripeID % (raidPtr->numCol / 2)];
  RF_ASSERT(*diskids);
}


/* MapSIDToPSIDRAID1
 *
 * maps a logical stripe to a stripe in the redundant array
 */
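/* With one data unit and one mirror unit per stripe, every stripe is its own
 * parity stripe and holds a single reconstruction unit, so this is the
 * identity mapping. */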
void rf_MapSIDToPSIDRAID1(
  RF_RaidLayout_t *layoutPtr,
  RF_StripeNum_t stripeID,
  RF_StripeNum_t *psID,
  RF_ReconUnitNum_t *which_ru)
{
  *which_ru = 0;
  *psID = stripeID;
}


/******************************************************************************
 * select a graph to perform a single-stripe access
 *
 * Parameters:  raidPtr    - description of the physical array
 *              type       - type of operation (read or write) requested
 *              asmap      - logical & physical addresses for this access
 *              createFunc - name of function to use to create the graph
 *****************************************************************************/
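/* Fault-free reads use rf_CreateMirrorIdleReadDAG (which, per the revision
 * history above, sends the read to the copy with the shorter disk queue);
 * reads with a failed data copy use rf_CreateRaidOneDegradedReadDAG; all
 * writes use rf_CreateRaidOneWriteDAG. */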
void rf_RAID1DagSelect(
  RF_Raid_t *raidPtr,
  RF_IoType_t type,
  RF_AccessStripeMap_t *asmap,
  RF_VoidFuncPtr *createFunc)
{
  RF_RowCol_t frow, fcol, or, oc;
  RF_PhysDiskAddr_t *failedPDA;
  int prior_recon, tid;
  RF_RowStatus_t rstat;
  RF_SectorNum_t oo;

  RF_ASSERT(RF_IO_IS_R_OR_W(type));

  if (asmap->numDataFailed + asmap->numParityFailed > 1) {
    RF_ERRORMSG("Multiple disks failed in a single group! Aborting I/O operation.\n");
    *createFunc = NULL;
    return;
  }

  if (asmap->numDataFailed + asmap->numParityFailed) {
    /*
     * We've got a fault. Re-map to spare space, iff applicable.
     * Shouldn't the arch-independent code do this for us?
     * Anyway, it turns out if we don't do this here, then when
     * we're reconstructing, writes go only to the surviving
     * original disk, and aren't reflected on the reconstructed
     * spare. Oops. --jimz
     */
    failedPDA = asmap->failedPDAs[0];
    frow = failedPDA->row;
    fcol = failedPDA->col;
    rstat = raidPtr->status[frow];
    prior_recon = (rstat == rf_rs_reconfigured) || (
      (rstat == rf_rs_reconstructing) ?
      rf_CheckRUReconstructed(raidPtr->reconControl[frow]->reconMap, failedPDA->startSector) : 0
      );
    if (prior_recon) {
      or = frow;
      oc = fcol;
      oo = failedPDA->startSector;
      /*
       * If we did distributed sparing, we'd monkey with that here.
       * But we don't, so we'll just redirect to the dedicated spare.
       */
      failedPDA->row = raidPtr->Disks[frow][fcol].spareRow;
      failedPDA->col = raidPtr->Disks[frow][fcol].spareCol;
      /*
       * Redirect other components, iff necessary. This looks
       * pretty suspicious to me, but it's what the raid5
       * DAG select does.
       */
      if (asmap->parityInfo->next) {
        if (failedPDA == asmap->parityInfo) {
          failedPDA->next->row = failedPDA->row;
          failedPDA->next->col = failedPDA->col;
        }
        else {
          if (failedPDA == asmap->parityInfo->next) {
            asmap->parityInfo->row = failedPDA->row;
            asmap->parityInfo->col = failedPDA->col;
          }
        }
      }
      if (rf_dagDebug || rf_mapDebug) {
        rf_get_threadid(tid);
        printf("[%d] Redirected type '%c' r %d c %d o %ld -> r %d c %d o %ld\n",
          tid, type, or, oc, (long)oo, failedPDA->row, failedPDA->col,
          (long)failedPDA->startSector);
      }
      asmap->numDataFailed = asmap->numParityFailed = 0;
    }
  }
  if (type == RF_IO_TYPE_READ) {
    if (asmap->numDataFailed == 0)
      *createFunc = (RF_VoidFuncPtr)rf_CreateMirrorIdleReadDAG;
    else
      *createFunc = (RF_VoidFuncPtr)rf_CreateRaidOneDegradedReadDAG;
  }
  else {
    *createFunc = (RF_VoidFuncPtr)rf_CreateRaidOneWriteDAG;
  }
}
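
/* VerifyParityRAID1
 *
 * Read both halves of every mirror pair covered by parityPDA and compare
 * them. If correct_it is set and a pair miscompares, the data copy is
 * written over the mirror copy. Returns one of the RF_PARITY_* codes.
 */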
int rf_VerifyParityRAID1(
  RF_Raid_t *raidPtr,
  RF_RaidAddr_t raidAddr,
  RF_PhysDiskAddr_t *parityPDA,
  int correct_it,
  RF_RaidAccessFlags_t flags)
{
  int nbytes, bcount, stripeWidth, ret, i, j, tid = 0, nbad, *bbufs;
  RF_DagNode_t *blockNode, *unblockNode, *wrBlock;
  RF_DagHeader_t *rd_dag_h, *wr_dag_h;
  RF_AccessStripeMapHeader_t *asm_h;
  RF_AllocListElem_t *allocList;
  RF_AccTraceEntry_t tracerec;
  RF_ReconUnitNum_t which_ru;
  RF_RaidLayout_t *layoutPtr;
  RF_AccessStripeMap_t *aasm;
  RF_SectorCount_t nsector;
  RF_RaidAddr_t startAddr;
  char *buf, *buf1, *buf2;
  RF_PhysDiskAddr_t *pda;
  RF_StripeNum_t psID;
  RF_MCPair_t *mcpair;

  if (rf_verifyParityDebug) {
    rf_get_threadid(tid);
  }

  layoutPtr = &raidPtr->Layout;
  startAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, raidAddr);
  nsector = parityPDA->numSector;
  nbytes = rf_RaidAddressToByte(raidPtr, nsector);
  psID = rf_RaidAddressToParityStripeID(layoutPtr, raidAddr, &which_ru);

  asm_h = NULL;
  rd_dag_h = wr_dag_h = NULL;
  mcpair = NULL;

  ret = RF_PARITY_COULD_NOT_VERIFY;

  rf_MakeAllocList(allocList);
  if (allocList == NULL)
    return(RF_PARITY_COULD_NOT_VERIFY);
  mcpair = rf_AllocMCPair();
  if (mcpair == NULL)
    goto done;
  RF_ASSERT(layoutPtr->numDataCol == layoutPtr->numParityCol);
  stripeWidth = layoutPtr->numDataCol + layoutPtr->numParityCol;
  bcount = nbytes * (layoutPtr->numDataCol + layoutPtr->numParityCol);
  RF_MallocAndAdd(buf, bcount, (char *), allocList);
  if (buf == NULL)
    goto done;
  if (rf_verifyParityDebug) {
    printf("[%d] RAID1 parity verify: buf=%lx bcount=%d (%lx - %lx)\n",
      tid, (long)buf, bcount, (long)buf, (long)buf + bcount);
  }

  /*
   * Generate a DAG which will read the entire stripe- then we can
   * just compare data chunks versus "parity" chunks.
   */
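  /*
   * Buffer layout: buf holds numDataCol data chunks followed by
   * numParityCol mirror chunks, each nbytes long; buf1 and buf2 walk
   * those two halves when comparing below.
   */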
  rd_dag_h = rf_MakeSimpleDAG(raidPtr, stripeWidth, nbytes, buf,
    rf_DiskReadFunc, rf_DiskReadUndoFunc, "Rod", allocList, flags,
    RF_IO_NORMAL_PRIORITY);
  if (rd_dag_h == NULL)
    goto done;
  blockNode = rd_dag_h->succedents[0];
  unblockNode = blockNode->succedents[0]->succedents[0];

  /*
   * Map the access to physical disk addresses (PDAs)- this will
   * get us both a list of data addresses, and "parity" addresses
   * (which are really mirror copies).
   */
  asm_h = rf_MapAccess(raidPtr, startAddr, layoutPtr->dataSectorsPerStripe,
    buf, RF_DONT_REMAP);
  aasm = asm_h->stripeMap;

  buf1 = buf;
  /*
   * Loop through the data blocks, setting up read nodes for each.
   */
  for (pda = aasm->physInfo, i = 0; i < layoutPtr->numDataCol; i++, pda = pda->next)
  {
    RF_ASSERT(pda);

    rf_RangeRestrictPDA(raidPtr, parityPDA, pda, 0, 1);

    RF_ASSERT(pda->numSector != 0);
    if (rf_TryToRedirectPDA(raidPtr, pda, 0)) {
      /* cannot verify parity with dead disk */
      goto done;
    }
    pda->bufPtr = buf1;
    blockNode->succedents[i]->params[0].p = pda;
    blockNode->succedents[i]->params[1].p = buf1;
    blockNode->succedents[i]->params[2].v = psID;
    blockNode->succedents[i]->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
    buf1 += nbytes;
  }
  RF_ASSERT(pda == NULL);
  /*
   * keep i, buf1 running
   *
   * Loop through parity blocks, setting up read nodes for each.
   */
  for (pda = aasm->parityInfo; i < layoutPtr->numDataCol + layoutPtr->numParityCol; i++, pda = pda->next)
  {
    RF_ASSERT(pda);
    rf_RangeRestrictPDA(raidPtr, parityPDA, pda, 0, 1);
    RF_ASSERT(pda->numSector != 0);
    if (rf_TryToRedirectPDA(raidPtr, pda, 0)) {
      /* cannot verify parity with dead disk */
      goto done;
    }
    pda->bufPtr = buf1;
    blockNode->succedents[i]->params[0].p = pda;
    blockNode->succedents[i]->params[1].p = buf1;
    blockNode->succedents[i]->params[2].v = psID;
    blockNode->succedents[i]->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
    buf1 += nbytes;
  }
  RF_ASSERT(pda == NULL);

  bzero((char *)&tracerec, sizeof(tracerec));
  rd_dag_h->tracerec = &tracerec;

  if (rf_verifyParityDebug > 1) {
    printf("[%d] RAID1 parity verify read dag:\n", tid);
    rf_PrintDAGList(rd_dag_h);
  }

  RF_LOCK_MUTEX(mcpair->mutex);
  mcpair->flag = 0;
  rf_DispatchDAG(rd_dag_h, (void (*)(void *))rf_MCPairWakeupFunc,
    (void *)mcpair);
  while (mcpair->flag == 0) {
    RF_WAIT_MCPAIR(mcpair);
  }
  RF_UNLOCK_MUTEX(mcpair->mutex);

  if (rd_dag_h->status != rf_enable) {
    RF_ERRORMSG("Unable to verify raid1 parity: can't read stripe\n");
    ret = RF_PARITY_COULD_NOT_VERIFY;
    goto done;
  }

  /*
   * buf1 is the beginning of the data blocks chunk
   * buf2 is the beginning of the parity blocks chunk
   */
  buf1 = buf;
  buf2 = buf + (nbytes * layoutPtr->numDataCol);
  ret = RF_PARITY_OKAY;
  /*
   * bbufs is "bad bufs"- an array whose entries are the data
   * column numbers where we had miscompares. (That is, column 0
   * and column 1 of the array are mirror copies, and are considered
   * "data column 0" for this purpose).
   */
  RF_MallocAndAdd(bbufs, layoutPtr->numParityCol * sizeof(int), (int *),
    allocList);
  nbad = 0;
  /*
   * Check data vs "parity" (mirror copy).
   */
  for (i = 0; i < layoutPtr->numDataCol; i++) {
    if (rf_verifyParityDebug) {
      printf("[%d] RAID1 parity verify %d bytes: i=%d buf1=%lx buf2=%lx buf=%lx\n",
        tid, nbytes, i, (long)buf1, (long)buf2, (long)buf);
    }
    if (bcmp(buf1, buf2, nbytes)) {
      if (rf_verifyParityDebug > 1) {
        for (j = 0; j < nbytes; j++) {
          if (buf1[j] != buf2[j])
            break;
        }
        printf("psid=%ld j=%d\n", (long)psID, j);
        printf("buf1 %02x %02x %02x %02x %02x\n", buf1[0]&0xff,
          buf1[1]&0xff, buf1[2]&0xff, buf1[3]&0xff, buf1[4]&0xff);
        printf("buf2 %02x %02x %02x %02x %02x\n", buf2[0]&0xff,
          buf2[1]&0xff, buf2[2]&0xff, buf2[3]&0xff, buf2[4]&0xff);
      }
      if (rf_verifyParityDebug) {
        printf("[%d] RAID1: found bad parity, i=%d\n", tid, i);
      }
      /*
       * Parity is bad. Keep track of which columns were bad.
       */
      if (bbufs)
        bbufs[nbad] = i;
      nbad++;
      ret = RF_PARITY_BAD;
    }
    buf1 += nbytes;
    buf2 += nbytes;
  }

  if ((ret != RF_PARITY_OKAY) && correct_it) {
    ret = RF_PARITY_COULD_NOT_CORRECT;
    if (rf_verifyParityDebug) {
      printf("[%d] RAID1 parity verify: parity not correct\n", tid);
    }
    if (bbufs == NULL)
      goto done;
    /*
     * Make a DAG with one write node for each bad unit. We'll simply
     * write the contents of the data unit onto the parity unit for
     * correction. (It's possible that the mirror copy was the correct
     * copy, and that we're spooging good data by writing bad over it,
     * but there's no way we can know that.)
     */
    wr_dag_h = rf_MakeSimpleDAG(raidPtr, nbad, nbytes, buf,
      rf_DiskWriteFunc, rf_DiskWriteUndoFunc, "Wnp", allocList, flags,
      RF_IO_NORMAL_PRIORITY);
    if (wr_dag_h == NULL)
      goto done;
    wrBlock = wr_dag_h->succedents[0];
    /*
     * Fill in a write node for each bad compare.
     */
    for (i = 0; i < nbad; i++) {
      j = i + layoutPtr->numDataCol;
      pda = blockNode->succedents[j]->params[0].p;
      pda->bufPtr = blockNode->succedents[i]->params[1].p;
      wrBlock->succedents[i]->params[0].p = pda;
      wrBlock->succedents[i]->params[1].p = pda->bufPtr;
      wrBlock->succedents[i]->params[2].v = psID;
      wrBlock->succedents[i]->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
    }
    bzero((char *)&tracerec, sizeof(tracerec));
    wr_dag_h->tracerec = &tracerec;
    if (rf_verifyParityDebug > 1) {
      printf("Parity verify write dag:\n");
      rf_PrintDAGList(wr_dag_h);
    }
    RF_LOCK_MUTEX(mcpair->mutex);
    mcpair->flag = 0;
    /* fire off the write DAG */
    rf_DispatchDAG(wr_dag_h, (void (*)(void *))rf_MCPairWakeupFunc,
      (void *)mcpair);
    while (!mcpair->flag) {
      RF_WAIT_COND(mcpair->cond, mcpair->mutex);
    }
    RF_UNLOCK_MUTEX(mcpair->mutex);
    if (wr_dag_h->status != rf_enable) {
      RF_ERRORMSG("Unable to correct RAID1 parity in VerifyParity\n");
      goto done;
    }
    ret = RF_PARITY_CORRECTED;
  }

done:
  /*
   * All done. We might've gotten here without doing part of the function,
   * so cleanup what we have to and return our running status.
   */
  if (asm_h)
    rf_FreeAccessStripeMap(asm_h);
  if (rd_dag_h)
    rf_FreeDAG(rd_dag_h);
  if (wr_dag_h)
    rf_FreeDAG(wr_dag_h);
  if (mcpair)
    rf_FreeMCPair(mcpair);
  rf_FreeAllocList(allocList);
  if (rf_verifyParityDebug) {
    printf("[%d] RAID1 parity verify, returning %d\n", tid, ret);
  }
  return(ret);
}


int rf_SubmitReconBufferRAID1(rbuf, keep_it, use_committed)
  RF_ReconBuffer_t *rbuf;  /* the recon buffer to submit */
  int keep_it;             /* whether we can keep this buffer or we have to return it */
  int use_committed;       /* whether to use a committed or an available recon buffer */
{
  RF_ReconParityStripeStatus_t *pssPtr;
  RF_ReconCtrl_t *reconCtrlPtr;
  RF_RaidLayout_t *layoutPtr;
  int tid = 0, retcode, created;
  RF_CallbackDesc_t *cb, *p;
  RF_ReconBuffer_t *t;
  RF_Raid_t *raidPtr;
  caddr_t ta;

  retcode = 0;
  created = 0;

  raidPtr = rbuf->raidPtr;
  layoutPtr = &raidPtr->Layout;
  reconCtrlPtr = raidPtr->reconControl[rbuf->row];

  RF_ASSERT(rbuf);
  RF_ASSERT(rbuf->col != reconCtrlPtr->fcol);

  if (rf_reconbufferDebug) {
    rf_get_threadid(tid);
    printf("[%d] RAID1 reconbuffer submission r%d c%d psid %ld ru%d (failed offset %ld)\n",
      tid, rbuf->row, rbuf->col, (long)rbuf->parityStripeID, rbuf->which_ru,
      (long)rbuf->failedDiskSectorOffset);
  }

  if (rf_reconDebug) {
    printf("RAID1 reconbuffer submit psid %ld buf %lx\n",
      (long)rbuf->parityStripeID, (long)rbuf->buffer);
    printf("RAID1 psid %ld %02x %02x %02x %02x %02x\n",
      (long)rbuf->parityStripeID,
      rbuf->buffer[0], rbuf->buffer[1], rbuf->buffer[2], rbuf->buffer[3],
      rbuf->buffer[4]);
  }

  RF_LOCK_PSS_MUTEX(raidPtr, rbuf->row, rbuf->parityStripeID);

  RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);

  pssPtr = rf_LookupRUStatus(raidPtr, reconCtrlPtr->pssTable,
    rbuf->parityStripeID, rbuf->which_ru, RF_PSS_NONE, &created);
  RF_ASSERT(pssPtr); /* if it didn't exist, we wouldn't have gotten an rbuf for it */

  /*
   * Since this is simple mirroring, the first submission for a stripe is also
   * treated as the last.
   */
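  /*
   * The target buffer is the submitted rbuf itself (keep_it), a buffer
   * pulled from the committed or floating pools, or, if none is
   * available, the caller is queued on bufferWaitList and 1 is returned
   * so the submission will be retried later.
   */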
  t = NULL;
  if (keep_it) {
    if (rf_reconbufferDebug) {
      printf("[%d] RAID1 rbuf submission: keeping rbuf\n", tid);
    }
    t = rbuf;
  }
  else {
    if (use_committed) {
      if (rf_reconbufferDebug) {
        printf("[%d] RAID1 rbuf submission: using committed rbuf\n", tid);
      }
      t = reconCtrlPtr->committedRbufs;
      RF_ASSERT(t);
      reconCtrlPtr->committedRbufs = t->next;
      t->next = NULL;
    }
    else if (reconCtrlPtr->floatingRbufs) {
      if (rf_reconbufferDebug) {
        printf("[%d] RAID1 rbuf submission: using floating rbuf\n", tid);
      }
      t = reconCtrlPtr->floatingRbufs;
      reconCtrlPtr->floatingRbufs = t->next;
      t->next = NULL;
    }
  }
  if (t == NULL) {
    if (rf_reconbufferDebug) {
      printf("[%d] RAID1 rbuf submission: waiting for rbuf\n", tid);
    }
    RF_ASSERT((keep_it == 0) && (use_committed == 0));
    raidPtr->procsInBufWait++;
    if ((raidPtr->procsInBufWait == (raidPtr->numCol - 1))
      && (raidPtr->numFullReconBuffers == 0))
    {
      /* ruh-ro */
      RF_ERRORMSG("Buffer wait deadlock\n");
      rf_PrintPSStatusTable(raidPtr, rbuf->row);
      RF_PANIC();
    }
    pssPtr->flags |= RF_PSS_BUFFERWAIT;
    cb = rf_AllocCallbackDesc();
    cb->row = rbuf->row;
    cb->col = rbuf->col;
    cb->callbackArg.v = rbuf->parityStripeID;
    cb->callbackArg2.v = rbuf->which_ru;
    cb->next = NULL;
    if (reconCtrlPtr->bufferWaitList == NULL) {
      /* we are the wait list- lucky us */
      reconCtrlPtr->bufferWaitList = cb;
    }
    else {
      /* append to wait list */
      for (p = reconCtrlPtr->bufferWaitList; p->next; p = p->next);
      p->next = cb;
    }
    retcode = 1;
    goto out;
  }
  if (t != rbuf) {
    t->row = rbuf->row;
    t->col = reconCtrlPtr->fcol;
    t->parityStripeID = rbuf->parityStripeID;
    t->which_ru = rbuf->which_ru;
    t->failedDiskSectorOffset = rbuf->failedDiskSectorOffset;
    t->spRow = rbuf->spRow;
    t->spCol = rbuf->spCol;
    t->spOffset = rbuf->spOffset;
    /* Swap buffers. DANCE! */
    ta = t->buffer;
    t->buffer = rbuf->buffer;
    rbuf->buffer = ta;
  }
  /*
   * Use the rbuf we've been given as the target.
   */
  RF_ASSERT(pssPtr->rbuf == NULL);
  pssPtr->rbuf = t;

  t->count = 1;
  /*
   * Below, we use 1 for numDataCol (which is equal to the count in the
   * previous line), so we'll always be done.
   */
  rf_CheckForFullRbuf(raidPtr, reconCtrlPtr, pssPtr, 1);

out:
  RF_UNLOCK_PSS_MUTEX(raidPtr, rbuf->row, rbuf->parityStripeID);
  RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);
  if (rf_reconbufferDebug) {
    printf("[%d] RAID1 rbuf submission: returning %d\n", tid, retcode);
  }
  return(retcode);
}