38a3987b69
Carnegie Mellon University. Full RAID implementation, including levels 0, 1, 4, 5, 6, parity logging, and a few other goodies. Ported to NetBSD by Greg Oster.
1026 lines
29 KiB
C
1026 lines
29 KiB
C
/* $NetBSD: rf_pq.c,v 1.1 1998/11/13 04:20:32 oster Exp $ */
|
|
/*
|
|
* Copyright (c) 1995 Carnegie-Mellon University.
|
|
* All rights reserved.
|
|
*
|
|
* Author: Daniel Stodolsky
|
|
*
|
|
* Permission to use, copy, modify and distribute this software and
|
|
* its documentation is hereby granted, provided that both the copyright
|
|
* notice and this permission notice appear in all copies of the
|
|
* software, derivative works or modified versions, and any portions
|
|
* thereof, and that both notices appear in supporting documentation.
|
|
*
|
|
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
|
|
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
|
|
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
|
|
*
|
|
* Carnegie Mellon requests users of this software to return to
|
|
*
|
|
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
|
|
* School of Computer Science
|
|
* Carnegie Mellon University
|
|
* Pittsburgh PA 15213-3890
|
|
*
|
|
* any improvements or extensions that they make and grant Carnegie the
|
|
* rights to redistribute these changes.
|
|
*/
|
|
|
|
/*
|
|
* Code for RAID level 6 (P + Q) disk array architecture.
|
|
*
|
|
* :
|
|
* Log: rf_pq.c,v
|
|
* Revision 1.33 1996/11/05 21:10:40 jimz
|
|
* failed pda generalization
|
|
*
|
|
* Revision 1.32 1996/07/31 16:29:50 jimz
|
|
* "fix" math on 32-bit machines using RF_LONGSHIFT
|
|
* (may be incorrect)
|
|
*
|
|
* Revision 1.31 1996/07/31 15:35:01 jimz
|
|
* evenodd changes; bugfixes for double-degraded archs, generalize
|
|
* some formerly PQ-only functions
|
|
*
|
|
* Revision 1.30 1996/07/27 23:36:08 jimz
|
|
* Solaris port of simulator
|
|
*
|
|
* Revision 1.29 1996/07/22 19:52:16 jimz
|
|
* switched node params to RF_DagParam_t, a union of
|
|
* a 64-bit int and a void *, for better portability
|
|
* attempted hpux port, but failed partway through for
|
|
* lack of a single C compiler capable of compiling all
|
|
* source files
|
|
*
|
|
* Revision 1.28 1996/06/09 02:36:46 jimz
|
|
* lots of little crufty cleanup- fixup whitespace
|
|
* issues, comment #ifdefs, improve typing in some
|
|
* places (esp size-related)
|
|
*
|
|
* Revision 1.27 1996/06/07 21:33:04 jimz
|
|
* begin using consistent types for sector numbers,
|
|
* stripe numbers, row+col numbers, recon unit numbers
|
|
*
|
|
* Revision 1.26 1996/06/02 17:31:48 jimz
|
|
* Moved a lot of global stuff into array structure, where it belongs.
|
|
* Fixed up paritylogging, pss modules in this manner. Some general
|
|
* code cleanup. Removed lots of dead code, some dead files.
|
|
*
|
|
* Revision 1.25 1996/05/31 22:26:54 jimz
|
|
* fix a lot of mapping problems, memory allocation problems
|
|
* found some weird lock issues, fixed 'em
|
|
* more code cleanup
|
|
*
|
|
* Revision 1.24 1996/05/30 23:22:16 jimz
|
|
* bugfixes of serialization, timing problems
|
|
* more cleanup
|
|
*
|
|
* Revision 1.23 1996/05/30 12:59:18 jimz
|
|
* make etimer happier, more portable
|
|
*
|
|
* Revision 1.22 1996/05/27 18:56:37 jimz
|
|
* more code cleanup
|
|
* better typing
|
|
* compiles in all 3 environments
|
|
*
|
|
* Revision 1.21 1996/05/24 22:17:04 jimz
|
|
* continue code + namespace cleanup
|
|
* typed a bunch of flags
|
|
*
|
|
* Revision 1.20 1996/05/24 04:28:55 jimz
|
|
* release cleanup ckpt
|
|
*
|
|
* Revision 1.19 1996/05/23 21:46:35 jimz
|
|
* checkpoint in code cleanup (release prep)
|
|
* lots of types, function names have been fixed
|
|
*
|
|
* Revision 1.18 1996/05/23 00:33:23 jimz
|
|
* code cleanup: move all debug decls to rf_options.c, all extern
|
|
* debug decls to rf_options.h, all debug vars preceded by rf_
|
|
*
|
|
* Revision 1.17 1996/05/18 19:51:34 jimz
|
|
* major code cleanup- fix syntax, make some types consistent,
|
|
* add prototypes, clean out dead code, et cetera
|
|
*
|
|
* Revision 1.16 1996/05/17 14:52:04 wvcii
|
|
* added prototyping to QDelta()
|
|
* - changed buf params from volatile unsigned long * to char *
|
|
* changed QDelta for kernel
|
|
* - just bzero the buf since kernel doesn't include pq decode table
|
|
*
|
|
* Revision 1.15 1996/05/03 19:40:20 wvcii
|
|
* added includes for dag library
|
|
*
|
|
* Revision 1.14 1995/12/12 18:10:06 jimz
|
|
* MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
|
|
* fix 80-column brain damage in comments
|
|
*
|
|
* Revision 1.13 1995/11/30 16:19:55 wvcii
|
|
* added copyright info
|
|
*
|
|
* Revision 1.12 1995/11/07 16:13:47 wvcii
|
|
* changed PQDagSelect prototype
|
|
* function no longer returns numHdrSucc, numTermAnt
|
|
* note: this file contains node functions which should be
|
|
* moved to rf_dagfuncs.c so that all node funcs are bundled together
|
|
*
|
|
* Revision 1.11 1995/10/04 03:50:33 wvcii
|
|
* removed panics, minor code cleanup in dag selection
|
|
*
|
|
*
|
|
*/
|
|
|
|
#include "rf_archs.h"
|
|
#include "rf_types.h"
|
|
#include "rf_raid.h"
|
|
#include "rf_dag.h"
|
|
#include "rf_dagffrd.h"
|
|
#include "rf_dagffwr.h"
|
|
#include "rf_dagdegrd.h"
|
|
#include "rf_dagdegwr.h"
|
|
#include "rf_dagutils.h"
|
|
#include "rf_dagfuncs.h"
|
|
#include "rf_threadid.h"
|
|
#include "rf_etimer.h"
|
|
#include "rf_pqdeg.h"
|
|
#include "rf_general.h"
|
|
#include "rf_map.h"
|
|
#include "rf_pq.h"
|
|
#include "rf_sys.h"
|
|
|
|
RF_RedFuncs_t rf_pFuncs = { rf_RegularONPFunc, "Regular Old-New P", rf_SimpleONPFunc, "Simple Old-New P" };
|
|
RF_RedFuncs_t rf_pRecoveryFuncs = { rf_RecoveryPFunc, "Recovery P Func", rf_RecoveryPFunc, "Recovery P Func" };
|
|
|
|
int rf_RegularONPFunc(node)
|
|
RF_DagNode_t *node;
|
|
{
|
|
return(rf_RegularXorFunc(node));
|
|
}
|
|
|
|
/*
|
|
same as simpleONQ func, but the coefficient is always 1
|
|
*/
|
|
|
|
int rf_SimpleONPFunc(node)
|
|
RF_DagNode_t *node;
|
|
{
|
|
return(rf_SimpleXorFunc(node));
|
|
}
|
|
|
|
int rf_RecoveryPFunc(node)
|
|
RF_DagNode_t *node;
|
|
{
|
|
return(rf_RecoveryXorFunc(node));
|
|
}
|
|
|
|
int rf_RegularPFunc(node)
|
|
RF_DagNode_t *node;
|
|
{
|
|
return(rf_RegularXorFunc(node));
|
|
}
|
|
|
|
#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
|
|
|
|
static void QDelta(char *dest, char *obuf, char *nbuf, unsigned length,
|
|
unsigned char coeff);
|
|
static void rf_InvertQ(unsigned long *qbuf, unsigned long *abuf,
|
|
unsigned length, unsigned coeff);
|
|
|
|
RF_RedFuncs_t rf_qFuncs = { rf_RegularONQFunc, "Regular Old-New Q", rf_SimpleONQFunc, "Simple Old-New Q" };
|
|
RF_RedFuncs_t rf_qRecoveryFuncs = { rf_RecoveryQFunc, "Recovery Q Func", rf_RecoveryQFunc, "Recovery Q Func" };
|
|
RF_RedFuncs_t rf_pqRecoveryFuncs = { rf_RecoveryPQFunc, "Recovery PQ Func", rf_RecoveryPQFunc, "Recovery PQ Func" };
|
|
|
|
void rf_PQDagSelect(
|
|
RF_Raid_t *raidPtr,
|
|
RF_IoType_t type,
|
|
RF_AccessStripeMap_t *asmap,
|
|
RF_VoidFuncPtr *createFunc)
|
|
{
|
|
RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
|
|
unsigned ndfail = asmap->numDataFailed;
|
|
unsigned npfail = asmap->numParityFailed;
|
|
unsigned ntfail = npfail + ndfail;
|
|
|
|
RF_ASSERT(RF_IO_IS_R_OR_W(type));
|
|
if (ntfail > 2)
|
|
{
|
|
RF_ERRORMSG("more than two disks failed in a single group! Aborting I/O operation.\n");
|
|
/* *infoFunc = */ *createFunc = NULL;
|
|
return;
|
|
}
|
|
|
|
/* ok, we can do this I/O */
|
|
if (type == RF_IO_TYPE_READ)
|
|
{
|
|
switch (ndfail)
|
|
{
|
|
case 0:
|
|
/* fault free read */
|
|
*createFunc = rf_CreateFaultFreeReadDAG; /* same as raid 5 */
|
|
break;
|
|
case 1:
|
|
/* lost a single data unit */
|
|
/* two cases:
|
|
(1) parity is not lost.
|
|
do a normal raid 5 reconstruct read.
|
|
(2) parity is lost.
|
|
do a reconstruct read using "q".
|
|
*/
|
|
if (ntfail == 2) /* also lost redundancy */
|
|
{
|
|
if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY)
|
|
*createFunc = rf_PQ_110_CreateReadDAG;
|
|
else
|
|
*createFunc = rf_PQ_101_CreateReadDAG;
|
|
}
|
|
else
|
|
{
|
|
/* P and Q are ok. But is there a failure
|
|
in some unaccessed data unit?
|
|
*/
|
|
if (rf_NumFailedDataUnitsInStripe(raidPtr,asmap)==2)
|
|
*createFunc = rf_PQ_200_CreateReadDAG;
|
|
else
|
|
*createFunc = rf_PQ_100_CreateReadDAG;
|
|
}
|
|
break;
|
|
case 2:
|
|
/* lost two data units */
|
|
/* *infoFunc = PQOneTwo; */
|
|
*createFunc = rf_PQ_200_CreateReadDAG;
|
|
break;
|
|
}
|
|
return;
|
|
}
|
|
|
|
/* a write */
|
|
switch (ntfail)
|
|
{
|
|
case 0: /* fault free */
|
|
if (rf_suppressLocksAndLargeWrites ||
|
|
(((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) && (layoutPtr->numDataCol != 1)) ||
|
|
(asmap->parityInfo->next != NULL) || (asmap->qInfo->next != NULL) || rf_CheckStripeForFailures(raidPtr, asmap))) {
|
|
|
|
*createFunc = rf_PQCreateSmallWriteDAG;
|
|
}
|
|
else {
|
|
*createFunc = rf_PQCreateLargeWriteDAG;
|
|
}
|
|
break;
|
|
|
|
case 1: /* single disk fault */
|
|
if (npfail==1)
|
|
{
|
|
RF_ASSERT ((asmap->failedPDAs[0]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q));
|
|
if (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q)
|
|
{ /* q died, treat like normal mode raid5 write.*/
|
|
if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1))
|
|
|| rf_NumFailedDataUnitsInStripe(raidPtr,asmap))
|
|
*createFunc = rf_PQ_001_CreateSmallWriteDAG;
|
|
else
|
|
*createFunc = rf_PQ_001_CreateLargeWriteDAG;
|
|
}
|
|
else
|
|
{ /* parity died, small write only updating Q */
|
|
if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1))
|
|
|| rf_NumFailedDataUnitsInStripe(raidPtr,asmap))
|
|
*createFunc = rf_PQ_010_CreateSmallWriteDAG;
|
|
else
|
|
*createFunc = rf_PQ_010_CreateLargeWriteDAG;
|
|
}
|
|
}
|
|
else
|
|
{ /* data missing.
|
|
Do a P reconstruct write if only a single data unit
|
|
is lost in the stripe, otherwise a PQ reconstruct
|
|
write. */
|
|
if (rf_NumFailedDataUnitsInStripe(raidPtr,asmap)==2)
|
|
*createFunc = rf_PQ_200_CreateWriteDAG;
|
|
else
|
|
*createFunc = rf_PQ_100_CreateWriteDAG;
|
|
}
|
|
break;
|
|
|
|
case 2: /* two disk faults */
|
|
switch (npfail)
|
|
{
|
|
case 2: /* both p and q dead */
|
|
*createFunc = rf_PQ_011_CreateWriteDAG;
|
|
break;
|
|
case 1: /* either p or q and dead data */
|
|
RF_ASSERT(asmap->failedPDAs[0]->type == RF_PDA_TYPE_DATA);
|
|
RF_ASSERT ((asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q));
|
|
if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q)
|
|
*createFunc = rf_PQ_101_CreateWriteDAG;
|
|
else
|
|
*createFunc = rf_PQ_110_CreateWriteDAG;
|
|
break;
|
|
case 0: /* double data loss */
|
|
*createFunc = rf_PQ_200_CreateWriteDAG;
|
|
break;
|
|
}
|
|
break;
|
|
|
|
default: /* more than 2 disk faults */
|
|
*createFunc = NULL;
|
|
RF_PANIC();
|
|
}
|
|
return;
|
|
}
|
|
|
|
/*
|
|
Used as a stop gap info function
|
|
*/
|
|
static void PQOne(raidPtr, nSucc, nAnte, asmap)
|
|
RF_Raid_t *raidPtr;
|
|
int *nSucc;
|
|
int *nAnte;
|
|
RF_AccessStripeMap_t *asmap;
|
|
{
|
|
*nSucc = *nAnte = 1;
|
|
}
|
|
|
|
static void PQOneTwo(raidPtr, nSucc, nAnte, asmap)
|
|
RF_Raid_t *raidPtr;
|
|
int *nSucc;
|
|
int *nAnte;
|
|
RF_AccessStripeMap_t *asmap;
|
|
{
|
|
*nSucc = 1;
|
|
*nAnte = 2;
|
|
}
|
|
|
|
RF_CREATE_DAG_FUNC_DECL(rf_PQCreateLargeWriteDAG)
|
|
{
|
|
rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 2,
|
|
rf_RegularPQFunc, RF_FALSE);
|
|
}
|
|
|
|
int rf_RegularONQFunc(node)
|
|
RF_DagNode_t *node;
|
|
{
|
|
int np = node->numParams;
|
|
int d;
|
|
RF_Raid_t *raidPtr = (RF_Raid_t *)node->params[np-1].p;
|
|
int i;
|
|
RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
|
|
RF_Etimer_t timer;
|
|
char *qbuf, *qpbuf;
|
|
char *obuf, *nbuf;
|
|
RF_PhysDiskAddr_t *old, *new;
|
|
unsigned long coeff;
|
|
unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
|
|
|
|
RF_ETIMER_START(timer);
|
|
|
|
d = (np-3)/4;
|
|
RF_ASSERT (4*d+3 == np);
|
|
qbuf = (char *) node->params[2*d+1].p; /* q buffer*/
|
|
for (i=0; i < d; i++)
|
|
{
|
|
old = (RF_PhysDiskAddr_t *) node->params[2*i].p;
|
|
obuf = (char *) node->params[2*i+1].p;
|
|
new = (RF_PhysDiskAddr_t *) node->params[2*(d+1+i)].p;
|
|
nbuf = (char *) node->params[2*(d+1+i)+1].p;
|
|
RF_ASSERT (new->numSector == old->numSector);
|
|
RF_ASSERT (new->raidAddress == old->raidAddress);
|
|
/* the stripe unit within the stripe tells us the coefficient to use
|
|
for the multiply. */
|
|
coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),new->raidAddress);
|
|
/* compute the data unit offset within the column, then add one */
|
|
coeff = (coeff % raidPtr->Layout.numDataCol);
|
|
qpbuf = qbuf + rf_RaidAddressToByte(raidPtr,old->startSector % secPerSU);
|
|
QDelta(qpbuf,obuf,nbuf, rf_RaidAddressToByte(raidPtr, old->numSector),coeff);
|
|
}
|
|
|
|
RF_ETIMER_STOP(timer);
|
|
RF_ETIMER_EVAL(timer);
|
|
tracerec->q_us += RF_ETIMER_VAL_US(timer);
|
|
rf_GenericWakeupFunc(node, 0); /* call wake func explicitly since no I/O in this node */
|
|
return(0);
|
|
}
|
|
|
|
/*
|
|
See the SimpleXORFunc for the difference between a simple and regular func.
|
|
These Q functions should be used for
|
|
|
|
new q = Q(data,old data,old q)
|
|
|
|
style updates and not for
|
|
|
|
q = ( new data, new data, .... )
|
|
|
|
computations.
|
|
|
|
The simple q takes 2(2d+1)+1 params, where d is the number
|
|
of stripes written. The order of params is
|
|
old data pda_0, old data buffer_0, old data pda_1, old data buffer_1, ... old data pda_d, old data buffer_d
|
|
[2d] old q pda_0, old q buffer
|
|
[2d+2] new data pda_0, new data buffer_0, ... new data pda_d, new data buffer_d
|
|
raidPtr
|
|
*/
|
|
|
|
int rf_SimpleONQFunc(node)
|
|
RF_DagNode_t *node;
|
|
{
|
|
int np = node->numParams;
|
|
int d;
|
|
RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np-1].p;
|
|
int i;
|
|
RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
|
|
RF_Etimer_t timer;
|
|
char *qbuf;
|
|
char *obuf, *nbuf;
|
|
RF_PhysDiskAddr_t *old, *new;
|
|
unsigned long coeff;
|
|
|
|
RF_ETIMER_START(timer);
|
|
|
|
d = (np-3)/4;
|
|
RF_ASSERT (4*d+3 == np);
|
|
qbuf = (char *) node->params[2*d+1].p; /* q buffer*/
|
|
for (i=0; i < d; i++)
|
|
{
|
|
old = (RF_PhysDiskAddr_t *) node->params[2*i].p;
|
|
obuf = (char *) node->params[2*i+1].p;
|
|
new = (RF_PhysDiskAddr_t *) node->params[2*(d+1+i)].p;
|
|
nbuf = (char *) node->params[2*(d+1+i)+1].p;
|
|
RF_ASSERT (new->numSector == old->numSector);
|
|
RF_ASSERT (new->raidAddress == old->raidAddress);
|
|
/* the stripe unit within the stripe tells us the coefficient to use
|
|
for the multiply. */
|
|
coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),new->raidAddress);
|
|
/* compute the data unit offset within the column, then add one */
|
|
coeff = (coeff % raidPtr->Layout.numDataCol);
|
|
QDelta(qbuf,obuf,nbuf, rf_RaidAddressToByte(raidPtr, old->numSector),coeff);
|
|
}
|
|
|
|
RF_ETIMER_STOP(timer);
|
|
RF_ETIMER_EVAL(timer);
|
|
tracerec->q_us += RF_ETIMER_VAL_US(timer);
|
|
rf_GenericWakeupFunc(node, 0); /* call wake func explicitly since no I/O in this node */
|
|
return(0);
|
|
}
|
|
|
|
RF_CREATE_DAG_FUNC_DECL(rf_PQCreateSmallWriteDAG)
|
|
{
|
|
rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_pFuncs, &rf_qFuncs);
|
|
}
|
|
|
|
static void RegularQSubr(node,qbuf)
|
|
RF_DagNode_t *node;
|
|
char *qbuf;
|
|
{
|
|
int np = node->numParams;
|
|
int d;
|
|
RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np-1].p;
|
|
unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
|
|
int i;
|
|
RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
|
|
RF_Etimer_t timer;
|
|
char *obuf, *qpbuf;
|
|
RF_PhysDiskAddr_t *old;
|
|
unsigned long coeff;
|
|
|
|
RF_ETIMER_START(timer);
|
|
|
|
d = (np-1)/2;
|
|
RF_ASSERT (2*d+1 == np);
|
|
for (i=0; i < d; i++)
|
|
{
|
|
old = (RF_PhysDiskAddr_t *) node->params[2*i].p;
|
|
obuf = (char *) node->params[2*i+1].p;
|
|
coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),old->raidAddress);
|
|
/* compute the data unit offset within the column, then add one */
|
|
coeff = (coeff % raidPtr->Layout.numDataCol);
|
|
/* the input buffers may not all be aligned with the start of the
|
|
stripe. so shift by their sector offset within the stripe unit */
|
|
qpbuf = qbuf + rf_RaidAddressToByte(raidPtr,old->startSector % secPerSU);
|
|
rf_IncQ((unsigned long *)qpbuf,(unsigned long *)obuf,rf_RaidAddressToByte(raidPtr, old->numSector),coeff);
|
|
}
|
|
|
|
RF_ETIMER_STOP(timer);
|
|
RF_ETIMER_EVAL(timer);
|
|
tracerec->q_us += RF_ETIMER_VAL_US(timer);
|
|
}
|
|
|
|
/*
|
|
used in degraded writes.
|
|
*/
|
|
|
|
static void DegrQSubr(node)
|
|
RF_DagNode_t *node;
|
|
{
|
|
int np = node->numParams;
|
|
int d;
|
|
RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np-1].p;
|
|
unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
|
|
int i;
|
|
RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
|
|
RF_Etimer_t timer;
|
|
char *qbuf = node->results[1];
|
|
char *obuf, *qpbuf;
|
|
RF_PhysDiskAddr_t *old;
|
|
unsigned long coeff;
|
|
unsigned fail_start;
|
|
int j;
|
|
|
|
old = (RF_PhysDiskAddr_t *)node->params[np-2].p;
|
|
fail_start = old->startSector % secPerSU;
|
|
|
|
RF_ETIMER_START(timer);
|
|
|
|
d = (np-2)/2;
|
|
RF_ASSERT (2*d+2 == np);
|
|
for (i=0; i < d; i++)
|
|
{
|
|
old = (RF_PhysDiskAddr_t *) node->params[2*i].p;
|
|
obuf = (char *) node->params[2*i+1].p;
|
|
coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),old->raidAddress);
|
|
/* compute the data unit offset within the column, then add one */
|
|
coeff = (coeff % raidPtr->Layout.numDataCol);
|
|
/* the input buffers may not all be aligned with the start of the
|
|
stripe. so shift by their sector offset within the stripe unit */
|
|
j = old->startSector % secPerSU;
|
|
RF_ASSERT(j >= fail_start);
|
|
qpbuf = qbuf + rf_RaidAddressToByte(raidPtr,j - fail_start);
|
|
rf_IncQ((unsigned long *)qpbuf,(unsigned long *)obuf,rf_RaidAddressToByte(raidPtr, old->numSector),coeff);
|
|
}
|
|
|
|
RF_ETIMER_STOP(timer);
|
|
RF_ETIMER_EVAL(timer);
|
|
tracerec->q_us += RF_ETIMER_VAL_US(timer);
|
|
}
|
|
|
|
/*
|
|
Called by large write code to compute the new parity and the new q.
|
|
|
|
structure of the params:
|
|
|
|
pda_0, buffer_0, pda_1 , buffer_1, ... , pda_d, buffer_d ( d = numDataCol )
|
|
raidPtr
|
|
|
|
for a total of 2d+1 arguments.
|
|
The result buffers results[0], results[1] are the buffers for the p and q,
|
|
respectively.
|
|
|
|
We compute Q first, then compute P. The P calculation may try to reuse
|
|
one of the input buffers for its output, so if we computed P first, we would
|
|
corrupt the input for the q calculation.
|
|
*/
|
|
|
|
int rf_RegularPQFunc(node)
|
|
RF_DagNode_t *node;
|
|
{
|
|
RegularQSubr(node,node->results[1]);
|
|
return(rf_RegularXorFunc(node)); /* does the wakeup */
|
|
}
|
|
|
|
int rf_RegularQFunc(node)
|
|
RF_DagNode_t *node;
|
|
{
|
|
/* Almost ... adjust Qsubr args */
|
|
RegularQSubr(node, node->results[0]);
|
|
rf_GenericWakeupFunc(node, 0); /* call wake func explicitly since no I/O in this node */
|
|
return(0);
|
|
}
|
|
|
|
/*
|
|
Called by singly degraded write code to compute the new parity and the new q.
|
|
|
|
structure of the params:
|
|
|
|
pda_0, buffer_0, pda_1 , buffer_1, ... , pda_d, buffer_d
|
|
failedPDA raidPtr
|
|
|
|
for a total of 2d+2 arguments.
|
|
The result buffers results[0], results[1] are the buffers for the parity and q,
|
|
respectively.
|
|
|
|
We compute Q first, then compute parity. The parity calculation may try to reuse
|
|
one of the input buffers for its output, so if we computed parity first, we would
|
|
corrupt the input for the q calculation.
|
|
|
|
We treat this identically to the regularPQ case, ignoring the failedPDA extra argument.
|
|
*/
|
|
|
|
void rf_Degraded_100_PQFunc(node)
|
|
RF_DagNode_t *node;
|
|
{
|
|
int np = node->numParams;
|
|
|
|
RF_ASSERT (np >= 2);
|
|
DegrQSubr(node);
|
|
rf_RecoveryXorFunc(node);
|
|
}
|
|
|
|
|
|
/*
|
|
The two below are used when reading a stripe with a single lost data unit.
|
|
The parameters are
|
|
|
|
pda_0, buffer_0, .... pda_n, buffer_n, P pda, P buffer, failedPDA, raidPtr
|
|
|
|
and results[0] contains the data buffer. Which is originally zero-filled.
|
|
|
|
*/
|
|
|
|
/* this Q func is used by the degraded-mode dag functions to recover lost data.
|
|
* the second-to-last parameter is the PDA for the failed portion of the access.
|
|
* the code here looks at this PDA and assumes that the xor target buffer is
|
|
* equal in size to the number of sectors in the failed PDA. It then uses
|
|
* the other PDAs in the parameter list to determine where within the target
|
|
* buffer the corresponding data should be xored.
|
|
*
|
|
* Recall the basic equation is
|
|
*
|
|
* Q = ( data_1 + 2 * data_2 ... + k * data_k ) mod 256
|
|
*
|
|
* so to recover data_j we need
|
|
*
|
|
* J data_j = (Q - data_1 - 2 data_2 ....- k* data_k) mod 256
|
|
*
|
|
* So the coefficient for each buffer is (255 - data_col), and j should be initialized by
|
|
* copying Q into it. Then we need to do a table lookup to convert to solve
|
|
* data_j /= J
|
|
*
|
|
*
|
|
*/
|
|
int rf_RecoveryQFunc(node)
|
|
RF_DagNode_t *node;
|
|
{
|
|
RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams-1].p;
|
|
RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &raidPtr->Layout;
|
|
RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams-2].p;
|
|
int i;
|
|
RF_PhysDiskAddr_t *pda;
|
|
RF_RaidAddr_t suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr,failedPDA->startSector);
|
|
char *srcbuf, *destbuf;
|
|
RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
|
|
RF_Etimer_t timer;
|
|
unsigned long coeff;
|
|
|
|
RF_ETIMER_START(timer);
|
|
/* start by copying Q into the buffer */
|
|
bcopy(node->params[node->numParams-3].p,node->results[0],
|
|
rf_RaidAddressToByte(raidPtr, failedPDA->numSector));
|
|
for (i=0; i<node->numParams-4; i+=2)
|
|
{
|
|
RF_ASSERT (node->params[i+1].p != node->results[0]);
|
|
pda = (RF_PhysDiskAddr_t *) node->params[i].p;
|
|
srcbuf = (char *) node->params[i+1].p;
|
|
suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
|
|
destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr,suoffset-failedSUOffset);
|
|
coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),pda->raidAddress);
|
|
/* compute the data unit offset within the column */
|
|
coeff = (coeff % raidPtr->Layout.numDataCol);
|
|
rf_IncQ((unsigned long *)destbuf, (unsigned long *)srcbuf, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff);
|
|
}
|
|
/* Do the nasty inversion now */
|
|
coeff = (rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),failedPDA->startSector) % raidPtr->Layout.numDataCol);
|
|
rf_InvertQ(node->results[0],node->results[0],rf_RaidAddressToByte(raidPtr,pda->numSector),coeff);
|
|
RF_ETIMER_STOP(timer);
|
|
RF_ETIMER_EVAL(timer);
|
|
tracerec->q_us += RF_ETIMER_VAL_US(timer);
|
|
rf_GenericWakeupFunc(node, 0);
|
|
return(0);
|
|
}
|
|
|
|
int rf_RecoveryPQFunc(node)
|
|
RF_DagNode_t *node;
|
|
{
|
|
RF_PANIC();
|
|
return(1);
|
|
}
|
|
|
|
/*
|
|
Degraded write Q subroutine.
|
|
Used when P is dead.
|
|
Large-write style Q computation.
|
|
Parameters
|
|
|
|
(pda,buf),(pda,buf),.....,(failedPDA,bufPtr),failedPDA,raidPtr.
|
|
|
|
We ignore failedPDA.
|
|
|
|
This is a "simple style" recovery func.
|
|
*/
|
|
|
|
void rf_PQ_DegradedWriteQFunc(node)
|
|
RF_DagNode_t *node;
|
|
{
|
|
int np = node->numParams;
|
|
int d;
|
|
RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np-1].p;
|
|
unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
|
|
int i;
|
|
RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
|
|
RF_Etimer_t timer;
|
|
char *qbuf = node->results[0];
|
|
char *obuf, *qpbuf;
|
|
RF_PhysDiskAddr_t *old;
|
|
unsigned long coeff;
|
|
int fail_start,j;
|
|
|
|
old = (RF_PhysDiskAddr_t *) node->params[np-2].p;
|
|
fail_start = old->startSector % secPerSU;
|
|
|
|
RF_ETIMER_START(timer);
|
|
|
|
d = (np-2)/2;
|
|
RF_ASSERT (2*d+2 == np);
|
|
|
|
for (i=0; i < d; i++)
|
|
{
|
|
old = (RF_PhysDiskAddr_t *) node->params[2*i].p;
|
|
obuf = (char *) node->params[2*i+1].p;
|
|
coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),old->raidAddress);
|
|
/* compute the data unit offset within the column, then add one */
|
|
coeff = (coeff % raidPtr->Layout.numDataCol);
|
|
j = old->startSector % secPerSU;
|
|
RF_ASSERT(j >= fail_start);
|
|
qpbuf = qbuf + rf_RaidAddressToByte(raidPtr,j - fail_start);
|
|
rf_IncQ((unsigned long *)qpbuf,(unsigned long *)obuf,rf_RaidAddressToByte(raidPtr, old->numSector),coeff);
|
|
}
|
|
|
|
RF_ETIMER_STOP(timer);
|
|
RF_ETIMER_EVAL(timer);
|
|
tracerec->q_us += RF_ETIMER_VAL_US(timer);
|
|
rf_GenericWakeupFunc(node, 0);
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Q computations */
|
|
|
|
/*
|
|
coeff - column;
|
|
|
|
compute dest ^= qfor[28-coeff][rn[coeff+1] a]
|
|
|
|
on 5-bit basis;
|
|
length in bytes;
|
|
*/
|
|
|
|
void rf_IncQ(dest,buf,length,coeff)
|
|
unsigned long *dest;
|
|
unsigned long *buf;
|
|
unsigned length;
|
|
unsigned coeff;
|
|
{
|
|
unsigned long a, d, new;
|
|
unsigned long a1, a2;
|
|
unsigned int *q = &(rf_qfor[28-coeff][0]);
|
|
unsigned r = rf_rn[coeff+1];
|
|
|
|
#define EXTRACT(a,i) ((a >> (5L*i)) & 0x1f)
|
|
#define INSERT(a,i) (a << (5L*i))
|
|
|
|
length /= 8;
|
|
/* 13 5 bit quants in a 64 bit word */
|
|
while (length)
|
|
{
|
|
a = *buf++;
|
|
d = *dest;
|
|
a1 = EXTRACT(a,0) ^ r;
|
|
a2 = EXTRACT(a,1) ^ r;
|
|
new = INSERT(a2,1) | a1 ;
|
|
a1 = EXTRACT(a,2) ^ r;
|
|
a2 = EXTRACT(a,3) ^ r;
|
|
a1 = q[a1];
|
|
a2 = q[a2];
|
|
new = new | INSERT(a1,2) | INSERT (a2,3);
|
|
a1 = EXTRACT(a,4) ^ r;
|
|
a2 = EXTRACT(a,5) ^ r;
|
|
a1 = q[a1];
|
|
a2 = q[a2];
|
|
new = new | INSERT(a1,4) | INSERT (a2,5);
|
|
a1 = EXTRACT(a,5) ^ r;
|
|
a2 = EXTRACT(a,6) ^ r;
|
|
a1 = q[a1];
|
|
a2 = q[a2];
|
|
new = new | INSERT(a1,5) | INSERT (a2,6);
|
|
#if RF_LONGSHIFT > 2
|
|
a1 = EXTRACT(a,7) ^ r;
|
|
a2 = EXTRACT(a,8) ^ r;
|
|
a1 = q[a1];
|
|
a2 = q[a2];
|
|
new = new | INSERT(a1,7) | INSERT (a2,8);
|
|
a1 = EXTRACT(a,9) ^ r;
|
|
a2 = EXTRACT(a,10) ^ r;
|
|
a1 = q[a1];
|
|
a2 = q[a2];
|
|
new = new | INSERT(a1,9) | INSERT (a2,10);
|
|
a1 = EXTRACT(a,11) ^ r;
|
|
a2 = EXTRACT(a,12) ^ r;
|
|
a1 = q[a1];
|
|
a2 = q[a2];
|
|
new = new | INSERT(a1,11) | INSERT (a2,12);
|
|
#endif /* RF_LONGSHIFT > 2 */
|
|
d ^= new;
|
|
*dest++ = d;
|
|
length--;
|
|
}
|
|
}
|
|
|
|
/*
|
|
compute
|
|
|
|
dest ^= rf_qfor[28-coeff][rf_rn[coeff+1] (old^new) ]
|
|
|
|
on a five bit basis.
|
|
optimization: compute old ^ new on 64 bit basis.
|
|
|
|
length in bytes.
|
|
*/
|
|
|
|
static void QDelta(
|
|
char *dest,
|
|
char *obuf,
|
|
char *nbuf,
|
|
unsigned length,
|
|
unsigned char coeff)
|
|
{
|
|
unsigned long a, d, new;
|
|
unsigned long a1, a2;
|
|
unsigned int *q = &(rf_qfor[28-coeff][0]);
|
|
unsigned r = rf_rn[coeff+1];
|
|
|
|
#ifdef KERNEL
|
|
/* PQ in kernel currently not supported because the encoding/decoding table is not present */
|
|
bzero(dest, length);
|
|
#else /* KERNEL */
|
|
/* this code probably doesn't work and should be rewritten -wvcii */
|
|
/* 13 5 bit quants in a 64 bit word */
|
|
length /= 8;
|
|
while (length)
|
|
{
|
|
a = *obuf++; /* XXX need to reorg to avoid cache conflicts */
|
|
a ^= *nbuf++;
|
|
d = *dest;
|
|
a1 = EXTRACT(a,0) ^ r;
|
|
a2 = EXTRACT(a,1) ^ r;
|
|
a1 = q[a1];
|
|
a2 = q[a2];
|
|
new = INSERT(a2,1) | a1 ;
|
|
a1 = EXTRACT(a,2) ^ r;
|
|
a2 = EXTRACT(a,3) ^ r;
|
|
a1 = q[a1];
|
|
a2 = q[a2];
|
|
new = new | INSERT(a1,2) | INSERT (a2,3);
|
|
a1 = EXTRACT(a,4) ^ r;
|
|
a2 = EXTRACT(a,5) ^ r;
|
|
a1 = q[a1];
|
|
a2 = q[a2];
|
|
new = new | INSERT(a1,4) | INSERT (a2,5);
|
|
a1 = EXTRACT(a,5) ^ r;
|
|
a2 = EXTRACT(a,6) ^ r;
|
|
a1 = q[a1];
|
|
a2 = q[a2];
|
|
new = new | INSERT(a1,5) | INSERT (a2,6);
|
|
#if RF_LONGSHIFT > 2
|
|
a1 = EXTRACT(a,7) ^ r;
|
|
a2 = EXTRACT(a,8) ^ r;
|
|
a1 = q[a1];
|
|
a2 = q[a2];
|
|
new = new | INSERT(a1,7) | INSERT (a2,8);
|
|
a1 = EXTRACT(a,9) ^ r;
|
|
a2 = EXTRACT(a,10) ^ r;
|
|
a1 = q[a1];
|
|
a2 = q[a2];
|
|
new = new | INSERT(a1,9) | INSERT (a2,10);
|
|
a1 = EXTRACT(a,11) ^ r;
|
|
a2 = EXTRACT(a,12) ^ r;
|
|
a1 = q[a1];
|
|
a2 = q[a2];
|
|
new = new | INSERT(a1,11) | INSERT (a2,12);
|
|
#endif /* RF_LONGSHIFT > 2 */
|
|
d ^= new;
|
|
*dest++ = d;
|
|
length--;
|
|
}
|
|
#endif /* KERNEL */
|
|
}
|
|
|
|
/*
|
|
recover columns a and b from the given p and q into
|
|
bufs abuf and bbuf. All bufs are word aligned.
|
|
Length is in bytes.
|
|
*/
|
|
|
|
|
|
/*
|
|
* XXX
|
|
*
|
|
* Everything about this seems wrong.
|
|
*/
|
|
void rf_PQ_recover(pbuf,qbuf,abuf,bbuf,length,coeff_a,coeff_b)
|
|
unsigned long *pbuf;
|
|
unsigned long *qbuf;
|
|
unsigned long *abuf;
|
|
unsigned long *bbuf;
|
|
unsigned length;
|
|
unsigned coeff_a;
|
|
unsigned coeff_b;
|
|
{
|
|
unsigned long p, q, a, a0, a1;
|
|
int col = (29 * coeff_a) + coeff_b;
|
|
unsigned char *q0 = & (rf_qinv[col][0]);
|
|
|
|
length /= 8;
|
|
while (length)
|
|
{
|
|
p = *pbuf++;
|
|
q = *qbuf++;
|
|
a0 = EXTRACT(p,0);
|
|
a1 = EXTRACT(q,0);
|
|
a = q0[a0<<5 | a1];
|
|
#define MF(i) \
|
|
a0 = EXTRACT(p,i); \
|
|
a1 = EXTRACT(q,i); \
|
|
a = a | INSERT(q0[a0<<5 | a1],i)
|
|
|
|
MF(1);
|
|
MF(2);
|
|
MF(3);
|
|
MF(4);
|
|
MF(5);
|
|
MF(6);
|
|
#if 0
|
|
MF(7);
|
|
MF(8);
|
|
MF(9);
|
|
MF(10);
|
|
MF(11);
|
|
MF(12);
|
|
#endif /* 0 */
|
|
*abuf++ = a;
|
|
*bbuf++ = a ^ p;
|
|
length--;
|
|
}
|
|
}
|
|
|
|
/*
|
|
Lost parity and a data column. Recover that data column.
|
|
Assume col coeff is lost. Let q the contents of Q after
|
|
all surviving data columns have been q-xored out of it.
|
|
Then we have the equation
|
|
|
|
q[28-coeff][a_i ^ r_i+1] = q
|
|
|
|
but q is cyclic with period 31.
|
|
So q[3+coeff][q[28-coeff][a_i ^ r_{i+1}]] =
|
|
q[31][a_i ^ r_{i+1}] = a_i ^ r_{i+1} .
|
|
|
|
so a_i = r_{coeff+1} ^ q[3+coeff][q]
|
|
|
|
The routine is passed q buffer and the buffer
|
|
the data is to be recovered into. They can be the same.
|
|
*/
|
|
|
|
|
|
|
|
static void rf_InvertQ(
|
|
unsigned long *qbuf,
|
|
unsigned long *abuf,
|
|
unsigned length,
|
|
unsigned coeff)
|
|
{
|
|
unsigned long a, new;
|
|
unsigned long a1, a2;
|
|
unsigned int *q = &(rf_qfor[3+coeff][0]);
|
|
unsigned r = rf_rn[coeff+1];
|
|
|
|
/* 13 5 bit quants in a 64 bit word */
|
|
length /= 8;
|
|
while (length)
|
|
{
|
|
a = *qbuf++;
|
|
a1 = EXTRACT(a,0);
|
|
a2 = EXTRACT(a,1);
|
|
a1 = r ^ q[a1];
|
|
a2 = r ^ q[a2];
|
|
new = INSERT(a2,1) | a1;
|
|
#define M(i,j) \
|
|
a1 = EXTRACT(a,i); \
|
|
a2 = EXTRACT(a,j); \
|
|
a1 = r ^ q[a1]; \
|
|
a2 = r ^ q[a2]; \
|
|
new = new | INSERT(a1,i) | INSERT(a2,j)
|
|
|
|
M(2,3);
|
|
M(4,5);
|
|
M(5,6);
|
|
#if RF_LONGSHIFT > 2
|
|
M(7,8);
|
|
M(9,10);
|
|
M(11,12);
|
|
#endif /* RF_LONGSHIFT > 2 */
|
|
*abuf++ = new;
|
|
length--;
|
|
}
|
|
}
|
|
|
|
#endif /* (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) */
|