933 lines
26 KiB
C
933 lines
26 KiB
C
/* $NetBSD: rf_pq.c,v 1.13 2003/11/16 20:32:05 oster Exp $ */
|
|
/*
|
|
* Copyright (c) 1995 Carnegie-Mellon University.
|
|
* All rights reserved.
|
|
*
|
|
* Author: Daniel Stodolsky
|
|
*
|
|
* Permission to use, copy, modify and distribute this software and
|
|
* its documentation is hereby granted, provided that both the copyright
|
|
* notice and this permission notice appear in all copies of the
|
|
* software, derivative works or modified versions, and any portions
|
|
* thereof, and that both notices appear in supporting documentation.
|
|
*
|
|
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
|
|
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
|
|
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
|
|
*
|
|
* Carnegie Mellon requests users of this software to return to
|
|
*
|
|
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
|
|
* School of Computer Science
|
|
* Carnegie Mellon University
|
|
* Pittsburgh PA 15213-3890
|
|
*
|
|
* any improvements or extensions that they make and grant Carnegie the
|
|
* rights to redistribute these changes.
|
|
*/
|
|
|
|
/*
|
|
* Code for RAID level 6 (P + Q) disk array architecture.
|
|
*/
|
|
|
|
#include <sys/cdefs.h>
|
|
__KERNEL_RCSID(0, "$NetBSD: rf_pq.c,v 1.13 2003/11/16 20:32:05 oster Exp $");
|
|
|
|
#include "rf_archs.h"
|
|
|
|
#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) || (RF_INCLUDE_EVENODD > 0)
|
|
|
|
#include <dev/raidframe/raidframevar.h>
|
|
|
|
#include "rf_raid.h"
|
|
#include "rf_dag.h"
|
|
#include "rf_dagffrd.h"
|
|
#include "rf_dagffwr.h"
|
|
#include "rf_dagdegrd.h"
|
|
#include "rf_dagdegwr.h"
|
|
#include "rf_dagutils.h"
|
|
#include "rf_dagfuncs.h"
|
|
#include "rf_etimer.h"
|
|
#include "rf_pqdeg.h"
|
|
#include "rf_general.h"
|
|
#include "rf_map.h"
|
|
#include "rf_pq.h"
|
|
|
|
RF_RedFuncs_t rf_pFuncs = {rf_RegularONPFunc, "Regular Old-New P", rf_SimpleONPFunc, "Simple Old-New P"};
|
|
RF_RedFuncs_t rf_pRecoveryFuncs = {rf_RecoveryPFunc, "Recovery P Func", rf_RecoveryPFunc, "Recovery P Func"};
|
|
|
|
int
|
|
rf_RegularONPFunc(node)
|
|
RF_DagNode_t *node;
|
|
{
|
|
return (rf_RegularXorFunc(node));
|
|
}
|
|
/*
|
|
same as simpleONQ func, but the coefficient is always 1
|
|
*/
|
|
|
|
int
|
|
rf_SimpleONPFunc(node)
|
|
RF_DagNode_t *node;
|
|
{
|
|
return (rf_SimpleXorFunc(node));
|
|
}
|
|
|
|
int
|
|
rf_RecoveryPFunc(node)
|
|
RF_DagNode_t *node;
|
|
{
|
|
return (rf_RecoveryXorFunc(node));
|
|
}
|
|
|
|
int
|
|
rf_RegularPFunc(node)
|
|
RF_DagNode_t *node;
|
|
{
|
|
return (rf_RegularXorFunc(node));
|
|
}
|
|
#endif /* (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) || (RF_INCLUDE_EVENODD > 0) */
|
|
#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
|
|
|
|
static void
|
|
QDelta(char *dest, char *obuf, char *nbuf, unsigned length,
|
|
unsigned char coeff);
|
|
static void
|
|
rf_InvertQ(unsigned long *qbuf, unsigned long *abuf,
|
|
unsigned length, unsigned coeff);
|
|
|
|
RF_RedFuncs_t rf_qFuncs = {rf_RegularONQFunc, "Regular Old-New Q", rf_SimpleONQFunc, "Simple Old-New Q"};
|
|
RF_RedFuncs_t rf_qRecoveryFuncs = {rf_RecoveryQFunc, "Recovery Q Func", rf_RecoveryQFunc, "Recovery Q Func"};
|
|
RF_RedFuncs_t rf_pqRecoveryFuncs = {rf_RecoveryPQFunc, "Recovery PQ Func", rf_RecoveryPQFunc, "Recovery PQ Func"};
|
|
|
|
void
|
|
rf_PQDagSelect(
|
|
RF_Raid_t * raidPtr,
|
|
RF_IoType_t type,
|
|
RF_AccessStripeMap_t * asmap,
|
|
RF_VoidFuncPtr * createFunc)
|
|
{
|
|
RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
|
|
unsigned ndfail = asmap->numDataFailed;
|
|
unsigned npfail = asmap->numParityFailed;
|
|
unsigned ntfail = npfail + ndfail;
|
|
|
|
RF_ASSERT(RF_IO_IS_R_OR_W(type));
|
|
if (ntfail > 2) {
|
|
RF_ERRORMSG("more than two disks failed in a single group! Aborting I/O operation.\n");
|
|
*createFunc = NULL;
|
|
return;
|
|
}
|
|
/* ok, we can do this I/O */
|
|
if (type == RF_IO_TYPE_READ) {
|
|
switch (ndfail) {
|
|
case 0:
|
|
/* fault free read */
|
|
*createFunc = (RF_VoidFuncPtr) rf_CreateFaultFreeReadDAG; /* same as raid 5 */
|
|
break;
|
|
case 1:
|
|
/* lost a single data unit */
|
|
/* two cases: (1) parity is not lost. do a normal raid
|
|
* 5 reconstruct read. (2) parity is lost. do a
|
|
* reconstruct read using "q". */
|
|
if (ntfail == 2) { /* also lost redundancy */
|
|
if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY)
|
|
*createFunc = (RF_VoidFuncPtr) rf_PQ_110_CreateReadDAG;
|
|
else
|
|
*createFunc = (RF_VoidFuncPtr) rf_PQ_101_CreateReadDAG;
|
|
} else {
|
|
/* P and Q are ok. But is there a failure in
|
|
* some unaccessed data unit? */
|
|
if (rf_NumFailedDataUnitsInStripe(raidPtr, asmap) == 2)
|
|
*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateReadDAG;
|
|
else
|
|
*createFunc = (RF_VoidFuncPtr) rf_PQ_100_CreateReadDAG;
|
|
}
|
|
break;
|
|
case 2:
|
|
/* lost two data units */
|
|
*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateReadDAG;
|
|
break;
|
|
}
|
|
return;
|
|
}
|
|
/* a write */
|
|
switch (ntfail) {
|
|
case 0: /* fault free */
|
|
if (rf_suppressLocksAndLargeWrites ||
|
|
(((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) && (layoutPtr->numDataCol != 1)) ||
|
|
(asmap->parityInfo->next != NULL) || (asmap->qInfo->next != NULL) || rf_CheckStripeForFailures(raidPtr, asmap))) {
|
|
|
|
*createFunc = (RF_VoidFuncPtr) rf_PQCreateSmallWriteDAG;
|
|
} else {
|
|
*createFunc = (RF_VoidFuncPtr) rf_PQCreateLargeWriteDAG;
|
|
}
|
|
break;
|
|
|
|
case 1: /* single disk fault */
|
|
if (npfail == 1) {
|
|
RF_ASSERT((asmap->failedPDAs[0]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q));
|
|
if (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q) { /* q died, treat like
|
|
* normal mode raid5
|
|
* write. */
|
|
if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1))
|
|
|| rf_NumFailedDataUnitsInStripe(raidPtr, asmap))
|
|
*createFunc = (RF_VoidFuncPtr) rf_PQ_001_CreateSmallWriteDAG;
|
|
else
|
|
*createFunc = (RF_VoidFuncPtr) rf_PQ_001_CreateLargeWriteDAG;
|
|
} else {/* parity died, small write only updating Q */
|
|
if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1))
|
|
|| rf_NumFailedDataUnitsInStripe(raidPtr, asmap))
|
|
*createFunc = (RF_VoidFuncPtr) rf_PQ_010_CreateSmallWriteDAG;
|
|
else
|
|
*createFunc = (RF_VoidFuncPtr) rf_PQ_010_CreateLargeWriteDAG;
|
|
}
|
|
} else { /* data missing. Do a P reconstruct write if
|
|
* only a single data unit is lost in the
|
|
* stripe, otherwise a PQ reconstruct write. */
|
|
if (rf_NumFailedDataUnitsInStripe(raidPtr, asmap) == 2)
|
|
*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateWriteDAG;
|
|
else
|
|
*createFunc = (RF_VoidFuncPtr) rf_PQ_100_CreateWriteDAG;
|
|
}
|
|
break;
|
|
|
|
case 2: /* two disk faults */
|
|
switch (npfail) {
|
|
case 2: /* both p and q dead */
|
|
*createFunc = (RF_VoidFuncPtr) rf_PQ_011_CreateWriteDAG;
|
|
break;
|
|
case 1: /* either p or q and dead data */
|
|
RF_ASSERT(asmap->failedPDAs[0]->type == RF_PDA_TYPE_DATA);
|
|
RF_ASSERT((asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q));
|
|
if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q)
|
|
*createFunc = (RF_VoidFuncPtr) rf_PQ_101_CreateWriteDAG;
|
|
else
|
|
*createFunc = (RF_VoidFuncPtr) rf_PQ_110_CreateWriteDAG;
|
|
break;
|
|
case 0: /* double data loss */
|
|
*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateWriteDAG;
|
|
break;
|
|
}
|
|
break;
|
|
|
|
default: /* more than 2 disk faults */
|
|
*createFunc = NULL;
|
|
RF_PANIC();
|
|
}
|
|
return;
|
|
}
|
|
/*
|
|
Used as a stop gap info function
|
|
*/
|
|
#if 0
|
|
static void
|
|
PQOne(raidPtr, nSucc, nAnte, asmap)
|
|
RF_Raid_t *raidPtr;
|
|
int *nSucc;
|
|
int *nAnte;
|
|
RF_AccessStripeMap_t *asmap;
|
|
{
|
|
*nSucc = *nAnte = 1;
|
|
}
|
|
|
|
static void
|
|
PQOneTwo(raidPtr, nSucc, nAnte, asmap)
|
|
RF_Raid_t *raidPtr;
|
|
int *nSucc;
|
|
int *nAnte;
|
|
RF_AccessStripeMap_t *asmap;
|
|
{
|
|
*nSucc = 1;
|
|
*nAnte = 2;
|
|
}
|
|
#endif
|
|
|
|
RF_CREATE_DAG_FUNC_DECL(rf_PQCreateLargeWriteDAG)
|
|
{
|
|
rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 2,
|
|
rf_RegularPQFunc, RF_FALSE);
|
|
}
|
|
|
|
int
|
|
rf_RegularONQFunc(node)
|
|
RF_DagNode_t *node;
|
|
{
|
|
int np = node->numParams;
|
|
int d;
|
|
RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
|
|
int i;
|
|
RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
|
|
RF_Etimer_t timer;
|
|
char *qbuf, *qpbuf;
|
|
char *obuf, *nbuf;
|
|
RF_PhysDiskAddr_t *old, *new;
|
|
unsigned long coeff;
|
|
unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
|
|
|
|
RF_ETIMER_START(timer);
|
|
|
|
d = (np - 3) / 4;
|
|
RF_ASSERT(4 * d + 3 == np);
|
|
qbuf = (char *) node->params[2 * d + 1].p; /* q buffer */
|
|
for (i = 0; i < d; i++) {
|
|
old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
|
|
obuf = (char *) node->params[2 * i + 1].p;
|
|
new = (RF_PhysDiskAddr_t *) node->params[2 * (d + 1 + i)].p;
|
|
nbuf = (char *) node->params[2 * (d + 1 + i) + 1].p;
|
|
RF_ASSERT(new->numSector == old->numSector);
|
|
RF_ASSERT(new->raidAddress == old->raidAddress);
|
|
/* the stripe unit within the stripe tells us the coefficient
|
|
* to use for the multiply. */
|
|
coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), new->raidAddress);
|
|
/* compute the data unit offset within the column, then add
|
|
* one */
|
|
coeff = (coeff % raidPtr->Layout.numDataCol);
|
|
qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, old->startSector % secPerSU);
|
|
QDelta(qpbuf, obuf, nbuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
|
|
}
|
|
|
|
RF_ETIMER_STOP(timer);
|
|
RF_ETIMER_EVAL(timer);
|
|
tracerec->q_us += RF_ETIMER_VAL_US(timer);
|
|
rf_GenericWakeupFunc(node, 0); /* call wake func explicitly since no
|
|
* I/O in this node */
|
|
return (0);
|
|
}
|
|
/*
|
|
See the SimpleXORFunc for the difference between a simple and regular func.
|
|
These Q functions should be used for
|
|
|
|
new q = Q(data,old data,old q)
|
|
|
|
style updates and not for
|
|
|
|
q = ( new data, new data, .... )
|
|
|
|
computations.
|
|
|
|
The simple q takes 2(2d+1)+1 params, where d is the number
|
|
of stripes written. The order of params is
|
|
old data pda_0, old data buffer_0, old data pda_1, old data buffer_1, ... old data pda_d, old data buffer_d
|
|
[2d] old q pda_0, old q buffer
|
|
[2d+2] new data pda_0, new data buffer_0, ... new data pda_d, new data buffer_d
|
|
raidPtr
|
|
*/
|
|
|
|
int
|
|
rf_SimpleONQFunc(node)
|
|
RF_DagNode_t *node;
|
|
{
|
|
int np = node->numParams;
|
|
int d;
|
|
RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
|
|
int i;
|
|
RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
|
|
RF_Etimer_t timer;
|
|
char *qbuf;
|
|
char *obuf, *nbuf;
|
|
RF_PhysDiskAddr_t *old, *new;
|
|
unsigned long coeff;
|
|
|
|
RF_ETIMER_START(timer);
|
|
|
|
d = (np - 3) / 4;
|
|
RF_ASSERT(4 * d + 3 == np);
|
|
qbuf = (char *) node->params[2 * d + 1].p; /* q buffer */
|
|
for (i = 0; i < d; i++) {
|
|
old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
|
|
obuf = (char *) node->params[2 * i + 1].p;
|
|
new = (RF_PhysDiskAddr_t *) node->params[2 * (d + 1 + i)].p;
|
|
nbuf = (char *) node->params[2 * (d + 1 + i) + 1].p;
|
|
RF_ASSERT(new->numSector == old->numSector);
|
|
RF_ASSERT(new->raidAddress == old->raidAddress);
|
|
/* the stripe unit within the stripe tells us the coefficient
|
|
* to use for the multiply. */
|
|
coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), new->raidAddress);
|
|
/* compute the data unit offset within the column, then add
|
|
* one */
|
|
coeff = (coeff % raidPtr->Layout.numDataCol);
|
|
QDelta(qbuf, obuf, nbuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
|
|
}
|
|
|
|
RF_ETIMER_STOP(timer);
|
|
RF_ETIMER_EVAL(timer);
|
|
tracerec->q_us += RF_ETIMER_VAL_US(timer);
|
|
rf_GenericWakeupFunc(node, 0); /* call wake func explicitly since no
|
|
* I/O in this node */
|
|
return (0);
|
|
}
|
|
RF_CREATE_DAG_FUNC_DECL(rf_PQCreateSmallWriteDAG)
|
|
{
|
|
rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_pFuncs, &rf_qFuncs);
|
|
}
|
|
|
|
static void RegularQSubr(RF_DagNode_t *node, char *qbuf);

/*
 * Accumulate Q over every (pda, buffer) pair of the node into qbuf.
 *
 * The node carries 2d+1 params: d (pda, buffer) pairs followed by raidPtr.
 * Each buffer is folded into qbuf with rf_IncQ at the pda's sector offset
 * within its stripe unit.  The time spent is added to the trace record's
 * q_us.  No wakeup is issued here.
 */
static void
RegularQSubr(RF_DagNode_t *node, char *qbuf)
{
	int np = node->numParams;
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	RF_PhysDiskAddr_t *pda;
	char *src, *dst;
	unsigned long col;
	RF_Etimer_t timer;
	int d, k;

	RF_ETIMER_START(timer);

	d = (np - 1) / 2;
	RF_ASSERT(2 * d + 1 == np);

	for (k = 0; k < d; k++) {
		pda = (RF_PhysDiskAddr_t *) node->params[2 * k].p;
		src = (char *) node->params[2 * k + 1].p;

		/* stripe unit ID modulo the number of data columns */
		col = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), pda->raidAddress);
		col %= raidPtr->Layout.numDataCol;

		/*
		 * The input buffers may not all be aligned with the start
		 * of the stripe, so shift by their sector offset within the
		 * stripe unit.
		 */
		dst = qbuf + rf_RaidAddressToByte(raidPtr, pda->startSector % secPerSU);
		rf_IncQ((unsigned long *) dst, (unsigned long *) src, rf_RaidAddressToByte(raidPtr, pda->numSector), col);
	}

	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	tracerec->q_us += RF_ETIMER_VAL_US(timer);
}
|
|
/*
|
|
used in degraded writes.
|
|
*/
|
|
|
|
static void DegrQSubr(RF_DagNode_t *node);

/*
 * Q accumulation used in degraded writes.
 *
 * The node carries 2d+2 params: d (pda, buffer) pairs, then the failed
 * PDA, then raidPtr.  Each buffer is folded with rf_IncQ into
 * node->results[1] at its sector offset relative to the start of the
 * failed PDA within the stripe unit.  The time spent is added to the
 * trace record's q_us.  No wakeup is issued here.
 */
static void
DegrQSubr(RF_DagNode_t *node)
{
	int np = node->numParams;
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	char *qResult = node->results[1];
	RF_PhysDiskAddr_t *old;
	char *src, *dst;
	unsigned long col;
	unsigned fail_start;
	RF_Etimer_t timer;
	int d, j, k;

	/* the second-to-last param is the failed PDA */
	old = (RF_PhysDiskAddr_t *) node->params[np - 2].p;
	fail_start = old->startSector % secPerSU;

	RF_ETIMER_START(timer);

	d = (np - 2) / 2;
	RF_ASSERT(2 * d + 2 == np);

	for (k = 0; k < d; k++) {
		old = (RF_PhysDiskAddr_t *) node->params[2 * k].p;
		src = (char *) node->params[2 * k + 1].p;

		/* stripe unit ID modulo the number of data columns */
		col = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress);
		col %= raidPtr->Layout.numDataCol;

		/*
		 * The input buffers may not all be aligned with the start
		 * of the stripe, so shift by their sector offset within the
		 * stripe unit, measured from the failed region's start.
		 */
		j = old->startSector % secPerSU;
		RF_ASSERT(j >= fail_start);
		dst = qResult + rf_RaidAddressToByte(raidPtr, j - fail_start);
		rf_IncQ((unsigned long *) dst, (unsigned long *) src, rf_RaidAddressToByte(raidPtr, old->numSector), col);
	}

	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	tracerec->q_us += RF_ETIMER_VAL_US(timer);
}
|
|
/*
|
|
Called by large write code to compute the new parity and the new q.
|
|
|
|
structure of the params:
|
|
|
|
pda_0, buffer_0, pda_1 , buffer_1, ... , pda_d, buffer_d ( d = numDataCol )
|
|
raidPtr
|
|
|
|
for a total of 2d+1 arguments.
|
|
The result buffers results[0], results[1] are the buffers for the p and q,
|
|
respectively.
|
|
|
|
We compute Q first, then compute P. The P calculation may try to reuse
|
|
one of the input buffers for its output, so if we computed P first, we would
|
|
corrupt the input for the q calculation.
|
|
*/
|
|
|
|
int
|
|
rf_RegularPQFunc(node)
|
|
RF_DagNode_t *node;
|
|
{
|
|
RegularQSubr(node, node->results[1]);
|
|
return (rf_RegularXorFunc(node)); /* does the wakeup */
|
|
}
|
|
|
|
int
|
|
rf_RegularQFunc(node)
|
|
RF_DagNode_t *node;
|
|
{
|
|
/* Almost ... adjust Qsubr args */
|
|
RegularQSubr(node, node->results[0]);
|
|
rf_GenericWakeupFunc(node, 0); /* call wake func explicitly since no
|
|
* I/O in this node */
|
|
return (0);
|
|
}
|
|
/*
|
|
Called by singly degraded write code to compute the new parity and the new q.
|
|
|
|
structure of the params:
|
|
|
|
pda_0, buffer_0, pda_1 , buffer_1, ... , pda_d, buffer_d
|
|
failedPDA raidPtr
|
|
|
|
for a total of 2d+2 arguments.
|
|
The result buffers results[0], results[1] are the buffers for the parity and q,
|
|
respectively.
|
|
|
|
We compute Q first, then compute parity. The parity calculation may try to reuse
|
|
one of the input buffers for its output, so if we computed parity first, we would
|
|
corrupt the input for the q calculation.
|
|
|
|
We treat this identically to the regularPQ case, ignoring the failedPDA extra argument.
|
|
*/
|
|
|
|
void
|
|
rf_Degraded_100_PQFunc(node)
|
|
RF_DagNode_t *node;
|
|
{
|
|
int np = node->numParams;
|
|
|
|
RF_ASSERT(np >= 2);
|
|
DegrQSubr(node);
|
|
rf_RecoveryXorFunc(node);
|
|
}
|
|
|
|
|
|
/*
|
|
The two below are used when reading a stripe with a single lost data unit.
|
|
The parameters are
|
|
|
|
pda_0, buffer_0, .... pda_n, buffer_n, P pda, P buffer, failedPDA, raidPtr
|
|
|
|
and results[0] contains the data buffer. Which is originally zero-filled.
|
|
|
|
*/
|
|
|
|
/* this Q func is used by the degraded-mode dag functions to recover lost data.
|
|
* the second-to-last parameter is the PDA for the failed portion of the access.
|
|
* the code here looks at this PDA and assumes that the xor target buffer is
|
|
* equal in size to the number of sectors in the failed PDA. It then uses
|
|
* the other PDAs in the parameter list to determine where within the target
|
|
* buffer the corresponding data should be xored.
|
|
*
|
|
* Recall the basic equation is
|
|
*
|
|
* Q = ( data_1 + 2 * data_2 ... + k * data_k ) mod 256
|
|
*
|
|
* so to recover data_j we need
|
|
*
|
|
* J data_j = (Q - data_1 - 2 data_2 ....- k* data_k) mod 256
|
|
*
|
|
* So the coefficient for each buffer is (255 - data_col), and j should be initialized by
|
|
* copying Q into it. Then we need to do a table lookup to convert to solve
|
|
* data_j /= J
|
|
*
|
|
*
|
|
*/
|
|
int
|
|
rf_RecoveryQFunc(node)
|
|
RF_DagNode_t *node;
|
|
{
|
|
RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
|
|
RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
|
|
RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p;
|
|
int i;
|
|
RF_PhysDiskAddr_t *pda;
|
|
RF_RaidAddr_t suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector);
|
|
char *srcbuf, *destbuf;
|
|
RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
|
|
RF_Etimer_t timer;
|
|
unsigned long coeff;
|
|
|
|
RF_ETIMER_START(timer);
|
|
/* start by copying Q into the buffer */
|
|
memcpy(node->results[0], node->params[node->numParams - 3].p,
|
|
rf_RaidAddressToByte(raidPtr, failedPDA->numSector));
|
|
for (i = 0; i < node->numParams - 4; i += 2) {
|
|
RF_ASSERT(node->params[i + 1].p != node->results[0]);
|
|
pda = (RF_PhysDiskAddr_t *) node->params[i].p;
|
|
srcbuf = (char *) node->params[i + 1].p;
|
|
suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
|
|
destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset);
|
|
coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), pda->raidAddress);
|
|
/* compute the data unit offset within the column */
|
|
coeff = (coeff % raidPtr->Layout.numDataCol);
|
|
rf_IncQ((unsigned long *) destbuf, (unsigned long *) srcbuf, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff);
|
|
}
|
|
/* Do the nasty inversion now */
|
|
coeff = (rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), failedPDA->startSector) % raidPtr->Layout.numDataCol);
|
|
rf_InvertQ(node->results[0], node->results[0], rf_RaidAddressToByte(raidPtr, pda->numSector), coeff);
|
|
RF_ETIMER_STOP(timer);
|
|
RF_ETIMER_EVAL(timer);
|
|
tracerec->q_us += RF_ETIMER_VAL_US(timer);
|
|
rf_GenericWakeupFunc(node, 0);
|
|
return (0);
|
|
}
|
|
|
|
int
|
|
rf_RecoveryPQFunc(node)
|
|
RF_DagNode_t *node;
|
|
{
|
|
RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
|
|
printf("raid%d: Recovery from PQ not implemented.\n",raidPtr->raidid);
|
|
return (1);
|
|
}
|
|
/*
|
|
Degraded write Q subroutine.
|
|
Used when P is dead.
|
|
Large-write style Q computation.
|
|
Parameters
|
|
|
|
(pda,buf),(pda,buf),.....,(failedPDA,bufPtr),failedPDA,raidPtr.
|
|
|
|
We ignore failedPDA.
|
|
|
|
This is a "simple style" recovery func.
|
|
*/
|
|
|
|
void
|
|
rf_PQ_DegradedWriteQFunc(node)
|
|
RF_DagNode_t *node;
|
|
{
|
|
int np = node->numParams;
|
|
int d;
|
|
RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
|
|
unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
|
|
int i;
|
|
RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
|
|
RF_Etimer_t timer;
|
|
char *qbuf = node->results[0];
|
|
char *obuf, *qpbuf;
|
|
RF_PhysDiskAddr_t *old;
|
|
unsigned long coeff;
|
|
int fail_start, j;
|
|
|
|
old = (RF_PhysDiskAddr_t *) node->params[np - 2].p;
|
|
fail_start = old->startSector % secPerSU;
|
|
|
|
RF_ETIMER_START(timer);
|
|
|
|
d = (np - 2) / 2;
|
|
RF_ASSERT(2 * d + 2 == np);
|
|
|
|
for (i = 0; i < d; i++) {
|
|
old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
|
|
obuf = (char *) node->params[2 * i + 1].p;
|
|
coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress);
|
|
/* compute the data unit offset within the column, then add
|
|
* one */
|
|
coeff = (coeff % raidPtr->Layout.numDataCol);
|
|
j = old->startSector % secPerSU;
|
|
RF_ASSERT(j >= fail_start);
|
|
qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, j - fail_start);
|
|
rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
|
|
}
|
|
|
|
RF_ETIMER_STOP(timer);
|
|
RF_ETIMER_EVAL(timer);
|
|
tracerec->q_us += RF_ETIMER_VAL_US(timer);
|
|
rf_GenericWakeupFunc(node, 0);
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Q computations */
|
|
|
|
/*
|
|
coeff - column;
|
|
|
|
compute dest ^= qfor[28-coeff][rn[coeff+1] a]
|
|
|
|
on 5-bit basis;
|
|
length in bytes;
|
|
*/
|
|
|
|
void
|
|
rf_IncQ(dest, buf, length, coeff)
|
|
unsigned long *dest;
|
|
unsigned long *buf;
|
|
unsigned length;
|
|
unsigned coeff;
|
|
{
|
|
unsigned long a, d, new;
|
|
unsigned long a1, a2;
|
|
unsigned int *q = &(rf_qfor[28 - coeff][0]);
|
|
unsigned r = rf_rn[coeff + 1];
|
|
|
|
#define EXTRACT(a,i) ((a >> (5L*i)) & 0x1f)
|
|
#define INSERT(a,i) (a << (5L*i))
|
|
|
|
length /= 8;
|
|
/* 13 5 bit quants in a 64 bit word */
|
|
while (length) {
|
|
a = *buf++;
|
|
d = *dest;
|
|
a1 = EXTRACT(a, 0) ^ r;
|
|
a2 = EXTRACT(a, 1) ^ r;
|
|
new = INSERT(a2, 1) | a1;
|
|
a1 = EXTRACT(a, 2) ^ r;
|
|
a2 = EXTRACT(a, 3) ^ r;
|
|
a1 = q[a1];
|
|
a2 = q[a2];
|
|
new = new | INSERT(a1, 2) | INSERT(a2, 3);
|
|
a1 = EXTRACT(a, 4) ^ r;
|
|
a2 = EXTRACT(a, 5) ^ r;
|
|
a1 = q[a1];
|
|
a2 = q[a2];
|
|
new = new | INSERT(a1, 4) | INSERT(a2, 5);
|
|
a1 = EXTRACT(a, 5) ^ r;
|
|
a2 = EXTRACT(a, 6) ^ r;
|
|
a1 = q[a1];
|
|
a2 = q[a2];
|
|
new = new | INSERT(a1, 5) | INSERT(a2, 6);
|
|
#if RF_LONGSHIFT > 2
|
|
a1 = EXTRACT(a, 7) ^ r;
|
|
a2 = EXTRACT(a, 8) ^ r;
|
|
a1 = q[a1];
|
|
a2 = q[a2];
|
|
new = new | INSERT(a1, 7) | INSERT(a2, 8);
|
|
a1 = EXTRACT(a, 9) ^ r;
|
|
a2 = EXTRACT(a, 10) ^ r;
|
|
a1 = q[a1];
|
|
a2 = q[a2];
|
|
new = new | INSERT(a1, 9) | INSERT(a2, 10);
|
|
a1 = EXTRACT(a, 11) ^ r;
|
|
a2 = EXTRACT(a, 12) ^ r;
|
|
a1 = q[a1];
|
|
a2 = q[a2];
|
|
new = new | INSERT(a1, 11) | INSERT(a2, 12);
|
|
#endif /* RF_LONGSHIFT > 2 */
|
|
d ^= new;
|
|
*dest++ = d;
|
|
length--;
|
|
}
|
|
}
|
|
/*
|
|
compute
|
|
|
|
dest ^= rf_qfor[28-coeff][rf_rn[coeff+1] (old^new) ]
|
|
|
|
on a five bit basis.
|
|
optimization: compute old ^ new on 64 bit basis.
|
|
|
|
length in bytes.
|
|
*/
|
|
|
|
static void
|
|
QDelta(
|
|
char *dest,
|
|
char *obuf,
|
|
char *nbuf,
|
|
unsigned length,
|
|
unsigned char coeff)
|
|
{
|
|
unsigned long a, d, new;
|
|
unsigned long a1, a2;
|
|
unsigned int *q = &(rf_qfor[28 - coeff][0]);
|
|
unsigned int r = rf_rn[coeff + 1];
|
|
|
|
r = a1 = a2 = new = d = a = 0; /* XXX for now... */
|
|
q = NULL; /* XXX for now */
|
|
|
|
#ifdef _KERNEL
|
|
/* PQ in kernel currently not supported because the encoding/decoding
|
|
* table is not present */
|
|
memset(dest, 0, length);
|
|
#else /* KERNEL */
|
|
/* this code probably doesn't work and should be rewritten -wvcii */
|
|
/* 13 5 bit quants in a 64 bit word */
|
|
length /= 8;
|
|
while (length) {
|
|
a = *obuf++; /* XXX need to reorg to avoid cache conflicts */
|
|
a ^= *nbuf++;
|
|
d = *dest;
|
|
a1 = EXTRACT(a, 0) ^ r;
|
|
a2 = EXTRACT(a, 1) ^ r;
|
|
a1 = q[a1];
|
|
a2 = q[a2];
|
|
new = INSERT(a2, 1) | a1;
|
|
a1 = EXTRACT(a, 2) ^ r;
|
|
a2 = EXTRACT(a, 3) ^ r;
|
|
a1 = q[a1];
|
|
a2 = q[a2];
|
|
new = new | INSERT(a1, 2) | INSERT(a2, 3);
|
|
a1 = EXTRACT(a, 4) ^ r;
|
|
a2 = EXTRACT(a, 5) ^ r;
|
|
a1 = q[a1];
|
|
a2 = q[a2];
|
|
new = new | INSERT(a1, 4) | INSERT(a2, 5);
|
|
a1 = EXTRACT(a, 5) ^ r;
|
|
a2 = EXTRACT(a, 6) ^ r;
|
|
a1 = q[a1];
|
|
a2 = q[a2];
|
|
new = new | INSERT(a1, 5) | INSERT(a2, 6);
|
|
#if RF_LONGSHIFT > 2
|
|
a1 = EXTRACT(a, 7) ^ r;
|
|
a2 = EXTRACT(a, 8) ^ r;
|
|
a1 = q[a1];
|
|
a2 = q[a2];
|
|
new = new | INSERT(a1, 7) | INSERT(a2, 8);
|
|
a1 = EXTRACT(a, 9) ^ r;
|
|
a2 = EXTRACT(a, 10) ^ r;
|
|
a1 = q[a1];
|
|
a2 = q[a2];
|
|
new = new | INSERT(a1, 9) | INSERT(a2, 10);
|
|
a1 = EXTRACT(a, 11) ^ r;
|
|
a2 = EXTRACT(a, 12) ^ r;
|
|
a1 = q[a1];
|
|
a2 = q[a2];
|
|
new = new | INSERT(a1, 11) | INSERT(a2, 12);
|
|
#endif /* RF_LONGSHIFT > 2 */
|
|
d ^= new;
|
|
*dest++ = d;
|
|
length--;
|
|
}
|
|
#endif /* _KERNEL */
|
|
}
|
|
/*
|
|
recover columns a and b from the given p and q into
|
|
bufs abuf and bbuf. All bufs are word aligned.
|
|
Length is in bytes.
|
|
*/
|
|
|
|
|
|
/*
|
|
* XXX
|
|
*
|
|
* Everything about this seems wrong.
|
|
*/
|
|
void
|
|
rf_PQ_recover(pbuf, qbuf, abuf, bbuf, length, coeff_a, coeff_b)
|
|
unsigned long *pbuf;
|
|
unsigned long *qbuf;
|
|
unsigned long *abuf;
|
|
unsigned long *bbuf;
|
|
unsigned length;
|
|
unsigned coeff_a;
|
|
unsigned coeff_b;
|
|
{
|
|
unsigned long p, q, a, a0, a1;
|
|
int col = (29 * coeff_a) + coeff_b;
|
|
unsigned char *q0 = &(rf_qinv[col][0]);
|
|
|
|
length /= 8;
|
|
while (length) {
|
|
p = *pbuf++;
|
|
q = *qbuf++;
|
|
a0 = EXTRACT(p, 0);
|
|
a1 = EXTRACT(q, 0);
|
|
a = q0[a0 << 5 | a1];
|
|
#define MF(i) \
|
|
a0 = EXTRACT(p,i); \
|
|
a1 = EXTRACT(q,i); \
|
|
a = a | INSERT(q0[a0<<5 | a1],i)
|
|
|
|
MF(1);
|
|
MF(2);
|
|
MF(3);
|
|
MF(4);
|
|
MF(5);
|
|
MF(6);
|
|
#if 0
|
|
MF(7);
|
|
MF(8);
|
|
MF(9);
|
|
MF(10);
|
|
MF(11);
|
|
MF(12);
|
|
#endif /* 0 */
|
|
*abuf++ = a;
|
|
*bbuf++ = a ^ p;
|
|
length--;
|
|
}
|
|
}
|
|
/*
|
|
Lost parity and a data column. Recover that data column.
|
|
Assume col coeff is lost. Let q the contents of Q after
|
|
all surviving data columns have been q-xored out of it.
|
|
Then we have the equation
|
|
|
|
q[28-coeff][a_i ^ r_i+1] = q
|
|
|
|
but q is cyclic with period 31.
|
|
So q[3+coeff][q[28-coeff][a_i ^ r_{i+1}]] =
|
|
q[31][a_i ^ r_{i+1}] = a_i ^ r_{i+1} .
|
|
|
|
so a_i = r_{coeff+1} ^ q[3+coeff][q]
|
|
|
|
The routine is passed q buffer and the buffer
|
|
the data is to be recovered into. They can be the same.
|
|
*/
|
|
|
|
|
|
|
|
static void
|
|
rf_InvertQ(
|
|
unsigned long *qbuf,
|
|
unsigned long *abuf,
|
|
unsigned length,
|
|
unsigned coeff)
|
|
{
|
|
unsigned long a, new;
|
|
unsigned long a1, a2;
|
|
unsigned int *q = &(rf_qfor[3 + coeff][0]);
|
|
unsigned r = rf_rn[coeff + 1];
|
|
|
|
/* 13 5 bit quants in a 64 bit word */
|
|
length /= 8;
|
|
while (length) {
|
|
a = *qbuf++;
|
|
a1 = EXTRACT(a, 0);
|
|
a2 = EXTRACT(a, 1);
|
|
a1 = r ^ q[a1];
|
|
a2 = r ^ q[a2];
|
|
new = INSERT(a2, 1) | a1;
|
|
#define M(i,j) \
|
|
a1 = EXTRACT(a,i); \
|
|
a2 = EXTRACT(a,j); \
|
|
a1 = r ^ q[a1]; \
|
|
a2 = r ^ q[a2]; \
|
|
new = new | INSERT(a1,i) | INSERT(a2,j)
|
|
|
|
M(2, 3);
|
|
M(4, 5);
|
|
M(5, 6);
|
|
#if RF_LONGSHIFT > 2
|
|
M(7, 8);
|
|
M(9, 10);
|
|
M(11, 12);
|
|
#endif /* RF_LONGSHIFT > 2 */
|
|
*abuf++ = new;
|
|
length--;
|
|
}
|
|
}
|
|
#endif /* (RF_INCLUDE_DECL_PQ > 0) ||
|
|
* (RF_INCLUDE_RAID6 > 0) */
|