396f9f4598
Reconmap used to have one pointer for every reconstruction unit. This does not scale well in the land of 1TB disks, where some 100MB+ of "status pointers" are required for typical configurations. Convert the reconstruction code to use a "sliding status window" which will scale nicely regardless of the number of stripes/reconstruction units in the RAID set. Convert the main reconstruction loop to rebuild the array in chunks rather than in one big lump. As part of these changes, introduce a function to kick any waiters on the head separation callback list, and use that in the main reconstruction event queue to wake up the waiters if things have stalled. (I believe this may fix a race condition that could occur at least at the very end of a disk during reconstruction under heavy IO load.) Thanks to Brian Buhrow for all his help, support, and patience in testing these changes.
397 lines
12 KiB
C
397 lines
12 KiB
C
/* $NetBSD: rf_reconmap.c,v 1.31 2008/05/19 19:49:54 oster Exp $ */
|
|
/*
|
|
* Copyright (c) 1995 Carnegie-Mellon University.
|
|
* All rights reserved.
|
|
*
|
|
* Author: Mark Holland
|
|
*
|
|
* Permission to use, copy, modify and distribute this software and
|
|
* its documentation is hereby granted, provided that both the copyright
|
|
* notice and this permission notice appear in all copies of the
|
|
* software, derivative works or modified versions, and any portions
|
|
* thereof, and that both notices appear in supporting documentation.
|
|
*
|
|
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
|
|
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
|
|
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
|
|
*
|
|
* Carnegie Mellon requests users of this software to return to
|
|
*
|
|
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
|
|
* School of Computer Science
|
|
* Carnegie Mellon University
|
|
* Pittsburgh PA 15213-3890
|
|
*
|
|
* any improvements or extensions that they make and grant Carnegie the
|
|
* rights to redistribute these changes.
|
|
*/
|
|
|
|
/*************************************************************************
|
|
* rf_reconmap.c
|
|
*
|
|
* code to maintain a map of what sectors have/have not been reconstructed
|
|
*
|
|
*************************************************************************/
|
|
|
|
#include <sys/cdefs.h>
|
|
__KERNEL_RCSID(0, "$NetBSD: rf_reconmap.c,v 1.31 2008/05/19 19:49:54 oster Exp $");
|
|
|
|
#include "rf_raid.h"
|
|
#include <sys/time.h>
|
|
#include "rf_general.h"
|
|
#include "rf_utils.h"
|
|
|
|
/*
 * Sentinel values stored in a status[] slot in place of a real list
 * pointer:
 *
 *   RU_ALL     - the reconstruction unit has been totally reconstructed
 *   RU_NOTHING - nothing in the reconstruction unit has been
 *                reconstructed yet
 *
 * Both are illegal pointer values, so be careful never to dereference
 * through them.  RU_NOTHING must be zero, since rf_MakeReconMap() uses
 * memset() to initialize the status array.  These values are used only
 * at the head of a list.
 */
#define RU_ALL		((RF_ReconMapListElem_t *) -1)
#define RU_NOTHING	((RF_ReconMapListElem_t *) 0)
|
|
|
|
/*
 * Number of RF_ReconMapListElem_t's used to prime each map's element
 * pool.
 *
 * For most reconstructs we need at most 3 of them.  Bounding the number
 * needed is quite difficult, as it depends on how badly the sectors to
 * be reconstructed get divided up.  In the current code the
 * reconstructed sectors appear aligned on stripe boundaries and are
 * always presented in stripe width units, so this is probably quite a
 * bit more than will ever be needed.
 */
#define RF_NUM_RECON_POOL_ELEM	100
|
|
|
|
static void
|
|
compact_stat_entry(RF_Raid_t *, RF_ReconMap_t *, int, int);
|
|
static void crunch_list(RF_ReconMap_t *, RF_ReconMapListElem_t *);
|
|
static RF_ReconMapListElem_t *
|
|
MakeReconMapListElem(RF_ReconMap_t *, RF_SectorNum_t, RF_SectorNum_t,
|
|
RF_ReconMapListElem_t *);
|
|
static void
|
|
FreeReconMapListElem(RF_ReconMap_t *mapPtr, RF_ReconMapListElem_t * p);
|
|
|
|
/*---------------------------------------------------------------------------
|
|
*
|
|
* Creates and initializes new Reconstruction map
|
|
*
|
|
* ru_sectors - size of reconstruction unit in sectors
|
|
* disk_sectors - size of disk in sectors
|
|
* spareUnitsPerDisk - zero unless distributed sparing
|
|
*-------------------------------------------------------------------------*/
|
|
|
|
RF_ReconMap_t *
|
|
rf_MakeReconMap(RF_Raid_t *raidPtr, RF_SectorCount_t ru_sectors,
|
|
RF_SectorCount_t disk_sectors,
|
|
RF_ReconUnitCount_t spareUnitsPerDisk)
|
|
{
|
|
RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
|
|
RF_ReconUnitCount_t num_rus = layoutPtr->stripeUnitsPerDisk / layoutPtr->SUsPerRU;
|
|
RF_ReconMap_t *p;
|
|
|
|
RF_Malloc(p, sizeof(RF_ReconMap_t), (RF_ReconMap_t *));
|
|
p->sectorsPerReconUnit = ru_sectors;
|
|
p->sectorsInDisk = disk_sectors;
|
|
|
|
p->totalRUs = num_rus;
|
|
p->spareRUs = spareUnitsPerDisk;
|
|
p->unitsLeft = num_rus - spareUnitsPerDisk;
|
|
p->low_ru = 0;
|
|
p->status_size = RF_RECONMAP_SIZE;
|
|
p->high_ru = p->status_size - 1;
|
|
p->head = 0;
|
|
|
|
RF_Malloc(p->status, p->status_size * sizeof(RF_ReconMapListElem_t *), (RF_ReconMapListElem_t **));
|
|
RF_ASSERT(p->status != (RF_ReconMapListElem_t **) NULL);
|
|
|
|
(void) memset((char *) p->status, 0,
|
|
p->status_size * sizeof(RF_ReconMapListElem_t *));
|
|
|
|
pool_init(&p->elem_pool, sizeof(RF_ReconMapListElem_t), 0,
|
|
0, 0, "raidreconpl", NULL, IPL_BIO);
|
|
pool_prime(&p->elem_pool, RF_NUM_RECON_POOL_ELEM);
|
|
|
|
rf_mutex_init(&p->mutex);
|
|
return (p);
|
|
}
|
|
|
|
|
|
/*---------------------------------------------------------------------------
|
|
*
|
|
* marks a new set of sectors as reconstructed. All the possible
|
|
* mergings get complicated. To simplify matters, the approach I take
|
|
* is to just dump something into the list, and then clean it up
|
|
* (i.e. merge elements and eliminate redundant ones) in a second pass
|
|
* over the list (compact_stat_entry()). Not 100% efficient, since a
|
|
* structure can be allocated and then immediately freed, but it keeps
|
|
* this code from becoming (more of) a nightmare of special cases.
|
|
* The only thing that compact_stat_entry() assumes is that the list
|
|
* is sorted by startSector, and so this is the only condition I
|
|
* maintain here. (MCH)
|
|
*
|
|
* This code now uses a pool instead of the previous malloc/free
|
|
* stuff.
|
|
*-------------------------------------------------------------------------*/
|
|
|
|
void
|
|
rf_ReconMapUpdate(RF_Raid_t *raidPtr, RF_ReconMap_t *mapPtr,
|
|
RF_SectorNum_t startSector, RF_SectorNum_t stopSector)
|
|
{
|
|
RF_SectorCount_t sectorsPerReconUnit = mapPtr->sectorsPerReconUnit;
|
|
RF_SectorNum_t i, first_in_RU, last_in_RU, ru;
|
|
RF_ReconMapListElem_t *p, *pt;
|
|
|
|
RF_LOCK_MUTEX(mapPtr->mutex);
|
|
while(mapPtr->lock) {
|
|
ltsleep(&mapPtr->lock, PRIBIO, "reconupdate", 0,
|
|
&mapPtr->mutex);
|
|
}
|
|
mapPtr->lock = 1;
|
|
RF_UNLOCK_MUTEX(mapPtr->mutex);
|
|
RF_ASSERT(startSector >= 0 && stopSector < mapPtr->sectorsInDisk &&
|
|
stopSector >= startSector);
|
|
|
|
while (startSector <= stopSector) {
|
|
i = startSector / mapPtr->sectorsPerReconUnit;
|
|
first_in_RU = i * sectorsPerReconUnit;
|
|
last_in_RU = first_in_RU + sectorsPerReconUnit - 1;
|
|
|
|
/* do we need to move the queue? */
|
|
while (i > mapPtr->high_ru) {
|
|
#ifdef DIAGNOSTIC
|
|
if (mapPtr->status[mapPtr->head]!=RU_ALL) {
|
|
printf("\nraid%d: reconmap incorrect -- working on i %" PRIu64 "\n",
|
|
raidPtr->raidid, i);
|
|
printf("raid%d: ru %" PRIu64 " not completed!!!\n",
|
|
raidPtr->raidid, mapPtr->head);
|
|
|
|
printf("raid%d: low: %" PRIu64 " high: %" PRIu64 "\n",
|
|
raidPtr->raidid, mapPtr->low_ru, mapPtr->high_ru);
|
|
|
|
panic("reconmap incorrect");
|
|
}
|
|
#endif
|
|
mapPtr->low_ru++;
|
|
mapPtr->high_ru++;
|
|
/* initialize "highest" RU status entry, which
|
|
will take over the current head postion */
|
|
mapPtr->status[mapPtr->head]=RU_NOTHING;
|
|
|
|
/* move head too */
|
|
mapPtr->head++;
|
|
if (mapPtr->head >= mapPtr->status_size)
|
|
mapPtr->head = 0;
|
|
|
|
}
|
|
|
|
ru = i - mapPtr->low_ru + mapPtr->head;
|
|
if (ru >= mapPtr->status_size)
|
|
ru = ru - mapPtr->status_size;
|
|
|
|
if ((ru < 0) || (ru >= mapPtr->status_size)) {
|
|
printf("raid%d: ru is bogus %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "\n",
|
|
raidPtr->raidid, i, ru, mapPtr->head, mapPtr->low_ru, mapPtr->high_ru);
|
|
panic("bogus ru in reconmap");
|
|
}
|
|
|
|
p = mapPtr->status[ru];
|
|
if (p != RU_ALL) {
|
|
if (p == RU_NOTHING || p->startSector > startSector) {
|
|
/* insert at front of list */
|
|
|
|
mapPtr->status[ru] = MakeReconMapListElem(mapPtr,startSector, RF_MIN(stopSector, last_in_RU), (p == RU_NOTHING) ? NULL : p);
|
|
|
|
} else {/* general case */
|
|
do { /* search for place to insert */
|
|
pt = p;
|
|
p = p->next;
|
|
} while (p && (p->startSector < startSector));
|
|
pt->next = MakeReconMapListElem(mapPtr,startSector, RF_MIN(stopSector, last_in_RU), p);
|
|
|
|
}
|
|
compact_stat_entry(raidPtr, mapPtr, i, ru);
|
|
}
|
|
startSector = RF_MIN(stopSector, last_in_RU) + 1;
|
|
}
|
|
RF_LOCK_MUTEX(mapPtr->mutex);
|
|
mapPtr->lock = 0;
|
|
wakeup(&mapPtr->lock);
|
|
RF_UNLOCK_MUTEX(mapPtr->mutex);
|
|
}
|
|
|
|
|
|
|
|
/*---------------------------------------------------------------------------
|
|
*
|
|
* performs whatever list compactions can be done, and frees any space
|
|
* that is no longer necessary. Assumes only that the list is sorted
|
|
* by startSector. crunch_list() compacts a single list as much as
|
|
* possible, and the second block of code deletes the entire list if
|
|
* possible. crunch_list() is also called from
|
|
* MakeReconMapAccessList().
|
|
*
|
|
* When a recon unit is detected to be fully reconstructed, we set the
|
|
* corresponding bit in the parity stripe map so that the head follow
|
|
* code will not select this parity stripe again. This is redundant
|
|
* (but harmless) when compact_stat_entry is called from the
|
|
* reconstruction code, but necessary when called from the user-write
|
|
* code.
|
|
*
|
|
*-------------------------------------------------------------------------*/
|
|
|
|
static void
|
|
compact_stat_entry(RF_Raid_t *raidPtr, RF_ReconMap_t *mapPtr, int i, int j)
|
|
{
|
|
RF_SectorCount_t sectorsPerReconUnit = mapPtr->sectorsPerReconUnit;
|
|
RF_ReconMapListElem_t *p = mapPtr->status[j];
|
|
|
|
crunch_list(mapPtr, p);
|
|
|
|
if ((p->startSector == i * sectorsPerReconUnit) &&
|
|
(p->stopSector == i * sectorsPerReconUnit +
|
|
sectorsPerReconUnit - 1)) {
|
|
mapPtr->status[j] = RU_ALL;
|
|
mapPtr->unitsLeft--;
|
|
FreeReconMapListElem(mapPtr, p);
|
|
}
|
|
}
|
|
|
|
|
|
static void
|
|
crunch_list(RF_ReconMap_t *mapPtr, RF_ReconMapListElem_t *listPtr)
|
|
{
|
|
RF_ReconMapListElem_t *pt, *p = listPtr;
|
|
|
|
if (!p)
|
|
return;
|
|
pt = p;
|
|
p = p->next;
|
|
while (p) {
|
|
if (pt->stopSector >= p->startSector - 1) {
|
|
pt->stopSector = RF_MAX(pt->stopSector, p->stopSector);
|
|
pt->next = p->next;
|
|
FreeReconMapListElem(mapPtr, p);
|
|
p = pt->next;
|
|
} else {
|
|
pt = p;
|
|
p = p->next;
|
|
}
|
|
}
|
|
}
|
|
/*---------------------------------------------------------------------------
|
|
*
|
|
* Allocate and fill a new list element
|
|
*
|
|
*-------------------------------------------------------------------------*/
|
|
|
|
static RF_ReconMapListElem_t *
|
|
MakeReconMapListElem(RF_ReconMap_t *mapPtr, RF_SectorNum_t startSector,
|
|
RF_SectorNum_t stopSector, RF_ReconMapListElem_t *next)
|
|
{
|
|
RF_ReconMapListElem_t *p;
|
|
|
|
p = pool_get(&mapPtr->elem_pool, PR_WAITOK);
|
|
p->startSector = startSector;
|
|
p->stopSector = stopSector;
|
|
p->next = next;
|
|
return (p);
|
|
}
|
|
/*---------------------------------------------------------------------------
|
|
*
|
|
* Free a list element
|
|
*
|
|
*-------------------------------------------------------------------------*/
|
|
|
|
static void
|
|
FreeReconMapListElem(RF_ReconMap_t *mapPtr, RF_ReconMapListElem_t *p)
|
|
{
|
|
pool_put(&mapPtr->elem_pool, p);
|
|
}
|
|
/*---------------------------------------------------------------------------
|
|
*
|
|
* Free an entire status structure. Inefficient, but can be called at
|
|
* any time.
|
|
*
|
|
*-------------------------------------------------------------------------*/
|
|
void
|
|
rf_FreeReconMap(RF_ReconMap_t *mapPtr)
|
|
{
|
|
RF_ReconMapListElem_t *p, *q;
|
|
RF_ReconUnitCount_t numRUs;
|
|
RF_ReconUnitNum_t i;
|
|
|
|
numRUs = mapPtr->sectorsInDisk / mapPtr->sectorsPerReconUnit;
|
|
if (mapPtr->sectorsInDisk % mapPtr->sectorsPerReconUnit)
|
|
numRUs++;
|
|
|
|
for (i = 0; i < mapPtr->status_size; i++) {
|
|
p = mapPtr->status[i];
|
|
while (p != RU_NOTHING && p != RU_ALL) {
|
|
q = p;
|
|
p = p->next;
|
|
RF_Free(q, sizeof(*q));
|
|
}
|
|
}
|
|
|
|
pool_destroy(&mapPtr->elem_pool);
|
|
RF_Free(mapPtr->status, mapPtr->status_size *
|
|
sizeof(RF_ReconMapListElem_t *));
|
|
RF_Free(mapPtr, sizeof(RF_ReconMap_t));
|
|
}
|
|
/*---------------------------------------------------------------------------
|
|
*
|
|
* returns nonzero if the indicated RU has been reconstructed already
|
|
*
|
|
*-------------------------------------------------------------------------*/
|
|
|
|
int
|
|
rf_CheckRUReconstructed(RF_ReconMap_t *mapPtr, RF_SectorNum_t startSector)
|
|
{
|
|
RF_ReconUnitNum_t i;
|
|
int rv;
|
|
|
|
i = startSector / mapPtr->sectorsPerReconUnit;
|
|
|
|
if (i < mapPtr->low_ru)
|
|
rv = 1;
|
|
else if (i > mapPtr->high_ru)
|
|
rv = 0;
|
|
else {
|
|
i = i - mapPtr->low_ru + mapPtr->head;
|
|
if (i >= mapPtr->status_size)
|
|
i = i - mapPtr->status_size;
|
|
if (mapPtr->status[i] == RU_ALL)
|
|
rv = 1;
|
|
else
|
|
rv = 0;
|
|
}
|
|
|
|
return rv;
|
|
}
|
|
|
|
RF_ReconUnitCount_t
|
|
rf_UnitsLeftToReconstruct(RF_ReconMap_t *mapPtr)
|
|
{
|
|
RF_ASSERT(mapPtr != NULL);
|
|
return (mapPtr->unitsLeft);
|
|
}
|
|
|
|
#if RF_DEBUG_RECON
/*
 * Debug aid: whenever the integer completion percentage differs from
 * the one printed last time, print it together with the time elapsed
 * since *starttime (seconds.microseconds).
 *
 * The percentage is 100 - unitsLeft * 100 / totalRUs.  The previously
 * printed value lives in a function-static variable, so it is shared by
 * every map this function is called for.
 */
void
rf_PrintReconSchedule(RF_ReconMap_t *mapPtr, struct timeval *starttime)
{
	static int last_pctg = -1;
	struct timeval now, elapsed;
	int pctg;

	pctg = 100 - (rf_UnitsLeftToReconstruct(mapPtr) *
	    100 / mapPtr->totalRUs);

	/* nothing new to report */
	if (pctg == last_pctg)
		return;

	RF_GETTIME(now);
	RF_TIMEVAL_DIFF(starttime, &now, &elapsed);
	printf("%d %d.%06d\n", (int) pctg, (int) elapsed.tv_sec,
	    (int) elapsed.tv_usec);
	last_pctg = pctg;
}
#endif
|
|
|