/* $NetBSD: rf_raid.h,v 1.1 1998/11/13 04:20:32 oster Exp $ */
/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: Mark Holland
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
 * School of Computer Science
 * Carnegie Mellon University
 * Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */
|
|
|
|
/**********************************************
 * rf_raid.h -- main header file for RAID driver
 **********************************************/
|
|
|
|
/*
|
|
* :
|
|
* Log: rf_raid.h,v
|
|
* Revision 1.48 1996/08/20 22:33:54 jimz
|
|
* make hist_diskreq a doubly-indexed array
|
|
*
|
|
* Revision 1.47 1996/07/15 05:40:41 jimz
|
|
* some recon datastructure cleanup
|
|
* better handling of multiple failures
|
|
* added undocumented double-recon test
|
|
*
|
|
* Revision 1.46 1996/07/10 22:28:51 jimz
|
|
* get rid of obsolete row statuses (dead,degraded2)
|
|
*
|
|
* Revision 1.45 1996/06/14 14:56:29 jimz
|
|
* make engine threading stuff ifndef SIMULATE
|
|
*
|
|
* Revision 1.44 1996/06/14 14:16:54 jimz
|
|
* move in engine node queue, atomicity control
|
|
*
|
|
* Revision 1.43 1996/06/12 04:41:26 jimz
|
|
* tweaks to make genplot work with user-level driver
|
|
* (mainly change stat collection)
|
|
*
|
|
* Revision 1.42 1996/06/11 10:57:17 jimz
|
|
* add recon_done_procs, recon_done_proc_mutex
|
|
*
|
|
* Revision 1.41 1996/06/11 01:26:48 jimz
|
|
* added mechanism for user-level to sync diskthread startup,
|
|
* shutdown
|
|
*
|
|
* Revision 1.40 1996/06/10 14:18:58 jimz
|
|
* move user, throughput stats into per-array structure
|
|
*
|
|
* Revision 1.39 1996/06/10 11:55:47 jimz
|
|
* Straightened out some per-array/not-per-array distinctions, fixed
|
|
* a couple bugs related to confusion. Added shutdown lists. Removed
|
|
* layout shutdown function (now subsumed by shutdown lists).
|
|
*
|
|
* Revision 1.38 1996/06/07 21:33:04 jimz
|
|
* begin using consistent types for sector numbers,
|
|
* stripe numbers, row+col numbers, recon unit numbers
|
|
*
|
|
* Revision 1.37 1996/06/05 19:38:32 jimz
|
|
* fixed up disk queueing types config
|
|
* added sstf disk queueing
|
|
* fixed exit bug on diskthreads (ref-ing bad mem)
|
|
*
|
|
* Revision 1.36 1996/06/05 18:06:02 jimz
|
|
* Major code cleanup. The Great Renaming is now done.
|
|
* Better modularity. Better typing. Fixed a bunch of
|
|
* synchronization bugs. Made a lot of global stuff
|
|
* per-desc or per-array. Removed dead code.
|
|
*
|
|
* Revision 1.35 1996/06/03 23:28:26 jimz
|
|
* more bugfixes
|
|
* check in tree to sync for IPDS runs with current bugfixes
|
|
* there still may be a problem with threads in the script test
|
|
* getting I/Os stuck- not trivially reproducible (runs ~50 times
|
|
* in a row without getting stuck)
|
|
*
|
|
* Revision 1.34 1996/06/02 17:31:48 jimz
|
|
* Moved a lot of global stuff into array structure, where it belongs.
|
|
* Fixed up paritylogging, pss modules in this manner. Some general
|
|
* code cleanup. Removed lots of dead code, some dead files.
|
|
*
|
|
* Revision 1.33 1996/05/30 23:22:16 jimz
|
|
* bugfixes of serialization, timing problems
|
|
* more cleanup
|
|
*
|
|
* Revision 1.32 1996/05/30 11:29:41 jimz
|
|
* Numerous bug fixes. Stripe lock release code disagreed with the taking code
|
|
* about when stripes should be locked (I made it consistent: no parity, no lock)
|
|
* There was a lot of extra serialization of I/Os which I've removed- a lot of
|
|
* it was to calculate values for the cache code, which is no longer with us.
|
|
* More types, function, macro cleanup. Added code to properly quiesce the array
|
|
* on shutdown. Made a lot of stuff array-specific which was (bogusly) general
|
|
* before. Fixed memory allocation, freeing bugs.
|
|
*
|
|
* Revision 1.31 1996/05/27 18:56:37 jimz
|
|
* more code cleanup
|
|
* better typing
|
|
* compiles in all 3 environments
|
|
*
|
|
* Revision 1.30 1996/05/24 22:17:04 jimz
|
|
* continue code + namespace cleanup
|
|
* typed a bunch of flags
|
|
*
|
|
* Revision 1.29 1996/05/23 21:46:35 jimz
|
|
* checkpoint in code cleanup (release prep)
|
|
* lots of types, function names have been fixed
|
|
*
|
|
* Revision 1.28 1996/05/23 00:33:23 jimz
|
|
* code cleanup: move all debug decls to rf_options.c, all extern
|
|
* debug decls to rf_options.h, all debug vars preceded by rf_
|
|
*
|
|
* Revision 1.27 1996/05/18 19:51:34 jimz
|
|
* major code cleanup- fix syntax, make some types consistent,
|
|
* add prototypes, clean out dead code, et cetera
|
|
*
|
|
* Revision 1.26 1996/05/08 21:01:24 jimz
|
|
* fixed up enum type names that were conflicting with other
|
|
* enums and function names (ie, "panic")
|
|
* future naming trends will be towards RF_ and rf_ for
|
|
* everything raidframe-related
|
|
*
|
|
* Revision 1.25 1996/05/02 14:57:55 jimz
|
|
* add sectorMask
|
|
*
|
|
* Revision 1.24 1996/04/22 15:53:13 jimz
|
|
* MAX_RAIDS -> NRAIDFRAME
|
|
*
|
|
* Revision 1.23 1995/12/14 18:39:46 jimz
|
|
* convert to rf_types.h types
|
|
*
|
|
* Revision 1.22 1995/12/06 15:02:26 root
|
|
* added copyright info
|
|
*
|
|
* Revision 1.21 1995/10/09 17:39:24 jimz
|
|
* added info for tracking number of outstanding accesses
|
|
* at user-level
|
|
*
|
|
* Revision 1.20 1995/09/30 20:37:46 jimz
|
|
* added acc_totals to Raid for kernel
|
|
*
|
|
* Revision 1.19 1995/09/19 22:57:14 jimz
|
|
* add cache of raidid for kernel
|
|
*
|
|
* Revision 1.18 1995/09/18 16:50:04 jimz
|
|
* added RF_MAX_DISKS (for config ioctls)
|
|
*
|
|
* Revision 1.17 1995/09/07 19:02:31 jimz
|
|
* mods to get raidframe to compile and link
|
|
* in kernel environment
|
|
*
|
|
* Revision 1.16 1995/07/21 19:29:51 robby
|
|
* added some info for the idler to the Raid
|
|
*
|
|
* Revision 1.15 1995/07/16 03:19:14 cfb
|
|
* added cachePtr to *raidPtr
|
|
*
|
|
* Revision 1.14 1995/06/23 13:39:36 robby
|
|
* updated to prototypes in rf_layout.h
|
|
*
|
|
*/
|
|
|
|
#ifndef _RF__RF_RAID_H_
|
|
#define _RF__RF_RAID_H_
|
|
|
|
#ifdef _KERNEL
|
|
#define KERNEL
|
|
#endif
|
|
|
|
#include "rf_archs.h"
|
|
#include "rf_types.h"
|
|
#include "rf_threadstuff.h"
|
|
|
|
#if defined(__NetBSD__) && defined(_KERNEL)
|
|
#include "rf_netbsd.h"
|
|
#endif
|
|
|
|
#ifdef KERNEL
|
|
/* XXX Needs to be added. GO
|
|
#include <raidframe.h>
|
|
*/
|
|
#include <sys/disklabel.h>
|
|
#else /* KERNEL */
|
|
#include <stdio.h>
|
|
#include <assert.h>
|
|
#endif /* KERNEL */
|
|
#include <sys/types.h>
|
|
|
|
#include "rf_alloclist.h"
|
|
#include "rf_stripelocks.h"
|
|
#include "rf_layout.h"
|
|
#include "rf_disks.h"
|
|
#include "rf_debugMem.h"
|
|
#include "rf_diskqueue.h"
|
|
#include "rf_reconstruct.h"
|
|
#include "rf_acctrace.h"
|
|
|
|
#if RF_INCLUDE_PARITYLOGGING > 0
|
|
#include "rf_paritylog.h"
|
|
#endif /* RF_INCLUDE_PARITYLOGGING > 0 */
|
|
|
|
/* Upper bound on the number of disks in a single array. */
#define RF_MAX_DISKS 128 /* max disks per array */

/*
 * RF_DEV2RAIDID(_dev) -- convert a dev_t to a raid id.
 *
 * On NetBSD the id is whatever DISKUNIT() yields for the device; on every
 * other platform it is the device's minor number shifted right by 6 bits.
 */
#ifdef __NetBSD__
#define RF_DEV2RAIDID(_dev) (DISKUNIT(_dev))
#else
#define RF_DEV2RAIDID(_dev) (minor(_dev)>>6) /* convert dev_t to raid id */
#endif
|
|
|
|
/*
 * Each row in the array is a distinct parity group, so
 * each has its own status, which is one of the following.
 */
|
|
/*
 * Per-row status values.  No explicit values are assigned, so the
 * enumerators are numbered 0..3 in declaration order.
 */
typedef enum RF_RowStatus_e {
    rf_rs_optimal,
    rf_rs_degraded,
    rf_rs_reconstructing,
    rf_rs_reconfigured
} RF_RowStatus_t;
|
|
|
|
/*
 * Cumulative I/O statistics: a start/stop timestamp pair plus running
 * totals of response time, I/O count and sectors transferred.
 */
struct RF_CumulativeStats_s {
    struct timeval start;   /* the time when the stats were last started */
    struct timeval stop;    /* the time when the stats were last stopped */
    long sum_io_us;         /* sum of all user response times (us) */
    long num_ios;           /* total number of I/Os serviced */
    long num_sect_moved;    /* total number of sectors read or written */
};
|
|
|
|
struct RF_ThroughputStats_s {
|
|
RF_DECLARE_MUTEX(mutex)/* a mutex used to lock the configuration stuff */
|
|
struct timeval start; /* timer started when numOutstandingRequests moves from 0 to 1 */
|
|
struct timeval stop; /* timer stopped when numOutstandingRequests moves from 1 to 0 */
|
|
RF_uint64 sum_io_us; /* total time timer is enabled */
|
|
RF_uint64 num_ios; /* total number of ios processed by RAIDframe */
|
|
long num_out_ios; /* number of outstanding ios */
|
|
};
|
|
|
|
#ifdef SIMULATE
/*
 * Node of a singly linked list (chained through `next`) that records a
 * (row, col) position.  Only compiled when SIMULATE is defined.
 */
typedef struct RF_PendingRecon_s RF_PendingRecon_t;
struct RF_PendingRecon_s {
    RF_RowCol_t row;
    RF_RowCol_t col;
    RF_PendingRecon_t *next;
};
#endif /* SIMULATE */
|
|
|
|
struct RF_Raid_s {
|
|
/* This portion never changes, and can be accessed without locking */
|
|
/* an exception is Disks[][].status, which requires locking when it is changed */
|
|
u_int numRow; /* number of rows of disks, typically == # of ranks */
|
|
u_int numCol; /* number of columns of disks, typically == # of disks/rank */
|
|
u_int numSpare; /* number of spare disks */
|
|
int maxQueueDepth; /* max disk queue depth */
|
|
RF_SectorCount_t totalSectors; /* total number of sectors in the array */
|
|
RF_SectorCount_t sectorsPerDisk; /* number of sectors on each disk */
|
|
u_int logBytesPerSector; /* base-2 log of the number of bytes in a sector */
|
|
u_int bytesPerSector; /* bytes in a sector */
|
|
RF_int32 sectorMask; /* mask of bytes-per-sector */
|
|
|
|
RF_RaidLayout_t Layout; /* all information related to layout */
|
|
RF_RaidDisk_t **Disks; /* all information related to physical disks */
|
|
RF_DiskQueue_t **Queues; /* all information related to disk queues */
|
|
/* NOTE: This is an anchor point via which the queues can be accessed,
|
|
* but the enqueue/dequeue routines in diskqueue.c use a local copy of
|
|
* this pointer for the actual accesses.
|
|
*/
|
|
/* The remainder of the structure can change, and therefore requires locking on reads and updates */
|
|
RF_DECLARE_MUTEX(mutex) /* mutex used to serialize access to the fields below */
|
|
RF_RowStatus_t *status; /* the status of each row in the array */
|
|
int valid; /* indicates successful configuration */
|
|
RF_LockTableEntry_t *lockTable; /* stripe-lock table */
|
|
RF_LockTableEntry_t *quiesceLock; /* quiesnce table */
|
|
int numFailures; /* total number of failures in the array */
|
|
|
|
/*
|
|
* Cleanup stuff
|
|
*/
|
|
RF_ShutdownList_t *shutdownList; /* shutdown activities */
|
|
RF_AllocListElem_t *cleanupList; /* memory to be freed at shutdown time */
|
|
|
|
/*
|
|
* Recon stuff
|
|
*/
|
|
RF_HeadSepLimit_t headSepLimit;
|
|
int numFloatingReconBufs;
|
|
int reconInProgress;
|
|
#ifdef SIMULATE
|
|
RF_PendingRecon_t *pendingRecon;
|
|
#endif /* SIMULATE */
|
|
RF_DECLARE_COND(waitForReconCond)
|
|
RF_RaidReconDesc_t *reconDesc; /* reconstruction descriptor */
|
|
RF_ReconCtrl_t **reconControl; /* reconstruction control structure pointers for each row in the array */
|
|
|
|
#if !defined(KERNEL) && !defined(SIMULATE)
|
|
/*
|
|
* Disk thread stuff
|
|
*/
|
|
int diskthreads_created;
|
|
int diskthreads_running;
|
|
int diskthreads_shutdown;
|
|
RF_DECLARE_MUTEX(diskthread_count_mutex)
|
|
RF_DECLARE_COND(diskthread_count_cond)
|
|
#endif /* !KERNEL && !SIMULATE */
|
|
|
|
/*
|
|
* Array-quiescence stuff
|
|
*/
|
|
RF_DECLARE_MUTEX(access_suspend_mutex)
|
|
RF_DECLARE_COND(quiescent_cond)
|
|
RF_IoCount_t accesses_suspended;
|
|
RF_IoCount_t accs_in_flight;
|
|
int access_suspend_release;
|
|
int waiting_for_quiescence;
|
|
RF_CallbackDesc_t *quiesce_wait_list;
|
|
|
|
/*
|
|
* Statistics
|
|
*/
|
|
#if !defined(KERNEL) && !defined(SIMULATE)
|
|
RF_ThroughputStats_t throughputstats;
|
|
#endif /* !KERNEL && !SIMULATE */
|
|
RF_CumulativeStats_t userstats;
|
|
|
|
/*
|
|
* Engine thread control
|
|
*/
|
|
RF_DECLARE_MUTEX(node_queue_mutex)
|
|
RF_DECLARE_COND(node_queue_cond)
|
|
RF_DagNode_t *node_queue;
|
|
#ifndef SIMULATE
|
|
RF_Thread_t engine_thread;
|
|
RF_ThreadGroup_t engine_tg;
|
|
#endif /* !SIMULATE */
|
|
int shutdown_engine;
|
|
int dags_in_flight; /* debug */
|
|
|
|
/*
|
|
* PSS (Parity Stripe Status) stuff
|
|
*/
|
|
RF_FreeList_t *pss_freelist;
|
|
long pssTableSize;
|
|
|
|
/*
|
|
* Reconstruction stuff
|
|
*/
|
|
int procsInBufWait;
|
|
int numFullReconBuffers;
|
|
RF_AccTraceEntry_t *recon_tracerecs;
|
|
unsigned long accumXorTimeUs;
|
|
RF_ReconDoneProc_t *recon_done_procs;
|
|
RF_DECLARE_MUTEX(recon_done_proc_mutex)
|
|
|
|
#if !defined(KERNEL) && !defined(SIMULATE)
|
|
RF_Thread_t **diskthreads, *sparediskthreads; /* thread descriptors for disk threads in user-level version */
|
|
#endif /* !KERNEL && !SIMULATE */
|
|
|
|
/*
|
|
* nAccOutstanding, waitShutdown protected by desc freelist lock
|
|
* (This may seem strange, since that's a central serialization point
|
|
* for a per-array piece of data, but otherwise, it'd be an extra
|
|
* per-array lock, and that'd only be less efficient...)
|
|
*/
|
|
RF_DECLARE_COND(outstandingCond)
|
|
int waitShutdown;
|
|
int nAccOutstanding;
|
|
|
|
RF_DiskId_t **diskids;
|
|
RF_DiskId_t *sparediskids;
|
|
|
|
#ifdef KERNEL
|
|
int raidid;
|
|
#endif /* KERNEL */
|
|
RF_AccTotals_t acc_totals;
|
|
int keep_acc_totals;
|
|
|
|
#ifdef _KERNEL
|
|
struct raidcinfo **raid_cinfo; /* array of component info */
|
|
struct proc *proc; /* XXX shouldn't be needed here.. :-p */
|
|
#endif
|
|
|
|
int terminate_disk_queues;
|
|
|
|
/*
|
|
* XXX
|
|
*
|
|
* config-specific information should be moved
|
|
* somewhere else, or at least hung off this
|
|
* in some generic way
|
|
*/
|
|
|
|
/* used by rf_compute_workload_shift */
|
|
RF_RowCol_t hist_diskreq[RF_MAXROW][RF_MAXCOL];
|
|
|
|
/* used by declustering */
|
|
int noRotate;
|
|
|
|
#if RF_INCLUDE_PARITYLOGGING > 0
|
|
/* used by parity logging */
|
|
RF_SectorCount_t regionLogCapacity;
|
|
RF_ParityLogQueue_t parityLogPool; /* pool of unused parity logs */
|
|
RF_RegionInfo_t *regionInfo; /* array of region state */
|
|
int numParityLogs;
|
|
int numSectorsPerLog;
|
|
int regionParityRange;
|
|
int logsInUse; /* debugging */
|
|
RF_ParityLogDiskQueue_t parityLogDiskQueue; /* state of parity logging disk work */
|
|
RF_RegionBufferQueue_t regionBufferPool; /* buffers for holding region log */
|
|
RF_RegionBufferQueue_t parityBufferPool; /* buffers for holding parity */
|
|
caddr_t parityLogBufferHeap; /* pool of unused parity logs */
|
|
#ifndef SIMULATE
|
|
RF_Thread_t pLogDiskThreadHandle;
|
|
#endif /* !SIMULATE */
|
|
|
|
#endif /* RF_INCLUDE_PARITYLOGGING > 0 */
|
|
};
|
|
|
|
#endif /* !_RF__RF_RAID_H_ */
|