NetBSD/sys/dev/vinum/vinumrevive.c

/*-
 * Copyright (c) 1997, 1998, 1999
 *	Nan Yang Computer Services Limited.  All rights reserved.
 *
 *  Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
 *
 *  Written by Greg Lehey
 *
 *  This software is distributed under the so-called ``Berkeley
 *  License'':
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by Nan Yang Computer
 *      Services Limited.
 * 4. Neither the name of the Company nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * This software is provided ``as is'', and any express or implied
 * warranties, including, but not limited to, the implied warranties of
 * merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall the company or contributors be liable for any
 * direct, indirect, incidental, special, exemplary, or consequential
 * damages (including, but not limited to, procurement of substitute
 * goods or services; loss of use, data, or profits; or business
 * interruption) however caused and on any theory of liability, whether
 * in contract, strict liability, or tort (including negligence or
 * otherwise) arising in any way out of the use of this software, even if
 * advised of the possibility of such damage.
 *
 * $Id: vinumrevive.c,v 1.3 2004/11/08 21:30:05 explorer Exp $
 * $FreeBSD$
 */

#include <dev/vinum/vinumhdr.h>
#include <dev/vinum/request.h>

/*
 * Revive a block of a subdisk.  Return an error
 * indication.  EAGAIN means successful copy, but
 * that more blocks remain to be copied.  EINVAL
 * means that the subdisk isn't associated with a
 * plex (which means a programming error if we get
 * here at all; FIXME).
 */

int
revive_block(int sdno)
{
    /*
     * Copy one block of subdisk sdno from the rest of
     * the plex or volume onto the subdisk.
     *
     * Returns 0 when the last block has been written
     * and the subdisk has been set up, EAGAIN when a
     * block was copied and more remain, EINVAL when
     * the subdisk has no plex, ENOMEM when no buffer
     * could be obtained, or the b_error value of a
     * failed transfer.
     */
    int s;						    /* priority level */
    struct sd *sd;
    struct plex *plex;
    struct volume *vol;
    struct buf *bp;
    struct request *rq;					    /* waiting request being relaunched */
    struct request *nextrq;				    /* its successor on the wait list */
    int error = EAGAIN;
    int size;						    /* size of revive block, bytes */
    daddr_t plexblkno;					    /* lblkno in plex */
    int psd;						    /* parity subdisk number */
    u_int64_t stripe;					    /* stripe number */
    int paritysd = 0;					    /* set if this is the parity stripe */
    struct rangelock *lock;				    /* for locking */
    daddr_t stripeoffset;				    /* offset in stripe */

    plexblkno = 0;					    /* to keep the compiler happy */
    sd = &SD[sdno];
    lock = NULL;
    if (sd->plexno < 0)					    /* no plex? */
	return EINVAL;
    plex = &PLEX[sd->plexno];				    /* point to plex */
    if (plex->volno >= 0)
	vol = &VOL[plex->volno];
    else
	vol = NULL;

    /*
     * Sanitize the transfer size: it must be a
     * non-zero multiple of the sector size and no
     * larger than the maximum.  Then clip it to
     * what is left of the subdisk.
     */
    if ((sd->revive_blocksize == 0)			    /* no block size */
    ||(sd->revive_blocksize & ((1 << DEV_BSHIFT) - 1)))	    /* or invalid block size */
	sd->revive_blocksize = DEFAULT_REVIVE_BLOCKSIZE;
    else if (sd->revive_blocksize > MAX_REVIVE_BLOCKSIZE)
	sd->revive_blocksize = MAX_REVIVE_BLOCKSIZE;
    size = min(sd->revive_blocksize >> DEV_BSHIFT, sd->sectors - sd->revived) << DEV_BSHIFT;
    sd->reviver = curproc->p_pid;			    /* note who last had a bash at it */

    /* Now decide where to read from */
    switch (plex->organization) {
    case plex_concat:
	plexblkno = sd->revived + sd->plexoffset;	    /* corresponding address in plex */
	break;

    case plex_striped:
	stripeoffset = sd->revived % plex->stripesize;	    /* offset from beginning of stripe */
	if (stripeoffset + (size >> DEV_BSHIFT) > plex->stripesize) /* don't cross a stripe boundary */
	    size = (plex->stripesize - stripeoffset) << DEV_BSHIFT;
	plexblkno = sd->plexoffset			    /* base */
	    + (sd->revived - stripeoffset) * plex->subdisks /* offset to beginning of stripe */
	    + stripeoffset;				    /* offset from beginning of stripe */
	break;

    case plex_raid4:
    case plex_raid5:
	stripeoffset = sd->revived % plex->stripesize;	    /* offset from beginning of stripe */
	plexblkno = sd->plexoffset			    /* base */
	    + (sd->revived - stripeoffset) * (plex->subdisks - 1) /* offset to beginning of stripe */
	    + stripeoffset;				    /* offset from beginning of stripe */
	stripe = (sd->revived / plex->stripesize);	    /* stripe number */

	/* Make sure we don't go beyond the end of the band. */
	size = min(size, (plex->stripesize - stripeoffset) << DEV_BSHIFT);
	if (plex->organization == plex_raid4)
	    psd = plex->subdisks - 1;			    /* parity subdisk for this stripe */
	else
	    psd = plex->subdisks - 1 - stripe % plex->subdisks;	/* parity subdisk for this stripe */
	paritysd = plex->sdnos[psd] == sdno;		    /* note if it's the parity subdisk */

	/*
	 * Now adjust for the strangenesses
	 * in RAID-4 and RAID-5 striping.
	 */
	if (sd->plexsdno > psd)				    /* beyond the parity stripe, */
	    plexblkno -= plex->stripesize;		    /* one stripe less */
	else if (paritysd)
	    plexblkno -= plex->stripesize * sd->plexsdno;   /* go back to the beginning of the band */
	break;

    case plex_disorg:					    /* to keep the compiler happy */
	break;
    }

    if (paritysd) {					    /* we're reviving a parity block, */
	bp = parityrebuild(plex, sd->revived, size, rebuildparity, &lock, NULL); /* do the grunt work */
	if (bp == NULL)					    /* no buffer space */
	    return ENOMEM;				    /* chicken out */
    } else {						    /* data block */
	s = splbio();
	bp = geteblk(size);				    /* Get a buffer */
	splx(s);
	if (bp == NULL)
	    return ENOMEM;

	/*
	 * Amount to transfer: block size, unless it
	 * would overlap the end.
	 */
	bp->b_bcount = size;
	bp->b_resid = bp->b_bcount;
	bp->b_blkno = plexblkno;			    /* start here */
	if (isstriped(plex))				    /* we need to lock striped plexes */
	    lock = lockrange(plexblkno << DEV_BSHIFT, bp, plex); /* lock it */
	if (vol != NULL)				    /* it's part of a volume, */
	    /*
	     * First, read the data from the volume.  We
	     * don't care which plex, that's bre's job.
	     */
	    bp->b_dev = VINUM_VOL(plex->volno);		    /* create the device number */
	else						    /* it's an unattached plex */
	    bp->b_dev = VINUM_PLEX(sd->plexno);		    /* create the device number */

	bp->b_flags = B_BUSY | B_READ;			    /* either way, read it */
	vinumstart(bp, 1);
	biowait(bp);
    }

    if (bp->b_flags & B_ERROR) {			    /* couldn't get the data */
	error = bp->b_error;
	if (lock)					    /* we took a lock, */
	    unlockrange(sd->plexno, lock);		    /* give it back */
    } else
	/* Now write to the subdisk */
    {
	bp->b_dev = VINUM_SD(sdno);			    /* create the device number */
	bp->b_flags = B_BUSY | B_WRITE;			    /* and make this a write */
	bp->b_resid = bp->b_bcount;
	bp->b_blkno = sd->revived;			    /* write it to here */
	sdio(bp);					    /* perform the I/O */
	biowait(bp);
	if (bp->b_flags & B_ERROR)
	    error = bp->b_error;
	else {
	    sd->revived += bp->b_bcount >> DEV_BSHIFT;	    /* moved this much further down */
	    if (sd->revived >= sd->sectors) {		    /* finished */
		sd->revived = 0;
		set_sd_state(sdno, sd_up, setstate_force);  /* bring the sd up */
		log(LOG_INFO, "vinum: %s is %s\n", sd->name, sd_state(sd->state));
		save_config();				    /* and save the updated configuration */
		error = 0;				    /* we're done */
	    }
	}
	if (lock)					    /* we took a lock, */
	    unlockrange(sd->plexno, lock);		    /* give it back */
	while (sd->waitlist) {				    /* we have waiting requests */
	    rq = sd->waitlist;
	    /*
	     * Pick up the successor before launching.
	     * NOTE(review): launch_requests() is defined
	     * elsewhere; presumably the request can
	     * complete and be released once it has been
	     * launched, so it must not be dereferenced
	     * afterwards -- confirm.
	     */
	    nextrq = rq->next;
#ifdef VINUMDEBUG
	    if (debug & DEBUG_REVIVECONFLICT)
		log(LOG_DEBUG,
		    "Relaunch revive conflict sd %d: %p\n%s dev %d.%d, offset 0x%llx, length %d\n",
		    rq->sdno,
		    rq,
		    rq->bp->b_flags & B_READ ? "Read" : "Write",
		    major(rq->bp->b_dev),
		    minor(rq->bp->b_dev),
		    rq->bp->b_blkno,
		    rq->bp->b_bcount);
#endif
	    launch_requests(rq, 1);			    /* do them now */
	    sd->waitlist = nextrq;			    /* and move on to the next */
	}
    }
    bp->b_flags &= ~B_ERROR;				    /* don't leave a stale error on the buffer */
    bp->b_flags |= B_INVAL;				    /* the contents are not worth keeping */
    brelse(bp);						    /* is this kosher? */
    return error;
}

/*
 * Check or rebuild the parity blocks of a RAID-4
 * or RAID-5 plex.
 *
 * The variables plex->checkblock and
 * plex->rebuildblock represent the
 * subdisk-relative address of the stripe we're
 * looking at, not the plex-relative address.  We
 * store it in the plex and not as a local
 * variable because this function could be
 * stopped, and we don't want to repeat the part
 * we've already done.  This is also the reason
 * why we don't initialize it here except at the
 * end.  It gets initialized with the plex on
 * creation.
 *
 * Each call to this function processes at most
 * one stripe.  We can't loop in this function,
 * because we're unstoppable, so we have to be
 * called repeatedly from userland.
 */
void
parityops(struct vinum_ioctl_msg *data)
{
    /*
     * data supplies the plex number (index), the
     * operation (op) and the subdisk-relative
     * start address (offset).  The result is
     * returned through the same storage, viewed
     * as a struct _ioctl_reply: error is 0 when
     * the end of the plex has been reached,
     * EAGAIN when more remains to be done, EINVAL
     * for a non-parity plex, EIO for an
     * inaccessible plex or a parity mismatch found
     * by checkparity, ENOMEM when no buffers were
     * available, or the b_error of a failed
     * transfer.
     */
    int plexno;
    struct plex *plex;
    int size;						    /* I/O transfer size, bytes */
    struct rangelock *lock;				    /* lock on stripe */
    struct _ioctl_reply *reply;
    off_t pstripe;					    /* pointer to our stripe counter */
    struct buf *pbp;
    off_t errorloc;					    /* offset of parity error */
    enum parityop op;					    /* operation to perform */

    /*
     * reply overlays data, so pick up index and op
     * before writing anything to the reply.
     */
    plexno = data->index;
    op = data->op;
    errorloc = -1;					    /* parityrebuild only sets this when checking */
    reply = (struct _ioctl_reply *) data;
    reply->error = EAGAIN;				    /* expect to repeat this call */
    plex = &PLEX[plexno];
    if (!isparity(plex)) {				    /* not RAID-4 or RAID-5 */
	reply->error = EINVAL;
	return;
    } else if (plex->state < plex_flaky) {
	reply->error = EIO;
	strcpy(reply->msg, "Plex is not completely accessible\n");
	return;
    }
    pstripe = data->offset;
    size = min(DEFAULT_REVIVE_BLOCKSIZE,		    /* one block at a time */
	plex->stripesize << DEV_BSHIFT);

    /*
     * parityrebuild works out which subdisk holds
     * the parity for this stripe; we don't need
     * to know that here.
     */
    pbp = parityrebuild(plex, pstripe, size, op, &lock, &errorloc); /* do the grunt work */
    if (pbp == NULL) {					    /* no buffer space */
	reply->error = ENOMEM;
	return;						    /* chicken out */
    }
    /*
     * Now we have a result in the data buffer of
     * the parity buffer header, which we have kept.
     * Decide what to do with it.
     */
    reply->msg[0] = '\0';				    /* until shown otherwise */
    if ((pbp->b_flags & B_ERROR) == 0) {		    /* no error */
	if ((op == rebuildparity)
	    || (op == rebuildandcheckparity)) {
	    pbp->b_flags &= ~B_READ;			    /* turn the buffer into a write */
	    pbp->b_resid = pbp->b_bcount;
	    sdio(pbp);					    /* write the parity block */
	    biowait(pbp);
	}
	if (((op == checkparity)
		|| (op == rebuildandcheckparity))
	    && (errorloc != -1)) {
	    if (op == checkparity)
		reply->error = EIO;
	    sprintf(reply->msg,
		"Parity incorrect at offset 0x%llx\n",
		(long long int) errorloc);
	}
	/*
	 * Only record progress if nothing went
	 * wrong, including the write we may have
	 * just done: otherwise a block whose
	 * parity could not be written would be
	 * marked as dealt with.
	 */
	if ((reply->error == EAGAIN)			    /* still OK, */
	    &&((pbp->b_flags & B_ERROR) == 0)) {	    /* and the write worked */
	    plex->checkblock = pstripe + (pbp->b_bcount >> DEV_BSHIFT);	/* moved this much further down */
	    if (plex->checkblock >= SD[plex->sdnos[0]].sectors) { /* finished */
		plex->checkblock = 0;
		reply->error = 0;
	    }
	}
    }
    if (pbp->b_flags & B_ERROR)				    /* read or write failed */
	reply->error = pbp->b_error;
    pbp->b_flags |= B_INVAL;
    pbp->b_flags &= ~B_ERROR;
    brelse(pbp);
    unlockrange(plexno, lock);
}

/*
 * Rebuild a parity stripe.  Return pointer to
 * parity bp.  On return,
 *
 * 1.  The band is locked.  The caller must unlock
 *     the band and release the buffer header.
 *
 * 2.  All buffer headers except pbp have been
 *     released.  The caller must release pbp.
 *
 * 3.  For checkparity and rebuildandcheckparity,
 *     the parity is compared with the current
 *     parity block.  If it's different, the
 *     offset of the error is returned to
 *     errorloc.  The caller can set the value of
 *     the pointer to NULL if this is called for
 *     rebuilding parity.
 *
 * pstripe is the subdisk-relative base address of
 * the data to be reconstructed, size is the size
 * of the transfer in bytes.
 */
struct buf *
parityrebuild(struct plex *plex,
    u_int64_t pstripe,
    int size,
    enum parityop op,
    struct rangelock **lockp,
    off_t * errorloc)
{
    /*
     * Read the data blocks at subdisk-relative
     * address pstripe from every subdisk of plex
     * and xor them into a fresh buffer, which is
     * returned busy, with b_dev set to the parity
     * subdisk.  If a read failed, the returned
     * buffer has B_ERROR and b_error set.  *lockp
     * receives the stripe lock.
     *
     * Returns NULL, with nothing locked and
     * nothing allocated, if memory or buffers
     * can't be obtained.
     */
    int error;
    int s;
    int sdno;
    u_int64_t stripe;					    /* stripe number */
    int *parity_buf;					    /* buffer address for current parity block */
    int *newparity_buf;					    /* and for new parity block */
    int mysize;						    /* I/O transfer size for this transfer */
    int isize;						    /* mysize in ints */
    int i;
    int psd;						    /* parity subdisk number */
    int newpsd;						    /* and "subdisk number" of new parity */
    struct buf **bpp;					    /* pointers to our bps */
    struct buf *pbp;					    /* buffer header for parity stripe */
    int *sbuf;
    int bufcount;					    /* number of buffers we need */

    stripe = pstripe / plex->stripesize;		    /* stripe number */
    /*
     * RAID-4 keeps the parity on the last subdisk;
     * only RAID-5 rotates it.  This must agree
     * with the choice made in revive_block.
     */
    if (plex->organization == plex_raid4)
	psd = plex->subdisks - 1;			    /* parity subdisk for this stripe */
    else
	psd = plex->subdisks - 1 - stripe % plex->subdisks; /* parity subdisk for this stripe */
    parity_buf = NULL;					    /* to keep the compiler happy */
    error = 0;

    /*
     * It's possible that the default transfer size
     * we chose is not a factor of the stripe size.
     * We *must* limit this operation to a single
     * stripe, at least for RAID-5 rebuild, since
     * the parity subdisk changes between stripes,
     * so in this case we need to perform a short
     * transfer.  Set variable mysize to reflect
     * this.
     */
    mysize = min(size, (plex->stripesize * (stripe + 1) - pstripe) << DEV_BSHIFT);
    isize = mysize / (sizeof(int));			    /* number of ints in the buffer */
    bufcount = plex->subdisks + 1;			    /* sd buffers plus result buffer */
    newpsd = plex->subdisks;
    bpp = (struct buf **) Malloc(bufcount * sizeof(struct buf *)); /* array of pointers to bps */
    if (bpp == NULL) {					    /* no memory for the array */
	printf("vinum: can't allocate buffer space for parity op.\n");
	return NULL;
    }

    /*
     * First, build requests for all subdisks.  The
     * slot for the parity subdisk stays unused if
     * we're only rebuilding, since we don't need
     * the old parity then.
     */
    for (sdno = 0; sdno < bufcount; sdno++) {		    /* for each subdisk */
	if ((sdno != psd) || (op != rebuildparity)) {
	    /* Get a buffer header and initialize it. */
	    s = splbio();
	    bpp[sdno] = geteblk(mysize);		    /* Get a buffer */
	    if (bpp[sdno] == NULL) {
		while (sdno-- > 0) {			    /* release the ones we got */
		    if ((sdno == psd) && (op == rebuildparity))
			continue;			    /* this slot was never filled */
		    bpp[sdno]->b_flags |= B_INVAL;
		    brelse(bpp[sdno]);			    /* give back our resources */
		}
		splx(s);
		Free(bpp);				    /* and the pointer array */
		printf("vinum: can't allocate buffer space for parity op.\n");
		return NULL;				    /* no bpps */
	    }
	    splx(s);
	    if (sdno == psd)
		parity_buf = (int *) bpp[sdno]->b_data;
	    if (sdno == newpsd)				    /* the new one? */
		bpp[sdno]->b_dev = VINUM_SD(plex->sdnos[psd]); /* write back to the parity SD */
	    else
		bpp[sdno]->b_dev = VINUM_SD(plex->sdnos[sdno]);	/* device number */
	    bpp[sdno]->b_flags = B_READ;		    /* either way, read it */
	    bpp[sdno]->b_bcount = mysize;
	    bpp[sdno]->b_resid = bpp[sdno]->b_bcount;
	    bpp[sdno]->b_blkno = pstripe;		    /* transfer from here */
	}
    }

    /* Initialize result buffer */
    pbp = bpp[newpsd];
    newparity_buf = (int *) bpp[newpsd]->b_data;
    bzero(newparity_buf, mysize);

    /*
     * Now lock the stripe with the first non-parity
     * bp as locking bp.
     */
    *lockp = lockrange(pstripe * plex->stripesize * (plex->subdisks - 1),
	bpp[psd ? 0 : 1],
	plex);

    /*
     * Then issue requests for all subdisks in
     * parallel.  Don't transfer the parity stripe
     * if we're rebuilding parity, unless we also
     * want to check it.
     */
    for (sdno = 0; sdno < plex->subdisks; sdno++) {	    /* for each real subdisk */
	if ((sdno != psd) || (op != rebuildparity)) {
	    bpp[sdno]->b_flags |= B_BUSY;
	    sdio(bpp[sdno]);
	}
    }

    /*
     * Next, wait for the requests to complete.
     * We wait in the order in which they were
     * issued, which isn't necessarily the order in
     * which they complete, but we don't have a
     * convenient way of doing the latter, and the
     * delay is minimal.
     */
    for (sdno = 0; sdno < plex->subdisks; sdno++) {	    /* for each subdisk */
	if ((sdno != psd) || (op != rebuildparity)) {
	    biowait(bpp[sdno]);
	    if (bpp[sdno]->b_flags & B_ERROR)		    /* can't read, */
		error = bpp[sdno]->b_error;
	    else if (sdno != psd) {			    /* update parity */
		sbuf = (int *) bpp[sdno]->b_data;
		for (i = 0; i < isize; i++)
		    newparity_buf[i] ^= sbuf[i];	    /* xor in the buffer */
	    }
	}
	if (sdno != psd) {				    /* release all bps except parity */
	    bpp[sdno]->b_flags |= B_INVAL;
	    brelse(bpp[sdno]);				    /* give back our resources */
	}
    }

    /*
     * If we're checking, compare the calculated
     * and the read parity block.  If they're
     * different, return the plex-relative offset;
     * otherwise return -1.
     */
    if ((op == checkparity)
	|| (op == rebuildandcheckparity)) {
	*errorloc = -1;					    /* no error yet */
	for (i = 0; i < isize; i++) {
	    if (parity_buf[i] != newparity_buf[i]) {
		*errorloc = (off_t) (pstripe << DEV_BSHIFT) * (plex->subdisks - 1)
		    + i * sizeof(int);
		break;
	    }
	}
	bpp[psd]->b_flags |= B_INVAL;
	brelse(bpp[psd]);				    /* give back our resources */
    }
    /* release our resources */
    Free(bpp);
    if (error) {					    /* pass a read error on to the caller */
	pbp->b_flags |= B_ERROR;
	pbp->b_error = error;
    }
    pbp->b_flags |= B_BUSY;				    /* return busy bp */
    return pbp;
}

/*
 * Initialize a subdisk by writing zeroes to the
 * complete address space.  If verify is set,
 * check each transfer for correctness.
 *
 * Each call to this function writes (and maybe
 * checks) a single block.
 */
int
initsd(int sdno, int verify)
{
    int s;						    /* priority level */
    struct sd *sd;
    struct plex *plex;
    struct volume *vol;
    struct buf *bp;
    int error;
    int size;						    /* size of init block, bytes */
    daddr_t plexblkno;					    /* lblkno in plex */
    int verified;					    /* set when we're happy with what we wrote */

    error = 0;
    plexblkno = 0;					    /* to keep the compiler happy */
    sd = &SD[sdno];
    if (sd->plexno < 0)					    /* no plex? */
	return EINVAL;
    plex = &PLEX[sd->plexno];				    /* point to plex */
    if (plex->volno >= 0)
	vol = &VOL[plex->volno];
    else
	vol = NULL;

    if (sd->init_blocksize == 0) {
	if (plex->stripesize != 0)			    /* we're striped, don't init more than */
	    sd->init_blocksize = min(DEFAULT_REVIVE_BLOCKSIZE, /* one block at a time */
		plex->stripesize << DEV_BSHIFT);
	else
	    sd->init_blocksize = DEFAULT_REVIVE_BLOCKSIZE;
    } else if (sd->init_blocksize > MAX_REVIVE_BLOCKSIZE)
	sd->init_blocksize = MAX_REVIVE_BLOCKSIZE;

    size = min(sd->init_blocksize >> DEV_BSHIFT, sd->sectors - sd->initialized) << DEV_BSHIFT;

    verified = 0;
    while (!verified) {					    /* until we're happy with it, */
	s = splbio();
	bp = geteblk(size);				    /* Get a buffer */
	splx(s);
	if (bp == NULL)
	    return ENOMEM;

	bp->b_bcount = size;
	bp->b_resid = bp->b_bcount;
	bp->b_blkno = sd->initialized;			    /* write it to here */
	bzero(bp->b_data, bp->b_bcount);
	bp->b_dev = VINUM_SD(sdno);			    /* create the device number */
	bp->b_flags |= B_BUSY;
	bp->b_flags &= ~B_READ;
	sdio(bp);					    /* perform the I/O */
	biowait(bp);
	if (bp->b_flags & B_ERROR)
	    error = bp->b_error;
	bp->b_flags |= B_INVAL;
	bp->b_flags &= ~B_ERROR;
	brelse(bp);					    /* is this kosher? */
	if ((error == 0) && verify) {			    /* check that it got there */
	    s = splbio();
	    bp = geteblk(size);				    /* get a buffer */
	    if (bp == NULL) {
		splx(s);
		error = ENOMEM;
	    } else {
		bp->b_bcount = size;
		bp->b_resid = bp->b_bcount;
		bp->b_blkno = sd->initialized;		    /* read from here */
		bp->b_dev = VINUM_SD(sdno);		    /* create the device number */
		bp->b_flags |= B_READ;			    /* read it back */
		splx(s);
		bp->b_flags |= B_BUSY;
		sdio(bp);
		biowait(bp);
		/*
		 * XXX Bug fix code.  This is hopefully no
		 * longer needed (21 February 2000).
		 */
		if (bp->b_flags & B_ERROR)
		    error = bp->b_error;
		else if ((*bp->b_data != 0)		    /* first word spammed */
		||(bcmp(bp->b_data, &bp->b_data[1], bp->b_bcount - 1))) { /* or one of the others */
		    printf("vinum: init error on %s, offset 0x%llx sectors\n",
			sd->name,
			(long long) sd->initialized);
		    verified = 0;
		} else
		    verified = 1;
		bp->b_flags |= B_INVAL;
		bp->b_flags &= ~B_ERROR;
		brelse(bp);
	    }
	} else
	    verified = 1;
    }
    if (error == 0) {					    /* did it, */
	sd->initialized += size >> DEV_BSHIFT;		    /* moved this much further down */
	if (sd->initialized >= sd->sectors) {		    /* finished */
	    sd->initialized = 0;
	    set_sd_state(sdno, sd_initialized, setstate_force);	/* bring the sd up */
	    log(LOG_INFO, "vinum: %s is %s\n", sd->name, sd_state(sd->state));
	    save_config();				    /* and save the updated configuration */
	} else						    /* more to go, */
	    error = EAGAIN;				    /* ya'll come back, see? */
    }
    return error;
}

/* Local Variables: */
/* fill-column: 50 */
/* End: */