541 lines
14 KiB
C
541 lines
14 KiB
C
/* $NetBSD: ld_ataraid.c,v 1.27 2008/05/04 13:59:41 xtraeme Exp $ */
|
|
|
|
/*
|
|
* Copyright (c) 2003 Wasabi Systems, Inc.
|
|
* All rights reserved.
|
|
*
|
|
* Written by Jason R. Thorpe for Wasabi Systems, Inc.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
* 3. All advertising materials mentioning features or use of this software
|
|
* must display the following acknowledgement:
|
|
* This product includes software developed for the NetBSD Project by
|
|
* Wasabi Systems, Inc.
|
|
* 4. The name of Wasabi Systems, Inc. may not be used to endorse
|
|
* or promote products derived from this software without specific prior
|
|
* written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
|
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
|
|
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
* POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
/*
 * Support for ATA RAID logical disks.
 *
 * Note that all the RAID happens in software here; the ATA RAID
 * controllers we're dealing with (Promise, etc.) only support
 * configuration data on the component disks, with the BIOS supporting
 * booting from the RAID volumes.
 */
|
|
|
|
#include <sys/cdefs.h>
|
|
__KERNEL_RCSID(0, "$NetBSD: ld_ataraid.c,v 1.27 2008/05/04 13:59:41 xtraeme Exp $");
|
|
|
|
#include "rnd.h"
|
|
|
|
#include <sys/param.h>
|
|
#include <sys/systm.h>
|
|
#include <sys/conf.h>
|
|
#include <sys/kernel.h>
|
|
#include <sys/device.h>
|
|
#include <sys/buf.h>
|
|
#include <sys/bufq.h>
|
|
#include <sys/dkio.h>
|
|
#include <sys/disk.h>
|
|
#include <sys/disklabel.h>
|
|
#include <sys/fcntl.h>
|
|
#include <sys/malloc.h>
|
|
#include <sys/vnode.h>
|
|
#include <sys/kauth.h>
|
|
#if NRND > 0
|
|
#include <sys/rnd.h>
|
|
#endif
|
|
|
|
#include <miscfs/specfs/specdev.h>
|
|
|
|
#include <dev/ldvar.h>
|
|
|
|
#include <dev/ata/ata_raidvar.h>
|
|
|
|
struct ld_ataraid_softc {
|
|
struct ld_softc sc_ld;
|
|
|
|
struct ataraid_array_info *sc_aai;
|
|
struct vnode *sc_vnodes[ATA_RAID_MAX_DISKS];
|
|
|
|
void (*sc_iodone)(struct buf *);
|
|
};
|
|
|
|
static int ld_ataraid_match(struct device *, struct cfdata *, void *);
|
|
static void ld_ataraid_attach(struct device *, struct device *, void *);
|
|
|
|
static int ld_ataraid_dump(struct ld_softc *, void *, int, int);
|
|
|
|
static int ld_ataraid_start_span(struct ld_softc *, struct buf *);
|
|
|
|
static int ld_ataraid_start_raid0(struct ld_softc *, struct buf *);
|
|
static void ld_ataraid_iodone_raid0(struct buf *);
|
|
|
|
CFATTACH_DECL_NEW(ld_ataraid, sizeof(struct ld_ataraid_softc),
|
|
ld_ataraid_match, ld_ataraid_attach, NULL, NULL);
|
|
|
|
static int ld_ataraid_initialized;
|
|
static struct pool ld_ataraid_cbufpl;
|
|
|
|
struct cbuf {
|
|
struct buf cb_buf; /* new I/O buf */
|
|
struct buf *cb_obp; /* ptr. to original I/O buf */
|
|
struct ld_ataraid_softc *cb_sc; /* pointer to ld softc */
|
|
u_int cb_comp; /* target component */
|
|
SIMPLEQ_ENTRY(cbuf) cb_q; /* fifo of component buffers */
|
|
struct cbuf *cb_other; /* other cbuf in case of mirror */
|
|
int cb_flags;
|
|
#define CBUF_IODONE 0x00000001 /* I/O is already successfully done */
|
|
};
|
|
|
|
/*
 * Component buffer allocation helpers.
 *
 * CBUF_GET() allocates with PR_NOWAIT; callers in this file check the
 * result for NULL.  Neither macro carries a trailing semicolon, so both
 * expand to a plain expression and can be used wherever a function call
 * could.
 */
#define	CBUF_GET()	pool_get(&ld_ataraid_cbufpl, PR_NOWAIT)
#define	CBUF_PUT(cbp)	pool_put(&ld_ataraid_cbufpl, (cbp))
|
|
|
|
static int
|
|
ld_ataraid_match(device_t parent, cfdata_t match, void *aux)
|
|
{
|
|
|
|
return (1);
|
|
}
|
|
|
|
static void
|
|
ld_ataraid_attach(device_t parent, device_t self, void *aux)
|
|
{
|
|
struct ld_ataraid_softc *sc = device_private(self);
|
|
struct ld_softc *ld = &sc->sc_ld;
|
|
struct ataraid_array_info *aai = aux;
|
|
const char *level;
|
|
struct vnode *vp;
|
|
char unklev[32];
|
|
u_int i;
|
|
|
|
if (ld_ataraid_initialized == 0) {
|
|
ld_ataraid_initialized = 1;
|
|
pool_init(&ld_ataraid_cbufpl, sizeof(struct cbuf), 0,
|
|
0, 0, "ldcbuf", NULL, IPL_BIO);
|
|
}
|
|
|
|
sc->sc_aai = aai; /* this data persists */
|
|
|
|
ld->sc_maxxfer = MAXPHYS * aai->aai_width; /* XXX */
|
|
ld->sc_secperunit = aai->aai_capacity;
|
|
ld->sc_secsize = 512; /* XXX */
|
|
ld->sc_maxqueuecnt = 128; /* XXX */
|
|
ld->sc_dump = ld_ataraid_dump;
|
|
|
|
switch (aai->aai_level) {
|
|
case AAI_L_SPAN:
|
|
level = "SPAN";
|
|
ld->sc_start = ld_ataraid_start_span;
|
|
sc->sc_iodone = ld_ataraid_iodone_raid0;
|
|
break;
|
|
|
|
case AAI_L_RAID0:
|
|
level = "RAID-0";
|
|
ld->sc_start = ld_ataraid_start_raid0;
|
|
sc->sc_iodone = ld_ataraid_iodone_raid0;
|
|
break;
|
|
|
|
case AAI_L_RAID1:
|
|
level = "RAID-1";
|
|
ld->sc_start = ld_ataraid_start_raid0;
|
|
sc->sc_iodone = ld_ataraid_iodone_raid0;
|
|
break;
|
|
|
|
case AAI_L_RAID0 | AAI_L_RAID1:
|
|
level = "RAID-10";
|
|
ld->sc_start = ld_ataraid_start_raid0;
|
|
sc->sc_iodone = ld_ataraid_iodone_raid0;
|
|
break;
|
|
|
|
default:
|
|
snprintf(unklev, sizeof(unklev), "<unknown level 0x%x>",
|
|
aai->aai_level);
|
|
level = unklev;
|
|
}
|
|
|
|
aprint_naive(": ATA %s array\n", level);
|
|
aprint_normal(": %s ATA %s array\n",
|
|
ata_raid_type_name(aai->aai_type), level);
|
|
|
|
if (ld->sc_start == NULL) {
|
|
aprint_error_dev(&ld->sc_dv, "unsupported array type\n");
|
|
return;
|
|
}
|
|
|
|
/*
|
|
* We get a geometry from the device; use it.
|
|
*/
|
|
ld->sc_nheads = aai->aai_heads;
|
|
ld->sc_nsectors = aai->aai_sectors;
|
|
ld->sc_ncylinders = aai->aai_cylinders;
|
|
|
|
/*
|
|
* Configure all the component disks.
|
|
*/
|
|
for (i = 0; i < aai->aai_ndisks; i++) {
|
|
struct ataraid_disk_info *adi = &aai->aai_disks[i];
|
|
int bmajor, error;
|
|
dev_t dev;
|
|
|
|
bmajor = devsw_name2blk(device_xname(adi->adi_dev), NULL, 0);
|
|
dev = MAKEDISKDEV(bmajor, device_unit(adi->adi_dev), RAW_PART);
|
|
error = bdevvp(dev, &vp);
|
|
if (error)
|
|
break;
|
|
error = VOP_OPEN(vp, FREAD|FWRITE, NOCRED);
|
|
if (error) {
|
|
vput(vp);
|
|
/*
|
|
* XXX This is bogus. We should just mark the
|
|
* XXX component as FAILED, and write-back new
|
|
* XXX config blocks.
|
|
*/
|
|
break;
|
|
}
|
|
|
|
VOP_UNLOCK(vp, 0);
|
|
sc->sc_vnodes[i] = vp;
|
|
}
|
|
if (i == aai->aai_ndisks) {
|
|
ld->sc_flags = LDF_ENABLED;
|
|
goto finish;
|
|
}
|
|
|
|
for (i = 0; i < aai->aai_ndisks; i++) {
|
|
vp = sc->sc_vnodes[i];
|
|
sc->sc_vnodes[i] = NULL;
|
|
if (vp != NULL)
|
|
(void) vn_close(vp, FREAD|FWRITE, NOCRED);
|
|
}
|
|
|
|
finish:
|
|
ldattach(ld);
|
|
}
|
|
|
|
static struct cbuf *
|
|
ld_ataraid_make_cbuf(struct ld_ataraid_softc *sc, struct buf *bp,
|
|
u_int comp, daddr_t bn, void *addr, long bcount)
|
|
{
|
|
struct cbuf *cbp;
|
|
|
|
cbp = CBUF_GET();
|
|
if (cbp == NULL)
|
|
return (NULL);
|
|
buf_init(&cbp->cb_buf);
|
|
cbp->cb_buf.b_flags = bp->b_flags;
|
|
cbp->cb_buf.b_oflags = bp->b_oflags;
|
|
cbp->cb_buf.b_cflags = bp->b_cflags;
|
|
cbp->cb_buf.b_iodone = sc->sc_iodone;
|
|
cbp->cb_buf.b_proc = bp->b_proc;
|
|
cbp->cb_buf.b_vp = sc->sc_vnodes[comp];
|
|
cbp->cb_buf.b_objlock = &sc->sc_vnodes[comp]->v_interlock;
|
|
cbp->cb_buf.b_blkno = bn + sc->sc_aai->aai_offset;
|
|
cbp->cb_buf.b_data = addr;
|
|
cbp->cb_buf.b_bcount = bcount;
|
|
|
|
/* Context for iodone */
|
|
cbp->cb_obp = bp;
|
|
cbp->cb_sc = sc;
|
|
cbp->cb_comp = comp;
|
|
cbp->cb_other = NULL;
|
|
cbp->cb_flags = 0;
|
|
|
|
return (cbp);
|
|
}
|
|
|
|
static int
|
|
ld_ataraid_start_span(struct ld_softc *ld, struct buf *bp)
|
|
{
|
|
struct ld_ataraid_softc *sc = (void *) ld;
|
|
struct ataraid_array_info *aai = sc->sc_aai;
|
|
struct ataraid_disk_info *adi;
|
|
SIMPLEQ_HEAD(, cbuf) cbufq;
|
|
struct cbuf *cbp;
|
|
char *addr;
|
|
daddr_t bn;
|
|
long bcount, rcount;
|
|
u_int comp;
|
|
|
|
/* Allocate component buffers. */
|
|
SIMPLEQ_INIT(&cbufq);
|
|
addr = bp->b_data;
|
|
|
|
/* Find the first component. */
|
|
comp = 0;
|
|
adi = &aai->aai_disks[comp];
|
|
bn = bp->b_rawblkno;
|
|
while (bn >= adi->adi_compsize) {
|
|
bn -= adi->adi_compsize;
|
|
adi = &aai->aai_disks[++comp];
|
|
}
|
|
|
|
bp->b_resid = bp->b_bcount;
|
|
|
|
for (bcount = bp->b_bcount; bcount > 0; bcount -= rcount) {
|
|
rcount = bp->b_bcount;
|
|
if ((adi->adi_compsize - bn) < btodb(rcount))
|
|
rcount = dbtob(adi->adi_compsize - bn);
|
|
|
|
cbp = ld_ataraid_make_cbuf(sc, bp, comp, bn, addr, rcount);
|
|
if (cbp == NULL) {
|
|
/* Free the already allocated component buffers. */
|
|
while ((cbp = SIMPLEQ_FIRST(&cbufq)) != NULL) {
|
|
SIMPLEQ_REMOVE_HEAD(&cbufq, cb_q);
|
|
buf_destroy(&cbp->cb_buf);
|
|
CBUF_PUT(cbp);
|
|
}
|
|
return (EAGAIN);
|
|
}
|
|
|
|
/*
|
|
* For a span, we always know we advance to the next disk,
|
|
* and always start at offset 0 on that disk.
|
|
*/
|
|
adi = &aai->aai_disks[++comp];
|
|
bn = 0;
|
|
|
|
SIMPLEQ_INSERT_TAIL(&cbufq, cbp, cb_q);
|
|
addr += rcount;
|
|
}
|
|
|
|
/* Now fire off the requests. */
|
|
while ((cbp = SIMPLEQ_FIRST(&cbufq)) != NULL) {
|
|
SIMPLEQ_REMOVE_HEAD(&cbufq, cb_q);
|
|
if ((cbp->cb_buf.b_flags & B_READ) == 0) {
|
|
mutex_enter(&cbp->cb_buf.b_vp->v_interlock);
|
|
cbp->cb_buf.b_vp->v_numoutput++;
|
|
mutex_exit(&cbp->cb_buf.b_vp->v_interlock);
|
|
}
|
|
VOP_STRATEGY(cbp->cb_buf.b_vp, &cbp->cb_buf);
|
|
}
|
|
|
|
return (0);
|
|
}
|
|
|
|
static int
|
|
ld_ataraid_start_raid0(struct ld_softc *ld, struct buf *bp)
|
|
{
|
|
struct ld_ataraid_softc *sc = (void *) ld;
|
|
struct ataraid_array_info *aai = sc->sc_aai;
|
|
struct ataraid_disk_info *adi;
|
|
SIMPLEQ_HEAD(, cbuf) cbufq;
|
|
struct cbuf *cbp, *other_cbp;
|
|
char *addr;
|
|
daddr_t bn, cbn, tbn, off;
|
|
long bcount, rcount;
|
|
u_int comp;
|
|
const int read = bp->b_flags & B_READ;
|
|
const int mirror = aai->aai_level & AAI_L_RAID1;
|
|
int error;
|
|
|
|
/* Allocate component buffers. */
|
|
SIMPLEQ_INIT(&cbufq);
|
|
addr = bp->b_data;
|
|
bn = bp->b_rawblkno;
|
|
|
|
bp->b_resid = bp->b_bcount;
|
|
|
|
for (bcount = bp->b_bcount; bcount > 0; bcount -= rcount) {
|
|
tbn = bn / aai->aai_interleave;
|
|
off = bn % aai->aai_interleave;
|
|
|
|
if (__predict_false(tbn == aai->aai_capacity /
|
|
aai->aai_interleave)) {
|
|
/* Last stripe. */
|
|
daddr_t sz = (aai->aai_capacity -
|
|
(tbn * aai->aai_interleave)) /
|
|
aai->aai_width;
|
|
comp = off / sz;
|
|
cbn = ((tbn / aai->aai_width) * aai->aai_interleave) +
|
|
(off % sz);
|
|
rcount = min(bcount, dbtob(sz));
|
|
} else {
|
|
comp = tbn % aai->aai_width;
|
|
cbn = ((tbn / aai->aai_width) * aai->aai_interleave) +
|
|
off;
|
|
rcount = min(bcount, dbtob(aai->aai_interleave - off));
|
|
}
|
|
|
|
/*
|
|
* See if a component is valid.
|
|
*/
|
|
try_mirror:
|
|
adi = &aai->aai_disks[comp];
|
|
if ((adi->adi_status & ADI_S_ONLINE) == 0) {
|
|
if (mirror && comp < aai->aai_width) {
|
|
comp += aai->aai_width;
|
|
goto try_mirror;
|
|
}
|
|
|
|
/*
|
|
* No component available.
|
|
*/
|
|
error = EIO;
|
|
goto free_and_exit;
|
|
}
|
|
|
|
cbp = ld_ataraid_make_cbuf(sc, bp, comp, cbn, addr, rcount);
|
|
if (cbp == NULL) {
|
|
resource_shortage:
|
|
error = EAGAIN;
|
|
free_and_exit:
|
|
/* Free the already allocated component buffers. */
|
|
while ((cbp = SIMPLEQ_FIRST(&cbufq)) != NULL) {
|
|
SIMPLEQ_REMOVE_HEAD(&cbufq, cb_q);
|
|
buf_destroy(&cbp->cb_buf);
|
|
CBUF_PUT(cbp);
|
|
}
|
|
return (error);
|
|
}
|
|
SIMPLEQ_INSERT_TAIL(&cbufq, cbp, cb_q);
|
|
if (mirror && !read && comp < aai->aai_width) {
|
|
comp += aai->aai_width;
|
|
adi = &aai->aai_disks[comp];
|
|
if (adi->adi_status & ADI_S_ONLINE) {
|
|
other_cbp = ld_ataraid_make_cbuf(sc, bp,
|
|
comp, cbn, addr, rcount);
|
|
if (other_cbp == NULL)
|
|
goto resource_shortage;
|
|
SIMPLEQ_INSERT_TAIL(&cbufq, other_cbp, cb_q);
|
|
other_cbp->cb_other = cbp;
|
|
cbp->cb_other = other_cbp;
|
|
}
|
|
}
|
|
bn += btodb(rcount);
|
|
addr += rcount;
|
|
}
|
|
|
|
/* Now fire off the requests. */
|
|
while ((cbp = SIMPLEQ_FIRST(&cbufq)) != NULL) {
|
|
SIMPLEQ_REMOVE_HEAD(&cbufq, cb_q);
|
|
if ((cbp->cb_buf.b_flags & B_READ) == 0) {
|
|
mutex_enter(&cbp->cb_buf.b_vp->v_interlock);
|
|
cbp->cb_buf.b_vp->v_numoutput++;
|
|
mutex_exit(&cbp->cb_buf.b_vp->v_interlock);
|
|
}
|
|
VOP_STRATEGY(cbp->cb_buf.b_vp, &cbp->cb_buf);
|
|
}
|
|
|
|
return (0);
|
|
}
|
|
|
|
/*
|
|
* Called at interrupt time. Mark the component as done and if all
|
|
* components are done, take an "interrupt".
|
|
*/
|
|
static void
|
|
ld_ataraid_iodone_raid0(struct buf *vbp)
|
|
{
|
|
struct cbuf *cbp = (struct cbuf *) vbp, *other_cbp;
|
|
struct buf *bp = cbp->cb_obp;
|
|
struct ld_ataraid_softc *sc = cbp->cb_sc;
|
|
struct ataraid_array_info *aai = sc->sc_aai;
|
|
struct ataraid_disk_info *adi;
|
|
long count;
|
|
int s, iodone;
|
|
|
|
s = splbio();
|
|
|
|
iodone = cbp->cb_flags & CBUF_IODONE;
|
|
other_cbp = cbp->cb_other;
|
|
if (other_cbp != NULL)
|
|
/* You are alone */
|
|
other_cbp->cb_other = NULL;
|
|
|
|
if (cbp->cb_buf.b_error != 0) {
|
|
/*
|
|
* Mark this component broken.
|
|
*/
|
|
adi = &aai->aai_disks[cbp->cb_comp];
|
|
adi->adi_status &= ~ADI_S_ONLINE;
|
|
|
|
printf("%s: error %d on component %d (%s)\n",
|
|
device_xname(&sc->sc_ld.sc_dv), bp->b_error, cbp->cb_comp,
|
|
device_xname(adi->adi_dev));
|
|
|
|
/*
|
|
* If we didn't see an error yet and we are reading
|
|
* RAID1 disk, try another component.
|
|
*/
|
|
if (bp->b_error == 0 &&
|
|
(cbp->cb_buf.b_flags & B_READ) != 0 &&
|
|
(aai->aai_level & AAI_L_RAID1) != 0 &&
|
|
cbp->cb_comp < aai->aai_width) {
|
|
cbp->cb_comp += aai->aai_width;
|
|
adi = &aai->aai_disks[cbp->cb_comp];
|
|
if (adi->adi_status & ADI_S_ONLINE) {
|
|
cbp->cb_buf.b_error = 0;
|
|
VOP_STRATEGY(cbp->cb_buf.b_vp, &cbp->cb_buf);
|
|
goto out;
|
|
}
|
|
}
|
|
|
|
if (iodone || other_cbp != NULL)
|
|
/*
|
|
* If I/O on other component successfully done
|
|
* or the I/O is still in progress, no need
|
|
* to tell an error to upper layer.
|
|
*/
|
|
;
|
|
else {
|
|
bp->b_error = cbp->cb_buf.b_error ?
|
|
cbp->cb_buf.b_error : EIO;
|
|
}
|
|
|
|
/* XXX Update component config blocks. */
|
|
|
|
} else {
|
|
/*
|
|
* If other I/O is still in progress, tell it that
|
|
* our I/O is successfully done.
|
|
*/
|
|
if (other_cbp != NULL)
|
|
other_cbp->cb_flags |= CBUF_IODONE;
|
|
}
|
|
count = cbp->cb_buf.b_bcount;
|
|
CBUF_PUT(cbp);
|
|
|
|
if (other_cbp != NULL)
|
|
goto out;
|
|
|
|
/* If all done, "interrupt". */
|
|
bp->b_resid -= count;
|
|
if (bp->b_resid < 0)
|
|
panic("ld_ataraid_iodone_raid0: count");
|
|
if (bp->b_resid == 0)
|
|
lddone(&sc->sc_ld, bp);
|
|
|
|
out:
|
|
splx(s);
|
|
}
|
|
|
|
/*
 * Dump hook stored in the ld softc at attach time.
 *
 * No dump is performed: the arguments are ignored and EIO is returned
 * unconditionally.
 */
static int
ld_ataraid_dump(struct ld_softc *sc, void *data,
    int blkno, int blkcnt)
{
	(void)sc;
	(void)data;
	(void)blkno;
	(void)blkcnt;

	return EIO;
}
|