535 lines
15 KiB
Groff
535 lines
15 KiB
Groff
.\" $NetBSD: disk.9,v 1.35 2009/12/30 14:53:02 wiz Exp $
|
|
.\"
|
|
.\" Copyright (c) 1995, 1996 Jason R. Thorpe.
|
|
.\" All rights reserved.
|
|
.\"
|
|
.\" Redistribution and use in source and binary forms, with or without
|
|
.\" modification, are permitted provided that the following conditions
|
|
.\" are met:
|
|
.\" 1. Redistributions of source code must retain the above copyright
|
|
.\" notice, this list of conditions and the following disclaimer.
|
|
.\" 2. Redistributions in binary form must reproduce the above copyright
|
|
.\" notice, this list of conditions and the following disclaimer in the
|
|
.\" documentation and/or other materials provided with the distribution.
|
|
.\" 3. All advertising materials mentioning features or use of this software
|
|
.\" must display the following acknowledgement:
|
|
.\" This product includes software developed for the NetBSD Project
|
|
.\" by Jason R. Thorpe.
|
|
.\" 4. The name of the author may not be used to endorse or promote products
|
|
.\" derived from this software without specific prior written permission.
|
|
.\"
|
|
.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
|
|
.\" IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
|
.\" OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
|
.\" IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
|
|
.\" INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
|
.\" BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
.\" LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
|
|
.\" AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
.\" OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
.\" SUCH DAMAGE.
|
|
.\"
|
|
.Dd December 30, 2009
|
|
.Dt DISK 9
|
|
.Os
|
|
.Sh NAME
|
|
.Nm disk ,
|
|
.Nm disk_init ,
|
|
.Nm disk_attach ,
|
|
.Nm disk_begindetach ,
|
|
.Nm disk_detach ,
|
|
.Nm disk_destroy ,
|
|
.Nm disk_busy ,
|
|
.Nm disk_unbusy ,
|
|
.Nm disk_isbusy ,
|
|
.Nm disk_find ,
|
|
.Nm disk_blocksize
|
|
.Nd generic disk framework
|
|
.Sh SYNOPSIS
|
|
.In sys/types.h
|
|
.In sys/disklabel.h
|
|
.In sys/disk.h
|
|
.Ft void
|
|
.Fn disk_init "struct disk *" "const char *name" "const struct dkdriver *driver"
|
|
.Ft void
|
|
.Fn disk_attach "struct disk *"
|
|
.Ft int
|
|
.Fn disk_begindetach "struct disk *" "int (*lastclose)(device_t)" "device_t self" "int flags"
|
|
.Ft void
|
|
.Fn disk_detach "struct disk *"
|
|
.Ft void
|
|
.Fn disk_destroy "struct disk *"
|
|
.Ft void
|
|
.Fn disk_busy "struct disk *"
|
|
.Ft void
|
|
.Fn disk_unbusy "struct disk *" "long bcount" "int read"
|
|
.Ft bool
|
|
.Fn disk_isbusy "struct disk *"
|
|
.Ft struct disk *
|
|
.Fn disk_find "const char *"
|
|
.Ft void
|
|
.Fn disk_blocksize "struct disk *" "int blocksize"
|
|
.Sh DESCRIPTION
|
|
The
|
|
.Nx
|
|
generic disk framework is designed to provide flexible,
|
|
scalable, and consistent handling of disk state and metrics information.
|
|
The fundamental component of this framework is the
|
|
.Nm disk
|
|
structure, which is defined as follows:
|
|
.Bd -literal
|
|
struct disk {
|
|
TAILQ_ENTRY(disk) dk_link; /* link in global disklist */
|
|
const char *dk_name; /* disk name */
|
|
prop_dictionary_t dk_info; /* reference to disk-info dictionary */
|
|
int dk_bopenmask; /* block devices open */
|
|
int dk_copenmask; /* character devices open */
|
|
int dk_openmask; /* composite (bopen|copen) */
|
|
int dk_state; /* label state ### */
|
|
int dk_blkshift; /* shift to convert DEV_BSIZE to blks */
|
|
int dk_byteshift; /* shift to convert bytes to blks */
|
|
|
|
/*
|
|
* Metrics data; note that some metrics may have no meaning
|
|
* on certain types of disks.
|
|
*/
|
|
struct io_stats *dk_stats;
|
|
|
|
const struct dkdriver *dk_driver; /* pointer to driver */
|
|
|
|
/*
|
|
* Information required to be the parent of a disk wedge.
|
|
*/
|
|
kmutex_t dk_rawlock; /* lock on these fields */
|
|
u_int dk_rawopens; /* # of opens of rawvp */
|
|
struct vnode *dk_rawvp; /* vnode for the RAW_PART bdev */
|
|
|
|
kmutex_t dk_openlock; /* lock on these and openmask */
|
|
u_int dk_nwedges; /* # of configured wedges */
|
|
/* all wedges on this disk */
|
|
LIST_HEAD(, dkwedge_softc) dk_wedges;
|
|
|
|
/*
|
|
* Disk label information. Storage for the in-core disk label
|
|
* must be dynamically allocated, otherwise the size of this
|
|
* structure becomes machine-dependent.
|
|
*/
|
|
daddr_t dk_labelsector; /* sector containing label */
|
|
struct disklabel *dk_label; /* label */
|
|
struct cpu_disklabel *dk_cpulabel;
|
|
};
|
|
.Ed
|
|
.Pp
|
|
The system maintains a global linked-list of all disks attached to the
|
|
system.
|
|
This list, called
|
|
.Nm disklist ,
|
|
may grow or shrink over time as disks are dynamically added and removed
|
|
from the system.
|
|
Drivers which currently make use of the detachment
|
|
capability of the framework are the
|
|
.Nm ccd ,
|
|
.Nm dm ,
|
|
and
|
|
.Nm vnd
|
|
pseudo-device drivers.
|
|
.Pp
|
|
The following is a brief description of each function in the framework:
|
|
.Bl -tag -width ".Fn disk_blocksize"
|
|
.It Fn disk_init
|
|
Initialize the disk structure.
|
|
.It Fn disk_attach
|
|
Attach a disk; allocate storage for the disklabel, set the
|
|
.Dq attached time
|
|
timestamp, insert the disk into the disklist, and increment the
|
|
system disk count.
|
|
.It Fn disk_begindetach
|
|
Check whether the disk is open, and if not, return 0.
|
|
If the disk is open, and
|
|
.Dv DETACH_FORCE
|
|
is not set in
|
|
.Fa flags ,
|
|
return
|
|
.Dv EBUSY .
|
|
Otherwise, call the provided
|
|
.Fa lastclose
|
|
routine
|
|
.Po
|
|
if not
|
|
.Dv NULL
|
|
.Pc
|
|
and return its exit code.
|
|
.It Fn disk_detach
|
|
Detach a disk; free storage for the disklabel, remove the disk
|
|
from the disklist, and decrement the system disk count.
|
|
If the count drops below zero, panic.
|
|
.It Fn disk_destroy
|
|
Release resources used by the disk structure when it is no longer
|
|
required.
|
|
.It Fn disk_busy
|
|
Increment the disk's
|
|
.Dq busy counter .
|
|
If this counter goes from 0 to 1, set the timestamp corresponding to
|
|
this transfer.
|
|
.It Fn disk_unbusy
|
|
Decrement a disk's busy counter.
|
|
If the count drops below zero, panic.
|
|
Get the current time, subtract the disk's timestamp from it, and add
|
|
the difference to the disk's running total.
|
|
Set the disk's timestamp to the current time.
|
|
If the provided byte count is greater than 0, add it to the disk's
|
|
running total and increment the number of transfers performed by the disk.
|
|
The third argument
|
|
.Fa read
|
|
specifies the direction of I/O;
|
|
if non-zero it means reading from the disk,
|
|
otherwise it means writing to the disk.
|
|
.It Fn disk_isbusy
|
|
Returns
|
|
.Ar true
|
|
if the disk is marked as busy and false if it is not.
|
|
.It Fn disk_find
|
|
Return a pointer to the disk structure corresponding to the name provided,
|
|
or NULL if the disk does not exist.
|
|
.It Fn disk_blocksize
|
|
Initialize
|
|
.Fa dk_blkshift
|
|
and
|
|
.Fa dk_byteshift
|
|
members of
|
|
.Vt struct disk
|
|
with suitable values derived from the supplied physical blocksize.
|
|
It is only necessary to call this function if the device's physical blocksize
|
|
is not
|
|
.Dv DEV_BSIZE .
|
|
.El
|
|
.Pp
|
|
The functions typically called by device drivers are
|
|
.Fn disk_init ,
|
|
.Fn disk_attach ,
|
|
.Fn disk_begindetach ,
|
|
.Fn disk_detach ,
|
|
.Fn disk_destroy ,
|
|
.Fn disk_busy ,
|
|
.Fn disk_unbusy ,
|
|
and
|
|
.Fn disk_blocksize .
|
|
The function
|
|
.Fn disk_find
|
|
is provided as a utility function.
|
|
.Sh DISK IOCTLS
|
|
The following ioctls should be implemented by disk drivers:
|
|
.Bl -tag -width "xxxxxx"
|
|
.It Dv DIOCGDINFO "struct disklabel"
|
|
Get disklabel.
|
|
.It Dv DIOCSDINFO "struct disklabel"
|
|
Set in-memory disklabel.
|
|
.It Dv DIOCWDINFO "struct disklabel"
|
|
Set in-memory disklabel and write on-disk disklabel.
|
|
.It Dv DIOCGPART "struct partinfo"
|
|
Get partition information.
|
|
This is used internally.
|
|
.It Dv DIOCRFORMAT "struct format_op"
|
|
Read format.
|
|
.It Dv DIOCWFORMAT "struct format_op"
|
|
Write format.
|
|
.It Dv DIOCSSTEP "int"
|
|
Set step rate.
|
|
.It Dv DIOCSRETRIES "int"
|
|
Set number of retries.
|
|
.It Dv DIOCKLABEL "int"
|
|
Specify whether to keep or drop the in-memory disklabel
|
|
when the device is closed.
|
|
.It Dv DIOCWLABEL "int"
|
|
Enable or disable writing to the part of the disk that contains the label.
|
|
.It Dv DIOCSBAD "struct dkbad"
|
|
Set kernel dkbad.
|
|
.It Dv DIOCEJECT "int"
|
|
Eject removable disk.
|
|
.It Dv DIOCLOCK "int"
|
|
Lock or unlock disk pack.
|
|
For devices with removable media, locking is intended to prevent
|
|
the operator from removing the media.
|
|
.It Dv DIOCGDEFLABEL "struct disklabel"
|
|
Get default label.
|
|
.It Dv DIOCCLRLABEL
|
|
Clear disk label.
|
|
.It Dv DIOCGCACHE "int"
|
|
Get status of disk read and write caches.
|
|
The result is a bitmask containing the following values:
|
|
.Bl -tag -width DKCACHE_RCHANGE
|
|
.It Dv DKCACHE_READ
|
|
Read cache enabled.
|
|
.It Dv DKCACHE_WRITE
|
|
Write(back) cache enabled.
|
|
.It Dv DKCACHE_RCHANGE
|
|
Read cache enable is changeable.
|
|
.It Dv DKCACHE_WCHANGE
|
|
Write cache enable is changeable.
|
|
.It Dv DKCACHE_SAVE
|
|
Cache parameters may be saved, so that they persist across reboots
|
|
or device detach/attach cycles.
|
|
.El
|
|
.It Dv DIOCSCACHE "int"
|
|
Set status of disk read and write caches.
|
|
The input is a bitmask in the same format as used for
|
|
.Dv DIOCGCACHE .
|
|
.It Dv DIOCCACHESYNC "int"
|
|
Synchronise the disk cache.
|
|
This causes information in the disk's write cache (if any)
|
|
to be flushed to stable storage.
|
|
The argument specifies whether or not to force a flush even if
|
|
the kernel believes that there is no outstanding data.
|
|
.It Dv DIOCBSLIST "struct disk_badsecinfo"
|
|
Get bad sector list.
|
|
.It Dv DIOCBSFLUSH
|
|
Flush bad sector list.
|
|
.It Dv DIOCAWEDGE "struct dkwedge_info"
|
|
Add wedge.
|
|
.It Dv DIOCGWEDGEINFO "struct dkwedge_info"
|
|
Get wedge information.
|
|
.It Dv DIOCDWEDGE "struct dkwedge_info"
|
|
Delete wedge.
|
|
.It Dv DIOCLWEDGES "struct dkwedge_list"
|
|
List wedges.
|
|
.It Dv DIOCGSTRATEGY "struct disk_strategy"
|
|
Get disk buffer queue strategy.
|
|
.It Dv DIOCSSTRATEGY "struct disk_strategy"
|
|
Set disk buffer queue strategy.
|
|
.It Dv DIOCGDISKINFO "struct plistref"
|
|
Get disk-info dictionary.
|
|
.El
|
|
.Sh USING THE FRAMEWORK
|
|
This section includes a description of basic use of the framework
|
|
and example usage of its functions.
|
|
Actual implementation of a device driver which uses the framework
|
|
may vary.
|
|
.Pp
|
|
Each device in the system uses a
|
|
.Dq softc
|
|
structure which contains autoconfiguration and state information for that
|
|
device.
|
|
In the case of disks, the softc should also contain one instance
|
|
of the disk structure, e.g.:
|
|
.Bd -literal
|
|
struct foo_softc {
|
|
device_t sc_dev; /* generic device information */
|
|
struct disk sc_dk; /* generic disk information */
|
|
[ . . . more . . . ]
|
|
};
|
|
.Ed
|
|
.Pp
|
|
In order for the system to gather metrics data about a disk, the disk must
|
|
be registered with the system.
|
|
The
|
|
.Fn disk_attach
|
|
routine performs all of the functions currently required to register a disk
|
|
with the system including allocation of disklabel storage space,
|
|
recording of the time since boot that the disk was attached, and insertion
|
|
into the disklist.
|
|
Note that since this function allocates storage space for the disklabel,
|
|
it must be called before the disklabel is read from the media or used in
|
|
any other way.
|
|
Before
|
|
.Fn disk_attach
|
|
is called, portions of the disk structure must be initialized with
|
|
data specific to that disk.
|
|
For example, in the
|
|
.Dq foo
|
|
disk driver, the following would be performed in the autoconfiguration
|
|
.Dq attach
|
|
routine:
|
|
.Bd -literal
|
|
void
|
|
fooattach(device_t parent, device_t self, void *aux)
|
|
{
|
|
struct foo_softc *sc = device_private(self);
|
|
[ . . . ]
|
|
|
|
/* Initialize and attach the disk structure. */
|
|
disk_init(\*[Am]sc-\*[Gt]sc_dk, device_xname(self), \*[Am]foodkdriver);
|
|
disk_attach(\*[Am]sc-\*[Gt]sc_dk);
|
|
|
|
/* Read geometry and fill in pertinent parts of disklabel. */
|
|
[ . . . ]
|
|
disk_blocksize(\*[Am]sc-\*[Gt]sc_dk, bytes_per_sector);
|
|
}
|
|
.Ed
|
|
.Pp
|
|
The
|
|
.Nm foodkdriver
|
|
above is the disk's
|
|
.Dq driver
|
|
switch.
|
|
This switch currently includes a pointer to the disk's
|
|
.Dq strategy
|
|
routine.
|
|
This switch needs to have global scope and should be initialized as follows:
|
|
.Bd -literal
|
|
void foostrategy(struct buf *);
|
|
|
|
const struct dkdriver foodkdriver = {
|
|
.d_strategy = foostrategy,
|
|
};
|
|
.Ed
|
|
.Pp
|
|
Once the disk is attached, metrics may be gathered on that disk.
|
|
In order to gather metrics data, the driver must tell the framework when
|
|
the disk starts and stops operations.
|
|
This functionality is provided by the
|
|
.Fn disk_busy
|
|
and
|
|
.Fn disk_unbusy
|
|
routines.
|
|
Because
|
|
.Nm struct disk
|
|
is part of device driver private data it needs to be guarded.
|
|
Mutual exclusion must be provided by the driver, because
|
|
.Fn disk_busy
|
|
and
|
|
.Fn disk_unbusy
|
|
are not thread safe.
|
|
The
|
|
.Fn disk_busy
|
|
routine should be called immediately before a command to the disk is
|
|
sent, e.g.:
|
|
.Bd -literal
|
|
void
|
|
foostart(struct foo_softc *sc)
|
|
{
|
|
[ . . . ]
|
|
|
|
/* Get buffer from drive's transfer queue. */
|
|
[ . . . ]
|
|
|
|
/* Build command to send to drive. */
|
|
[ . . . ]
|
|
|
|
/* Tell the disk framework we're going busy. */
|
|
mutex_enter(\*[Am]sc-\*[Gt]sc_dk_mtx);
|
|
disk_busy(\*[Am]sc-\*[Gt]sc_dk);
|
|
mutex_exit(\*[Am]sc-\*[Gt]sc_dk_mtx);
|
|
|
|
/* Send command to the drive. */
|
|
[ . . . ]
|
|
}
|
|
.Ed
|
|
.Pp
|
|
When
|
|
.Fn disk_busy
|
|
is called, a timestamp is taken if the disk's busy counter moves from
|
|
0 to 1, indicating the disk has gone from an idle to non-idle state.
|
|
At the end of a transaction, the
|
|
.Fn disk_unbusy
|
|
routine should be called.
|
|
This routine performs some consistency checks,
|
|
such as ensuring that the calls to
|
|
.Fn disk_busy
|
|
and
|
|
.Fn disk_unbusy
|
|
are balanced.
|
|
This routine also performs the actual metrics calculation.
|
|
A timestamp is taken and the difference from the timestamp taken in
|
|
.Fn disk_busy
|
|
is added to the disk's total running time.
|
|
The disk's timestamp is then updated in case there is more than one
|
|
pending transfer on the disk.
|
|
A byte count is also added to the disk's running total, and if greater than
|
|
zero, the number of transfers the disk has performed is incremented.
|
|
The third argument
|
|
.Fa read
|
|
specifies the direction of I/O;
|
|
if non-zero it means reading from the disk,
|
|
otherwise it means writing to the disk.
|
|
.Bd -literal
|
|
void
|
|
foodone(struct foo_xfer *xfer)
|
|
{
|
|
struct foo_softc *sc = (struct foo_softc *)xfer-\*[Gt]xf_softc;
|
|
struct buf *bp = xfer-\*[Gt]xf_buf;
|
|
long nbytes;
|
|
[ . . . ]
|
|
|
|
/*
|
|
* Get number of bytes transferred. If there is no buf
|
|
* associated with the xfer, we are being called at the
|
|
* end of a non-I/O command.
|
|
*/
|
|
if (bp == NULL)
|
|
nbytes = 0;
|
|
else
|
|
nbytes = bp-\*[Gt]b_bcount - bp-\*[Gt]b_resid;
|
|
|
|
[ . . . ]
|
|
|
|
mutex_enter(\*[Am]sc-\*[Gt]sc_dk_mtx);
|
|
/* Notify the disk framework that we've completed the transfer. */
|
|
disk_unbusy(\*[Am]sc-\*[Gt]sc_dk, nbytes,
|
|
bp != NULL ? bp-\*[Gt]b_flags \*[Am] B_READ : 0);
|
|
mutex_exit(\*[Am]sc-\*[Gt]sc_dk_mtx);
|
|
|
|
[ . . . ]
|
|
}
|
|
.Ed
|
|
.Pp
|
|
.Fn disk_isbusy
|
|
is used to get the status of a disk device; it returns true if the device is
|
|
currently busy and false if it is not.
|
|
Like
|
|
.Fn disk_busy
|
|
and
|
|
.Fn disk_unbusy
|
|
it requires explicit locking by the caller.
|
|
.Sh CODE REFERENCES
|
|
This section describes places within the
|
|
.Nx
|
|
source tree where actual
|
|
code implementing or using the disk framework can be found.
|
|
All pathnames are relative to
|
|
.Pa /usr/src .
|
|
.Pp
|
|
The disk framework itself is implemented within the file
|
|
.Pa sys/kern/subr_disk.c .
|
|
Data structures and function prototypes for the framework are located in
|
|
.Pa sys/sys/disk.h .
|
|
.Pp
|
|
The
|
|
.Nx
|
|
machine-independent SCSI disk and CD-ROM drivers use the
|
|
disk framework.
|
|
They are located in
|
|
.Pa sys/dev/scsipi/sd.c
|
|
and
|
|
.Pa sys/dev/scsipi/cd.c .
|
|
.Pp
|
|
The
|
|
.Nx
|
|
.Nm ccd ,
|
|
.Nm dm ,
|
|
and
|
|
.Nm vnd
|
|
drivers use the detachment capability of the framework.
|
|
They are located in
|
|
.Pa sys/dev/ccd.c ,
|
|
.Pa sys/dev/vnd.c ,
|
|
and
|
|
.Pa sys/dev/dm/device-mapper.c .
|
|
.Sh SEE ALSO
|
|
.Xr ccd 4 ,
|
|
.Xr dm 4 ,
|
|
.Xr vnd 4
|
|
.Sh HISTORY
|
|
The
|
|
.Nx
|
|
generic disk framework appeared in
|
|
.Nx 1.2 .
|
|
.Sh AUTHORS
|
|
The
|
|
.Nx
|
|
generic disk framework was architected and implemented by
|
|
.An Jason R. Thorpe
|
|
.Aq thorpej@NetBSD.org .
|