scsi, file-posix: add support for persistent reservation management
It is a common requirement for virtual machines to send persistent reservations, but this currently requires either running QEMU with CAP_SYS_RAWIO, or using out-of-tree patches that let an unprivileged QEMU bypass Linux's filter on SG_IO commands. As an alternative mechanism, the next patches will introduce a privileged helper to run persistent reservation commands without expanding QEMU's attack surface unnecessarily. The helper is invoked through a "pr-manager" QOM object, to which file-posix.c passes SG_IO requests for PERSISTENT RESERVE OUT and PERSISTENT RESERVE IN commands. For example: $ qemu-system-x86_64 -device virtio-scsi \ -object pr-manager-helper,id=helper0,path=/var/run/qemu-pr-helper.sock -drive if=none,id=hd,driver=raw,file.filename=/dev/sdb,file.pr-manager=helper0 -device scsi-block,drive=hd or: $ qemu-system-x86_64 -device virtio-scsi \ -object pr-manager-helper,id=helper0,path=/var/run/qemu-pr-helper.sock -blockdev node-name=hd,driver=raw,file.driver=host_device,file.filename=/dev/sdb,file.pr-manager=helper0 -device scsi-block,drive=hd Multiple pr-manager implementations are conceivable and possible, though only one is implemented right now. For example, a pr-manager could: - talk directly to the multipath daemon from a privileged QEMU (i.e. QEMU links to libmpathpersist); this makes reservations work properly with multipath, but still requires CAP_SYS_RAWIO - use the Linux IOC_PR_* ioctls (they require CAP_SYS_ADMIN though) - more interestingly, implement reservations directly in QEMU through file system locks or a shared database (e.g. sqlite) Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
This commit is contained in:
parent
092aa2fc65
commit
7c9e527659
@ -171,6 +171,7 @@ trace-events-subdirs += qapi
|
|||||||
trace-events-subdirs += accel/tcg
|
trace-events-subdirs += accel/tcg
|
||||||
trace-events-subdirs += accel/kvm
|
trace-events-subdirs += accel/kvm
|
||||||
trace-events-subdirs += nbd
|
trace-events-subdirs += nbd
|
||||||
|
trace-events-subdirs += scsi
|
||||||
|
|
||||||
trace-events-files = $(SRC_PATH)/trace-events $(trace-events-subdirs:%=$(SRC_PATH)/%/trace-events)
|
trace-events-files = $(SRC_PATH)/trace-events $(trace-events-subdirs:%=$(SRC_PATH)/%/trace-events)
|
||||||
|
|
||||||
|
@ -33,6 +33,9 @@
|
|||||||
#include "block/raw-aio.h"
|
#include "block/raw-aio.h"
|
||||||
#include "qapi/qmp/qstring.h"
|
#include "qapi/qmp/qstring.h"
|
||||||
|
|
||||||
|
#include "scsi/pr-manager.h"
|
||||||
|
#include "scsi/constants.h"
|
||||||
|
|
||||||
#if defined(__APPLE__) && (__MACH__)
|
#if defined(__APPLE__) && (__MACH__)
|
||||||
#include <paths.h>
|
#include <paths.h>
|
||||||
#include <sys/param.h>
|
#include <sys/param.h>
|
||||||
@ -155,6 +158,8 @@ typedef struct BDRVRawState {
|
|||||||
bool page_cache_inconsistent:1;
|
bool page_cache_inconsistent:1;
|
||||||
bool has_fallocate;
|
bool has_fallocate;
|
||||||
bool needs_alignment;
|
bool needs_alignment;
|
||||||
|
|
||||||
|
PRManager *pr_mgr;
|
||||||
} BDRVRawState;
|
} BDRVRawState;
|
||||||
|
|
||||||
typedef struct BDRVRawReopenState {
|
typedef struct BDRVRawReopenState {
|
||||||
@ -402,6 +407,11 @@ static QemuOptsList raw_runtime_opts = {
|
|||||||
.type = QEMU_OPT_STRING,
|
.type = QEMU_OPT_STRING,
|
||||||
.help = "file locking mode (on/off/auto, default: auto)",
|
.help = "file locking mode (on/off/auto, default: auto)",
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
.name = "pr-manager",
|
||||||
|
.type = QEMU_OPT_STRING,
|
||||||
|
.help = "id of persistent reservation manager object (default: none)",
|
||||||
|
},
|
||||||
{ /* end of list */ }
|
{ /* end of list */ }
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
@ -413,6 +423,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
|
|||||||
QemuOpts *opts;
|
QemuOpts *opts;
|
||||||
Error *local_err = NULL;
|
Error *local_err = NULL;
|
||||||
const char *filename = NULL;
|
const char *filename = NULL;
|
||||||
|
const char *str;
|
||||||
BlockdevAioOptions aio, aio_default;
|
BlockdevAioOptions aio, aio_default;
|
||||||
int fd, ret;
|
int fd, ret;
|
||||||
struct stat st;
|
struct stat st;
|
||||||
@ -476,6 +487,16 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
|
|||||||
abort();
|
abort();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
str = qemu_opt_get(opts, "pr-manager");
|
||||||
|
if (str) {
|
||||||
|
s->pr_mgr = pr_manager_lookup(str, &local_err);
|
||||||
|
if (local_err) {
|
||||||
|
error_propagate(errp, local_err);
|
||||||
|
ret = -EINVAL;
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
s->open_flags = open_flags;
|
s->open_flags = open_flags;
|
||||||
raw_parse_flags(bdrv_flags, &s->open_flags);
|
raw_parse_flags(bdrv_flags, &s->open_flags);
|
||||||
|
|
||||||
@ -2597,6 +2618,15 @@ static BlockAIOCB *hdev_aio_ioctl(BlockDriverState *bs,
|
|||||||
if (fd_open(bs) < 0)
|
if (fd_open(bs) < 0)
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
|
if (req == SG_IO && s->pr_mgr) {
|
||||||
|
struct sg_io_hdr *io_hdr = buf;
|
||||||
|
if (io_hdr->cmdp[0] == PERSISTENT_RESERVE_OUT ||
|
||||||
|
io_hdr->cmdp[0] == PERSISTENT_RESERVE_IN) {
|
||||||
|
return pr_manager_execute(s->pr_mgr, bdrv_get_aio_context(bs),
|
||||||
|
s->fd, io_hdr, cb, opaque);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
acb = g_new(RawPosixAIOData, 1);
|
acb = g_new(RawPosixAIOData, 1);
|
||||||
acb->bs = bs;
|
acb->bs = bs;
|
||||||
acb->aio_type = QEMU_AIO_IOCTL;
|
acb->aio_type = QEMU_AIO_IOCTL;
|
||||||
|
51
docs/pr-manager.rst
Normal file
51
docs/pr-manager.rst
Normal file
@ -0,0 +1,51 @@
|
|||||||
|
======================================
|
||||||
|
Persistent reservation managers
|
||||||
|
======================================
|
||||||
|
|
||||||
|
SCSI persistent reservations allow restricting access to block devices
|
||||||
|
to specific initiators in a shared storage setup. When implementing
|
||||||
|
clustering of virtual machines, it is a common requirement for virtual
|
||||||
|
machines to send persistent reservation SCSI commands. However,
|
||||||
|
the operating system restricts sending these commands to unprivileged
|
||||||
|
programs because incorrect usage can disrupt regular operation of the
|
||||||
|
storage fabric.
|
||||||
|
|
||||||
|
For this reason, QEMU's SCSI passthrough devices, ``scsi-block``
|
||||||
|
and ``scsi-generic`` (both are only available on Linux) can delegate
|
||||||
|
implementation of persistent reservations to a separate object,
|
||||||
|
the "persistent reservation manager". Only PERSISTENT RESERVE OUT and
|
||||||
|
PERSISTENT RESERVE IN commands are passed to the persistent reservation
|
||||||
|
manager object; other commands are processed by QEMU as usual.
|
||||||
|
|
||||||
|
-----------------------------------------
|
||||||
|
Defining a persistent reservation manager
|
||||||
|
-----------------------------------------
|
||||||
|
|
||||||
|
A persistent reservation manager is an instance of a subclass of the
|
||||||
|
"pr-manager" QOM class.
|
||||||
|
|
||||||
|
Right now only one subclass is defined, ``pr-manager-helper``, which
|
||||||
|
forwards the commands to an external privileged helper program
|
||||||
|
over Unix sockets. The helper program only allows sending persistent
|
||||||
|
reservation commands to devices for which QEMU has a file descriptor,
|
||||||
|
so that QEMU will not be able to effect persistent reservations
|
||||||
|
unless it has access to both the socket and the device.
|
||||||
|
|
||||||
|
``pr-manager-helper`` has a single string property, ``path``, which
|
||||||
|
accepts the path to the helper program's Unix socket. For example,
|
||||||
|
the following command line defines a ``pr-manager-helper`` object and
|
||||||
|
attaches it to a SCSI passthrough device::
|
||||||
|
|
||||||
|
$ qemu-system-x86_64 \
|
||||||
|
-device virtio-scsi \
|
||||||
|
-object pr-manager-helper,id=helper0,path=/var/run/qemu-pr-helper.sock \
|
||||||
|
-drive if=none,id=hd,driver=raw,file.filename=/dev/sdb,file.pr-manager=helper0 \
|
||||||
|
-device scsi-block,drive=hd
|
||||||
|
|
||||||
|
Alternatively, using ``-blockdev``::
|
||||||
|
|
||||||
|
$ qemu-system-x86_64 \
|
||||||
|
-device virtio-scsi \
|
||||||
|
-object pr-manager-helper,id=helper0,path=/var/run/qemu-pr-helper.sock \
|
||||||
|
-blockdev node-name=hd,driver=raw,file.driver=host_device,file.filename=/dev/sdb,file.pr-manager=helper0 \
|
||||||
|
-device scsi-block,drive=hd
|
56
include/scsi/pr-manager.h
Normal file
56
include/scsi/pr-manager.h
Normal file
@ -0,0 +1,56 @@
|
|||||||
|
#ifndef PR_MANAGER_H
#define PR_MANAGER_H

#include "qom/object.h"
/*
 * Needed for error_setg(), which the non-Linux pr_manager_lookup() stub
 * below calls; without it the header only builds when the includer has
 * already pulled in qapi/error.h.
 */
#include "qapi/error.h"
#include "qapi/qmp/qdict.h"
#include "qapi/visitor.h"
#include "qom/object_interfaces.h"
#include "block/aio.h"

/* QOM type name of the abstract persistent reservation manager class. */
#define TYPE_PR_MANAGER "pr-manager"

/* Checked QOM casts for the class and instance structures. */
#define PR_MANAGER_CLASS(klass) \
    OBJECT_CLASS_CHECK(PRManagerClass, (klass), TYPE_PR_MANAGER)
#define PR_MANAGER_GET_CLASS(obj) \
    OBJECT_GET_CLASS(PRManagerClass, (obj), TYPE_PR_MANAGER)
#define PR_MANAGER(obj) \
    OBJECT_CHECK(PRManager, (obj), TYPE_PR_MANAGER)

/* Only pointers to the SG_IO header are used here; no need for <scsi/sg.h>. */
struct sg_io_hdr;

/**
 * PRManager:
 *
 * Instance structure of a persistent reservation manager.  It carries no
 * state of its own beyond the QOM parent object.
 */
typedef struct PRManager {
    /* <private> */
    Object parent;
} PRManager;

/**
 * PRManagerClass:
 * @parent_class: the base class
 * @run: callback invoked in thread pool context; it receives the manager,
 *       the file descriptor of the device and the SG_IO header of the
 *       request, and its return value becomes the result of the request.
 */
typedef struct PRManagerClass {
    /* <private> */
    ObjectClass parent_class;

    /* <public> */
    int (*run)(PRManager *pr_mgr, int fd, struct sg_io_hdr *hdr);
} PRManagerClass;

/**
 * pr_manager_execute:
 * @pr_mgr: manager that will process the request
 * @ctx: AioContext whose thread pool runs the request
 * @fd: file descriptor of the device the command is addressed to
 * @hdr: SG_IO header of the command; it is dereferenced later from the
 *       worker thread, so it must stay valid until the request completes
 * @complete: completion callback, passed on to thread_pool_submit_aio()
 * @opaque: argument for @complete
 *
 * Queue @hdr for execution by @pr_mgr's run() callback.  A reference to
 * @pr_mgr is held while the request is in flight.
 *
 * Returns: the BlockAIOCB returned by thread_pool_submit_aio().
 */
BlockAIOCB *pr_manager_execute(PRManager *pr_mgr,
                               AioContext *ctx, int fd,
                               struct sg_io_hdr *hdr,
                               BlockCompletionFunc *complete,
                               void *opaque);

/**
 * pr_manager_lookup:
 * @id: id of the persistent reservation manager object
 * @errp: set on failure
 *
 * Returns: the manager with the given @id, or NULL with @errp set if it
 * cannot be found.  On hosts other than Linux no manager class exists,
 * so the lookup always fails.
 */
#ifdef CONFIG_LINUX
PRManager *pr_manager_lookup(const char *id, Error **errp);
#else
static inline PRManager *pr_manager_lookup(const char *id, Error **errp)
{
    /* The classes do not exist at all! */
    error_setg(errp, "No persistent reservation manager with id '%s'", id);
    return NULL;
}
#endif

#endif
|
@ -2241,6 +2241,9 @@
|
|||||||
# Driver specific block device options for the file backend.
|
# Driver specific block device options for the file backend.
|
||||||
#
|
#
|
||||||
# @filename: path to the image file
|
# @filename: path to the image file
|
||||||
|
# @pr-manager: the id for the object that will handle persistent reservations
|
||||||
|
# for this device (default: none, forward the commands via SG_IO;
|
||||||
|
# since 2.11)
|
||||||
# @aio: AIO backend (default: threads) (since: 2.8)
|
# @aio: AIO backend (default: threads) (since: 2.8)
|
||||||
# @locking: whether to enable file locking. If set to 'auto', only enable
|
# @locking: whether to enable file locking. If set to 'auto', only enable
|
||||||
# when Open File Descriptor (OFD) locking API is available
|
# when Open File Descriptor (OFD) locking API is available
|
||||||
@ -2250,6 +2253,7 @@
|
|||||||
##
|
##
|
||||||
{ 'struct': 'BlockdevOptionsFile',
|
{ 'struct': 'BlockdevOptionsFile',
|
||||||
'data': { 'filename': 'str',
|
'data': { 'filename': 'str',
|
||||||
|
'*pr-manager': 'str',
|
||||||
'*locking': 'OnOffAuto',
|
'*locking': 'OnOffAuto',
|
||||||
'*aio': 'BlockdevAioOptions' } }
|
'*aio': 'BlockdevAioOptions' } }
|
||||||
|
|
||||||
|
@ -1 +1,3 @@
|
|||||||
block-obj-y += utils.o
|
block-obj-y += utils.o
|
||||||
|
|
||||||
|
block-obj-$(CONFIG_LINUX) += pr-manager.o
|
||||||
|
109
scsi/pr-manager.c
Normal file
109
scsi/pr-manager.c
Normal file
@ -0,0 +1,109 @@
|
|||||||
|
/*
|
||||||
|
* Persistent reservation manager abstract class
|
||||||
|
*
|
||||||
|
* Copyright (c) 2017 Red Hat, Inc.
|
||||||
|
*
|
||||||
|
* Author: Paolo Bonzini <pbonzini@redhat.com>
|
||||||
|
*
|
||||||
|
* This code is licensed under the LGPL.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "qemu/osdep.h"
|
||||||
|
#include <scsi/sg.h>
|
||||||
|
|
||||||
|
#include "qapi/error.h"
|
||||||
|
#include "block/aio.h"
|
||||||
|
#include "block/thread-pool.h"
|
||||||
|
#include "scsi/pr-manager.h"
|
||||||
|
#include "trace.h"
|
||||||
|
|
||||||
|
typedef struct PRManagerData {
|
||||||
|
PRManager *pr_mgr;
|
||||||
|
struct sg_io_hdr *hdr;
|
||||||
|
int fd;
|
||||||
|
} PRManagerData;
|
||||||
|
|
||||||
|
static int pr_manager_worker(void *opaque)
|
||||||
|
{
|
||||||
|
PRManagerData *data = opaque;
|
||||||
|
PRManager *pr_mgr = data->pr_mgr;
|
||||||
|
PRManagerClass *pr_mgr_class =
|
||||||
|
PR_MANAGER_GET_CLASS(pr_mgr);
|
||||||
|
struct sg_io_hdr *hdr = data->hdr;
|
||||||
|
int fd = data->fd;
|
||||||
|
int r;
|
||||||
|
|
||||||
|
g_free(data);
|
||||||
|
trace_pr_manager_run(fd, hdr->cmdp[0], hdr->cmdp[1]);
|
||||||
|
|
||||||
|
/* The reference was taken in pr_manager_execute. */
|
||||||
|
r = pr_mgr_class->run(pr_mgr, fd, hdr);
|
||||||
|
object_unref(OBJECT(pr_mgr));
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
BlockAIOCB *pr_manager_execute(PRManager *pr_mgr,
|
||||||
|
AioContext *ctx, int fd,
|
||||||
|
struct sg_io_hdr *hdr,
|
||||||
|
BlockCompletionFunc *complete,
|
||||||
|
void *opaque)
|
||||||
|
{
|
||||||
|
PRManagerData *data = g_new(PRManagerData, 1);
|
||||||
|
ThreadPool *pool = aio_get_thread_pool(ctx);
|
||||||
|
|
||||||
|
trace_pr_manager_execute(fd, hdr->cmdp[0], hdr->cmdp[1], opaque);
|
||||||
|
data->pr_mgr = pr_mgr;
|
||||||
|
data->fd = fd;
|
||||||
|
data->hdr = hdr;
|
||||||
|
|
||||||
|
/* The matching object_unref is in pr_manager_worker. */
|
||||||
|
object_ref(OBJECT(pr_mgr));
|
||||||
|
return thread_pool_submit_aio(pool, pr_manager_worker,
|
||||||
|
data, complete, opaque);
|
||||||
|
}
|
||||||
|
|
||||||
|
static const TypeInfo pr_manager_info = {
|
||||||
|
.parent = TYPE_OBJECT,
|
||||||
|
.name = TYPE_PR_MANAGER,
|
||||||
|
.class_size = sizeof(PRManagerClass),
|
||||||
|
.abstract = true,
|
||||||
|
.interfaces = (InterfaceInfo[]) {
|
||||||
|
{ TYPE_USER_CREATABLE },
|
||||||
|
{ }
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
 * Find the persistent reservation manager whose object id is @id.
 *
 * The object is looked up directly below the objects root and must be an
 * instance of TYPE_PR_MANAGER.  No reference is taken on the result.
 *
 * Returns the manager, or NULL with @errp set if there is no object with
 * that id or the object is of a different type.
 */
PRManager *pr_manager_lookup(const char *id, Error **errp)
{
    Object *found = object_resolve_path_component(object_get_objects_root(),
                                                  id);
    PRManager *mgr = NULL;

    if (found) {
        mgr = (PRManager *)object_dynamic_cast(found, TYPE_PR_MANAGER);
        if (!mgr) {
            error_setg(errp,
                       "Object with id '%s' is not a persistent reservation manager",
                       id);
        }
    } else {
        error_setg(errp, "No persistent reservation manager with id '%s'", id);
    }

    return mgr;
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
pr_manager_register_types(void)
|
||||||
|
{
|
||||||
|
type_register_static(&pr_manager_info);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
type_init(pr_manager_register_types);
|
3
scsi/trace-events
Normal file
3
scsi/trace-events
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
# scsi/pr-manager.c
|
||||||
|
pr_manager_execute(int fd, int cmd, int sa, void *opaque) "fd=%d cmd=0x%02x service action=0x%02x opaque=%p"
|
||||||
|
pr_manager_run(int fd, int cmd, int sa) "fd=%d cmd=0x%02x service action=0x%02x"
|
3
vl.c
3
vl.c
@ -2893,7 +2893,8 @@ static int machine_set_property(void *opaque,
|
|||||||
*/
|
*/
|
||||||
static bool object_create_initial(const char *type)
|
static bool object_create_initial(const char *type)
|
||||||
{
|
{
|
||||||
if (g_str_equal(type, "rng-egd")) {
|
if (g_str_equal(type, "rng-egd") ||
|
||||||
|
g_str_has_prefix(type, "pr-manager-")) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user