From 4bbeb8b173e8116851d5ececb93189ae34c68309 Mon Sep 17 00:00:00 2001 From: Fam Zheng Date: Wed, 16 Apr 2014 10:09:16 +0800 Subject: [PATCH 01/11] scsi-disk: Improve error message if can't get version number More often, bdrv_ioctl fails because the ioctl is not supported by the driver or for some other reason; in this case we should be specific, because "interface too old" is very confusing. Signed-off-by: Fam Zheng Signed-off-by: Paolo Bonzini --- hw/scsi/scsi-disk.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c index 48a28ae199..d2e532e0ee 100644 --- a/hw/scsi/scsi-disk.c +++ b/hw/scsi/scsi-disk.c @@ -2463,8 +2463,13 @@ static int scsi_block_initfn(SCSIDevice *dev) } /* check we are using a driver managing SG_IO (version 3 and after) */ - if (bdrv_ioctl(s->qdev.conf.bs, SG_GET_VERSION_NUM, &sg_version) < 0 || - sg_version < 30000) { + rc = bdrv_ioctl(s->qdev.conf.bs, SG_GET_VERSION_NUM, &sg_version); + if (rc < 0) { + error_report("scsi-block: can not get version number: %s", + strerror(-rc)); + return -1; + } + if (sg_version < 30000) { error_report("scsi-block: scsi generic interface too old"); return -1; } From 6ee143a0a4f8b5c437ac327e3d694a6a0e5380ad Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 28 Apr 2014 12:14:57 +0200 Subject: [PATCH 02/11] scsi: Improve error messages more Remove the "scsi-block:" prefix for error messages as suggested by Markus. Improve the previous patch by making the message the same for both scsi-block and scsi-generic, including the strerror() output in both and making an explicit reference to SG_IO. Also s/can not/cannot/. 
Signed-off-by: Paolo Bonzini --- hw/scsi/scsi-disk.c | 11 ++++++----- hw/scsi/scsi-generic.c | 8 ++++++-- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c index d2e532e0ee..4bcef551a6 100644 --- a/hw/scsi/scsi-disk.c +++ b/hw/scsi/scsi-disk.c @@ -2458,26 +2458,27 @@ static int scsi_block_initfn(SCSIDevice *dev) int rc; if (!s->qdev.conf.bs) { - error_report("scsi-block: drive property not set"); + error_report("drive property not set"); return -1; } /* check we are using a driver managing SG_IO (version 3 and after) */ - rc = bdrv_ioctl(s->qdev.conf.bs, SG_GET_VERSION_NUM, &sg_version); + rc = bdrv_ioctl(s->qdev.conf.bs, SG_GET_VERSION_NUM, &sg_version); if (rc < 0) { - error_report("scsi-block: can not get version number: %s", + error_report("cannot get SG_IO version number: %s. " + "Is this a SCSI device?", strerror(-rc)); return -1; } if (sg_version < 30000) { - error_report("scsi-block: scsi generic interface too old"); + error_report("scsi generic interface too old"); return -1; } /* get device type from INQUIRY data */ rc = get_device_type(s); if (rc < 0) { - error_report("scsi-block: INQUIRY failed"); + error_report("INQUIRY failed"); return -1; } diff --git a/hw/scsi/scsi-generic.c b/hw/scsi/scsi-generic.c index 8d92e0da15..3733d2c36c 100644 --- a/hw/scsi/scsi-generic.c +++ b/hw/scsi/scsi-generic.c @@ -394,6 +394,7 @@ static void scsi_destroy(SCSIDevice *s) static int scsi_generic_initfn(SCSIDevice *s) { + int rc; int sg_version; struct sg_scsi_id scsiid; @@ -412,8 +413,11 @@ static int scsi_generic_initfn(SCSIDevice *s) } /* check we are using a driver managing SG_IO (version 3 and after */ - if (bdrv_ioctl(s->conf.bs, SG_GET_VERSION_NUM, &sg_version) < 0) { - error_report("scsi generic interface not supported"); + rc = bdrv_ioctl(s->conf.bs, SG_GET_VERSION_NUM, &sg_version); + if (rc < 0) { + error_report("cannot get SG_IO version number: %s. 
" + "Is this a SCSI device?", + strerror(-rc)); return -1; } if (sg_version < 30000) { From 34bb4d02e00e508fa9d111a6a31b45bbfecbdba5 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Wed, 16 Apr 2014 16:44:13 +0200 Subject: [PATCH 03/11] megasas: Implement LD_LIST_QUERY Newer firmware implement a LD_LIST_QUERY command, and due to a driver issue no drives might be detected if this command isn't supported. So add emulation for this command, too. Cc: qemu-stable@nongnu.org Signed-off-by: Hannes Reinecke Signed-off-by: Paolo Bonzini --- hw/scsi/megasas.c | 17 +++++++++++++++++ hw/scsi/mfi.h | 9 +++++++++ trace-events | 1 + 3 files changed, 27 insertions(+) diff --git a/hw/scsi/megasas.c b/hw/scsi/megasas.c index e6e1ffd1bb..17815251f7 100644 --- a/hw/scsi/megasas.c +++ b/hw/scsi/megasas.c @@ -1106,6 +1106,21 @@ static int megasas_dcmd_ld_get_list(MegasasState *s, MegasasCmd *cmd) return MFI_STAT_OK; } +static int megasas_dcmd_ld_list_query(MegasasState *s, MegasasCmd *cmd) +{ + uint16_t flags; + + /* mbox0 contains flags */ + flags = le16_to_cpu(cmd->frame->dcmd.mbox[0]); + trace_megasas_dcmd_ld_list_query(cmd->index, flags); + if (flags == MR_LD_QUERY_TYPE_ALL || + flags == MR_LD_QUERY_TYPE_EXPOSED_TO_HOST) { + return megasas_dcmd_ld_get_list(s, cmd); + } + + return MFI_STAT_OK; +} + static int megasas_ld_get_info_submit(SCSIDevice *sdev, int lun, MegasasCmd *cmd) { @@ -1409,6 +1424,8 @@ static const struct dcmd_cmd_tbl_t { megasas_dcmd_dummy }, { MFI_DCMD_LD_GET_LIST, "LD_GET_LIST", megasas_dcmd_ld_get_list}, + { MFI_DCMD_LD_LIST_QUERY, "LD_LIST_QUERY", + megasas_dcmd_ld_list_query }, { MFI_DCMD_LD_GET_INFO, "LD_GET_INFO", megasas_dcmd_ld_get_info }, { MFI_DCMD_LD_GET_PROP, "LD_GET_PROP", diff --git a/hw/scsi/mfi.h b/hw/scsi/mfi.h index cd8355badf..a3034f6239 100644 --- a/hw/scsi/mfi.h +++ b/hw/scsi/mfi.h @@ -164,6 +164,7 @@ typedef enum { MFI_DCMD_PD_BLINK = 0x02070100, MFI_DCMD_PD_UNBLINK = 0x02070200, MFI_DCMD_LD_GET_LIST = 0x03010000, + MFI_DCMD_LD_LIST_QUERY 
= 0x03010100, MFI_DCMD_LD_GET_INFO = 0x03020000, MFI_DCMD_LD_GET_PROP = 0x03030000, MFI_DCMD_LD_SET_PROP = 0x03040000, @@ -411,6 +412,14 @@ typedef enum { MR_PD_QUERY_TYPE_EXPOSED_TO_HOST = 5, /*query for system drives */ } mfi_pd_query_type; +typedef enum { + MR_LD_QUERY_TYPE_ALL = 0, + MR_LD_QUERY_TYPE_EXPOSED_TO_HOST = 1, + MR_LD_QUERY_TYPE_USED_TGT_IDS = 2, + MR_LD_QUERY_TYPE_CLUSTER_ACCESS = 3, + MR_LD_QUERY_TYPE_CLUSTER_LOCALE = 4, +} mfi_ld_query_type; + /* * Other propertities and definitions */ diff --git a/trace-events b/trace-events index 6ecaab2f27..96510b38a3 100644 --- a/trace-events +++ b/trace-events @@ -685,6 +685,7 @@ megasas_dcmd_ld_get_list(int cmd, int num, int max) "scmd %d: DCMD LD get list: megasas_dcmd_ld_get_info(int cmd, int ld_id) "scmd %d: DCMD LD get info for dev %d" megasas_dcmd_pd_get_info(int cmd, int pd_id) "scmd %d: DCMD PD get info for dev %d" megasas_dcmd_pd_list_query(int cmd, int flags) "scmd %d: DCMD PD list query flags %x" +megasas_dcmd_ld_list_query(int cmd, int flags) "scmd %d: DCMD LD list query flags %x" megasas_dcmd_unsupported(int cmd, unsigned long size) "scmd %d: set properties len %ld" megasas_abort_frame(int cmd, int abort_cmd) "scmd %d: aborting frame %x" megasas_abort_no_cmd(int cmd, uint64_t context) "scmd %d: no active command for frame context %" PRIx64 "" From 23335f6273518925614f5ed2ccd71dabd07413ca Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Wed, 16 Apr 2014 16:44:14 +0200 Subject: [PATCH 04/11] megasas: Enable MSI-X support MSI-X support has been fixed in qemu, so we can enable it again. Signed-off-by: Hannes Reinecke [Do not change VMSTATE_PCI_DEVICE to PCIE. 
- Paolo] Signed-off-by: Paolo Bonzini --- hw/scsi/megasas.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/hw/scsi/megasas.c b/hw/scsi/megasas.c index 17815251f7..c40e48bc3e 100644 --- a/hw/scsi/megasas.c +++ b/hw/scsi/megasas.c @@ -2085,6 +2085,7 @@ static const VMStateDescription vmstate_megasas = { .minimum_version_id_old = 0, .fields = (VMStateField[]) { VMSTATE_PCI_DEVICE(parent_obj, MegasasState), + VMSTATE_MSIX(parent_obj, MegasasState), VMSTATE_INT32(fw_state, MegasasState), VMSTATE_INT32(intr_mask, MegasasState), @@ -2100,9 +2101,7 @@ static void megasas_scsi_uninit(PCIDevice *d) { MegasasState *s = MEGASAS(d); -#ifdef USE_MSIX - msix_uninit(d, &s->mmio_io); -#endif + msix_uninit(d, &s->mmio_io, &s->mmio_io); memory_region_destroy(&s->mmio_io); memory_region_destroy(&s->port_io); memory_region_destroy(&s->queue_io); @@ -2141,15 +2140,11 @@ static int megasas_scsi_init(PCIDevice *dev) memory_region_init_io(&s->queue_io, OBJECT(s), &megasas_queue_ops, s, "megasas-queue", 0x40000); -#ifdef USE_MSIX - /* MSI-X support is currently broken */ if (megasas_use_msix(s) && - msix_init(dev, 15, &s->mmio_io, 0, 0x2000)) { + msix_init(dev, 15, &s->mmio_io, 0, 0x2000, + &s->mmio_io, 0, 0x3800, 0x68)) { s->flags &= ~MEGASAS_MASK_USE_MSIX; } -#else - s->flags &= ~MEGASAS_MASK_USE_MSIX; -#endif bar_type = PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_TYPE_64; pci_register_bar(dev, 0, bar_type, &s->mmio_io); @@ -2168,7 +2163,7 @@ static int megasas_scsi_init(PCIDevice *dev) s->sas_addr |= PCI_FUNC(dev->devfn); } if (!s->hba_serial) { - s->hba_serial = g_strdup(MEGASAS_HBA_SERIAL); + s->hba_serial = g_strdup(MEGASAS_HBA_SERIAL); } if (s->fw_sge >= MEGASAS_MAX_SGE - MFI_PASS_FRAME_SIZE) { s->fw_sge = MEGASAS_MAX_SGE - MFI_PASS_FRAME_SIZE; @@ -2213,10 +2208,8 @@ static Property megasas_properties[] = { MEGASAS_DEFAULT_FRAMES), DEFINE_PROP_STRING("hba_serial", MegasasState, hba_serial), DEFINE_PROP_UINT64("sas_address", MegasasState, 
sas_addr, 0), -#ifdef USE_MSIX DEFINE_PROP_BIT("use_msix", MegasasState, flags, MEGASAS_FLAG_USE_MSIX, false), -#endif DEFINE_PROP_BIT("use_jbod", MegasasState, flags, MEGASAS_FLAG_USE_JBOD, false), DEFINE_PROP_END_OF_LIST(), From 4522b69c6cb9396e4ea8dc03b68f3ee7cee32f90 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Wed, 16 Apr 2014 16:44:15 +0200 Subject: [PATCH 05/11] megasas: Add MSI support Some hardware instances do support MSI, so we should do likewise. Signed-off-by: Hannes Reinecke Signed-off-by: Paolo Bonzini --- hw/scsi/megasas.c | 49 +++++++++++++++++++++++++++++++++++++++++------ trace-events | 5 ++++- 2 files changed, 47 insertions(+), 7 deletions(-) diff --git a/hw/scsi/megasas.c b/hw/scsi/megasas.c index c40e48bc3e..baee46f981 100644 --- a/hw/scsi/megasas.c +++ b/hw/scsi/megasas.c @@ -21,6 +21,7 @@ #include "hw/hw.h" #include "hw/pci/pci.h" #include "sysemu/dma.h" +#include "hw/pci/msi.h" #include "hw/pci/msix.h" #include "qemu/iov.h" #include "hw/scsi/scsi.h" @@ -43,9 +44,11 @@ #define MEGASAS_FLAG_USE_JBOD 0 #define MEGASAS_MASK_USE_JBOD (1 << MEGASAS_FLAG_USE_JBOD) -#define MEGASAS_FLAG_USE_MSIX 1 +#define MEGASAS_FLAG_USE_MSI 1 +#define MEGASAS_MASK_USE_MSI (1 << MEGASAS_FLAG_USE_MSI) +#define MEGASAS_FLAG_USE_MSIX 2 #define MEGASAS_MASK_USE_MSIX (1 << MEGASAS_FLAG_USE_MSIX) -#define MEGASAS_FLAG_USE_QUEUE64 2 +#define MEGASAS_FLAG_USE_QUEUE64 3 #define MEGASAS_MASK_USE_QUEUE64 (1 << MEGASAS_FLAG_USE_QUEUE64) static const char *mfi_frame_desc[] = { @@ -132,6 +135,11 @@ static bool megasas_use_queue64(MegasasState *s) return s->flags & MEGASAS_MASK_USE_QUEUE64; } +static bool megasas_use_msi(MegasasState *s) +{ + return s->flags & MEGASAS_MASK_USE_MSI; +} + static bool megasas_use_msix(MegasasState *s) { return s->flags & MEGASAS_MASK_USE_MSIX; @@ -538,6 +546,9 @@ static void megasas_complete_frame(MegasasState *s, uint64_t context) if (msix_enabled(pci_dev)) { trace_megasas_msix_raise(0); msix_notify(pci_dev, 0); + } else if 
(msi_enabled(pci_dev)) { + trace_megasas_msi_raise(0); + msi_notify(pci_dev, 0); } else { trace_megasas_irq_raise(); pci_irq_assert(pci_dev); @@ -1956,12 +1967,20 @@ static void megasas_mmio_write(void *opaque, hwaddr addr, break; case MFI_OMSK: s->intr_mask = val; - if (!megasas_intr_enabled(s) && !msix_enabled(pci_dev)) { + if (!megasas_intr_enabled(s) && + !msi_enabled(pci_dev) && + !msix_enabled(pci_dev)) { trace_megasas_irq_lower(); pci_irq_deassert(pci_dev); } if (megasas_intr_enabled(s)) { - trace_megasas_intr_enabled(); + if (msix_enabled(pci_dev)) { + trace_megasas_msix_enabled(0); + } else if (msi_enabled(pci_dev)) { + trace_megasas_msi_enabled(0); + } else { + trace_megasas_intr_enabled(); + } } else { trace_megasas_intr_disabled(); } @@ -2101,7 +2120,12 @@ static void megasas_scsi_uninit(PCIDevice *d) { MegasasState *s = MEGASAS(d); - msix_uninit(d, &s->mmio_io, &s->mmio_io); + if (megasas_use_msix(s)) { + msix_uninit(d, &s->mmio_io, &s->mmio_io); + } + if (megasas_use_msi(s)) { + msi_uninit(d); + } memory_region_destroy(&s->mmio_io); memory_region_destroy(&s->port_io); memory_region_destroy(&s->queue_io); @@ -2140,6 +2164,10 @@ static int megasas_scsi_init(PCIDevice *dev) memory_region_init_io(&s->queue_io, OBJECT(s), &megasas_queue_ops, s, "megasas-queue", 0x40000); + if (megasas_use_msi(s) && + msi_init(dev, 0x50, 1, true, false)) { + s->flags &= ~MEGASAS_MASK_USE_MSI; + } if (megasas_use_msix(s) && msix_init(dev, 15, &s->mmio_io, 0, 0x2000, &s->mmio_io, 0, 0x3800, 0x68)) { @@ -2176,7 +2204,6 @@ static int megasas_scsi_init(PCIDevice *dev) s->fw_cmds = MEGASAS_MAX_FRAMES; } trace_megasas_init(s->fw_sge, s->fw_cmds, - megasas_use_msix(s) ? "MSI-X" : "INTx", megasas_is_jbod(s) ? "jbod" : "raid"); s->fw_luns = (MFI_MAX_LD > MAX_SCSI_DEVS) ? 
MAX_SCSI_DEVS : MFI_MAX_LD; @@ -2201,6 +2228,13 @@ static int megasas_scsi_init(PCIDevice *dev) return 0; } +static void +megasas_write_config(PCIDevice *pci, uint32_t addr, uint32_t val, int len) +{ + pci_default_write_config(pci, addr, val, len); + msi_write_config(pci, addr, val, len); +} + static Property megasas_properties[] = { DEFINE_PROP_UINT32("max_sge", MegasasState, fw_sge, MEGASAS_DEFAULT_SGE), @@ -2208,6 +2242,8 @@ static Property megasas_properties[] = { MEGASAS_DEFAULT_FRAMES), DEFINE_PROP_STRING("hba_serial", MegasasState, hba_serial), DEFINE_PROP_UINT64("sas_address", MegasasState, sas_addr, 0), + DEFINE_PROP_BIT("use_msi", MegasasState, flags, + MEGASAS_FLAG_USE_MSI, false), DEFINE_PROP_BIT("use_msix", MegasasState, flags, MEGASAS_FLAG_USE_MSIX, false), DEFINE_PROP_BIT("use_jbod", MegasasState, flags, @@ -2232,6 +2268,7 @@ static void megasas_class_init(ObjectClass *oc, void *data) dc->vmsd = &vmstate_megasas; set_bit(DEVICE_CATEGORY_STORAGE, dc->categories); dc->desc = "LSI MegaRAID SAS 1078"; + pc->config_write = megasas_write_config; } static const TypeInfo megasas_info = { diff --git a/trace-events b/trace-events index 96510b38a3..41d93e1e01 100644 --- a/trace-events +++ b/trace-events @@ -691,12 +691,15 @@ megasas_abort_frame(int cmd, int abort_cmd) "scmd %d: aborting frame %x" megasas_abort_no_cmd(int cmd, uint64_t context) "scmd %d: no active command for frame context %" PRIx64 "" megasas_abort_invalid_context(int cmd, uint64_t context, int abort_cmd) "scmd %d: invalid frame context %" PRIx64 " for abort frame %x" megasas_reset(void) "Reset" -megasas_init(int sges, int cmds, const char *intr, const char *mode) "Using %d sges, %d cmds, %s, %s mode" +megasas_init(int sges, int cmds, const char *mode) "Using %d sges, %d cmds, %s mode" megasas_msix_raise(int vector) "vector %d" +megasas_msi_raise(int vector) "vector %d" megasas_irq_lower(void) "INTx" megasas_irq_raise(void) "INTx" megasas_intr_enabled(void) "Interrupts enabled" 
megasas_intr_disabled(void) "Interrupts disabled" +megasas_msix_enabled(int vector) "vector %d" +megasas_msi_enabled(int vector) "vector %d" megasas_mmio_readl(unsigned long addr, uint32_t val) "addr 0x%lx: 0x%x" megasas_mmio_invalid_readl(unsigned long addr) "addr 0x%lx" megasas_mmio_writel(uint32_t addr, uint32_t val) "addr 0x%x: 0x%x" From d383c625e219b3f53651e1f64c88f3fa30348af7 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Wed, 16 Apr 2014 16:44:19 +0200 Subject: [PATCH 06/11] MAINTAINERS: mark megasas as maintained Signed-off-by: Hannes Reinecke Signed-off-by: Paolo Bonzini --- MAINTAINERS | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index c66946ff07..f1d3f091d5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -651,6 +651,12 @@ S: Supported F: hw/block/nvme* F: tests/nvme-test.c +megasas +M: Hannes Reinecke +S: Supported +F: hw/scsi/megasas.c +F: hw/scsi/mfi.h + Xilinx EDK M: Peter Crosthwaite M: Edgar E. Iglesias From dbe5c58f2a44f49572b3346400779fac818fcaea Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Mon, 28 Apr 2014 13:23:25 +0200 Subject: [PATCH 07/11] block/iscsi: allow fall back to WRITE SAME without UNMAP if the iscsi driver receives a write zeroes request with the BDRV_REQ_MAY_UNMAP flag set it fails with -ENOTSUP if the iscsi target does not support WRITE SAME with UNMAP. However, the BDRV_REQ_MAY_UNMAP is only a hint and writing zeroes with WRITE SAME will still be better than falling back to writing zeroes with WRITE16. 
Signed-off-by: Peter Lieven Signed-off-by: Paolo Bonzini --- block/iscsi.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/block/iscsi.c b/block/iscsi.c index a636ea4f53..56f54193c8 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -809,13 +809,14 @@ coroutine_fn iscsi_co_write_zeroes(BlockDriverState *bs, int64_t sector_num, return -EINVAL; } - if (!(flags & BDRV_REQ_MAY_UNMAP) && !iscsilun->has_write_same) { - /* WRITE SAME without UNMAP is not supported by the target */ - return -ENOTSUP; + if ((flags & BDRV_REQ_MAY_UNMAP) && !iscsilun->lbp.lbpws) { + /* WRITE SAME with UNMAP is not supported by the target, + * fall back and try WRITE SAME without UNMAP */ + flags &= ~BDRV_REQ_MAY_UNMAP; } - if ((flags & BDRV_REQ_MAY_UNMAP) && !iscsilun->lbp.lbpws) { - /* WRITE SAME with UNMAP is not supported by the target */ + if (!(flags & BDRV_REQ_MAY_UNMAP) && !iscsilun->has_write_same) { + /* WRITE SAME without UNMAP is not supported by the target */ return -ENOTSUP; } From b03c38057b7ac4ffb60fa98a26dd4c8d5fa9c54c Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Mon, 28 Apr 2014 13:11:32 +0200 Subject: [PATCH 08/11] block/iscsi: speed up read for unallocated sectors This patch implements a cache that tracks if a page on the iscsi target is allocated or not. The cache is implemented in a way that it allows for false positives (e.g. pretending a page is allocated, but it isn't), but no false negatives. The cached allocation info is then used to speed up the read process for unallocated sectors by issuing a GET_LBA_STATUS request for all sectors that are not yet known to be allocated. If the read request is confirmed to fall into an unallocated range we directly return zeroes and do not transfer the data over the wire. Tests have shown that a relatively small number of GET_LBA_STATUS requests happen at vServer boot time to fill the allocation cache (all those blocks are not queried again). 
Not to transfer all the data of unallocated sectors saves a lot of time, bandwidth and storage I/O load during block jobs or storage migration and it saves a lot of bandwidth as well for any big sequential read of the whole disk (e.g. block copy or speed tests) if a significant number of blocks is unallocated. Signed-off-by: Peter Lieven Signed-off-by: Paolo Bonzini --- block/iscsi.c | 300 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 198 insertions(+), 102 deletions(-) diff --git a/block/iscsi.c b/block/iscsi.c index 56f54193c8..6b8a0a34f7 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -30,6 +30,8 @@ #include "qemu-common.h" #include "qemu/config-file.h" #include "qemu/error-report.h" +#include "qemu/bitops.h" +#include "qemu/bitmap.h" #include "block/block_int.h" #include "trace.h" #include "block/scsi.h" @@ -59,6 +61,8 @@ typedef struct IscsiLun { struct scsi_inquiry_logical_block_provisioning lbp; struct scsi_inquiry_block_limits bl; unsigned char *zeroblock; + unsigned long *allocationmap; + int cluster_sectors; } IscsiLun; typedef struct IscsiTask { @@ -91,6 +95,7 @@ typedef struct IscsiAIOCB { #define NOP_INTERVAL 5000 #define MAX_NOP_FAILURES 3 #define ISCSI_CMD_RETRIES 5 +#define ISCSI_CHECKALLOC_THRES 63 static void iscsi_bh_cb(void *p) @@ -273,6 +278,32 @@ static bool is_request_lun_aligned(int64_t sector_num, int nb_sectors, return 1; } +static void iscsi_allocationmap_set(IscsiLun *iscsilun, int64_t sector_num, + int nb_sectors) +{ + if (iscsilun->allocationmap == NULL) { + return; + } + bitmap_set(iscsilun->allocationmap, + sector_num / iscsilun->cluster_sectors, + DIV_ROUND_UP(nb_sectors, iscsilun->cluster_sectors)); +} + +static void iscsi_allocationmap_clear(IscsiLun *iscsilun, int64_t sector_num, + int nb_sectors) +{ + int64_t cluster_num, nb_clusters; + if (iscsilun->allocationmap == NULL) { + return; + } + cluster_num = DIV_ROUND_UP(sector_num, iscsilun->cluster_sectors); + nb_clusters = (sector_num + nb_sectors) / 
iscsilun->cluster_sectors + - cluster_num; + if (nb_clusters > 0) { + bitmap_clear(iscsilun->allocationmap, cluster_num, nb_clusters); + } +} + static int coroutine_fn iscsi_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *iov) @@ -336,9 +367,127 @@ retry: return -EIO; } + iscsi_allocationmap_set(iscsilun, sector_num, nb_sectors); + return 0; } + +static bool iscsi_allocationmap_is_allocated(IscsiLun *iscsilun, + int64_t sector_num, int nb_sectors) +{ + unsigned long size; + if (iscsilun->allocationmap == NULL) { + return true; + } + size = DIV_ROUND_UP(sector_num + nb_sectors, iscsilun->cluster_sectors); + return !(find_next_bit(iscsilun->allocationmap, size, + sector_num / iscsilun->cluster_sectors) == size); +} + + +#if defined(LIBISCSI_FEATURE_IOVECTOR) + +static int64_t coroutine_fn iscsi_co_get_block_status(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors, int *pnum) +{ + IscsiLun *iscsilun = bs->opaque; + struct scsi_get_lba_status *lbas = NULL; + struct scsi_lba_status_descriptor *lbasd = NULL; + struct IscsiTask iTask; + int64_t ret; + + iscsi_co_init_iscsitask(iscsilun, &iTask); + + if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { + ret = -EINVAL; + goto out; + } + + /* default to all sectors allocated */ + ret = BDRV_BLOCK_DATA; + ret |= (sector_num << BDRV_SECTOR_BITS) | BDRV_BLOCK_OFFSET_VALID; + *pnum = nb_sectors; + + /* LUN does not support logical block provisioning */ + if (iscsilun->lbpme == 0) { + goto out; + } + +retry: + if (iscsi_get_lba_status_task(iscsilun->iscsi, iscsilun->lun, + sector_qemu2lun(sector_num, iscsilun), + 8 + 16, iscsi_co_generic_cb, + &iTask) == NULL) { + ret = -ENOMEM; + goto out; + } + + while (!iTask.complete) { + iscsi_set_events(iscsilun); + qemu_coroutine_yield(); + } + + if (iTask.do_retry) { + if (iTask.task != NULL) { + scsi_free_scsi_task(iTask.task); + iTask.task = NULL; + } + iTask.complete = 0; + goto retry; + } + + if (iTask.status != 
SCSI_STATUS_GOOD) { + /* in case the get_lba_status_callout fails (i.e. + * because the device is busy or the cmd is not + * supported) we pretend all blocks are allocated + * for backwards compatibility */ + goto out; + } + + lbas = scsi_datain_unmarshall(iTask.task); + if (lbas == NULL) { + ret = -EIO; + goto out; + } + + lbasd = &lbas->descriptors[0]; + + if (sector_qemu2lun(sector_num, iscsilun) != lbasd->lba) { + ret = -EIO; + goto out; + } + + *pnum = sector_lun2qemu(lbasd->num_blocks, iscsilun); + + if (lbasd->provisioning == SCSI_PROVISIONING_TYPE_DEALLOCATED || + lbasd->provisioning == SCSI_PROVISIONING_TYPE_ANCHORED) { + ret &= ~BDRV_BLOCK_DATA; + if (iscsilun->lbprz) { + ret |= BDRV_BLOCK_ZERO; + } + } + + if (ret & BDRV_BLOCK_ZERO) { + iscsi_allocationmap_clear(iscsilun, sector_num, *pnum); + } else { + iscsi_allocationmap_set(iscsilun, sector_num, *pnum); + } + + if (*pnum > nb_sectors) { + *pnum = nb_sectors; + } +out: + if (iTask.task != NULL) { + scsi_free_scsi_task(iTask.task); + } + return ret; +} + +#endif /* LIBISCSI_FEATURE_IOVECTOR */ + + static int coroutine_fn iscsi_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *iov) @@ -355,6 +504,22 @@ static int coroutine_fn iscsi_co_readv(BlockDriverState *bs, return -EINVAL; } +#if defined(LIBISCSI_FEATURE_IOVECTOR) + if (iscsilun->lbprz && nb_sectors > ISCSI_CHECKALLOC_THRES && + !iscsi_allocationmap_is_allocated(iscsilun, sector_num, nb_sectors)) { + int64_t ret; + int pnum; + ret = iscsi_co_get_block_status(bs, sector_num, INT_MAX, &pnum); + if (ret < 0) { + return ret; + } + if (ret & BDRV_BLOCK_ZERO && pnum >= nb_sectors) { + qemu_iovec_memset(iov, 0, 0x00, iov->size); + return 0; + } + } +#endif + lba = sector_qemu2lun(sector_num, iscsilun); num_sectors = sector_qemu2lun(nb_sectors, iscsilun); @@ -643,101 +808,6 @@ iscsi_getlength(BlockDriverState *bs) return len; } -#if defined(LIBISCSI_FEATURE_IOVECTOR) - -static int64_t coroutine_fn 
iscsi_co_get_block_status(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, int *pnum) -{ - IscsiLun *iscsilun = bs->opaque; - struct scsi_get_lba_status *lbas = NULL; - struct scsi_lba_status_descriptor *lbasd = NULL; - struct IscsiTask iTask; - int64_t ret; - - iscsi_co_init_iscsitask(iscsilun, &iTask); - - if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { - ret = -EINVAL; - goto out; - } - - /* default to all sectors allocated */ - ret = BDRV_BLOCK_DATA; - ret |= (sector_num << BDRV_SECTOR_BITS) | BDRV_BLOCK_OFFSET_VALID; - *pnum = nb_sectors; - - /* LUN does not support logical block provisioning */ - if (iscsilun->lbpme == 0) { - goto out; - } - -retry: - if (iscsi_get_lba_status_task(iscsilun->iscsi, iscsilun->lun, - sector_qemu2lun(sector_num, iscsilun), - 8 + 16, iscsi_co_generic_cb, - &iTask) == NULL) { - ret = -ENOMEM; - goto out; - } - - while (!iTask.complete) { - iscsi_set_events(iscsilun); - qemu_coroutine_yield(); - } - - if (iTask.do_retry) { - if (iTask.task != NULL) { - scsi_free_scsi_task(iTask.task); - iTask.task = NULL; - } - iTask.complete = 0; - goto retry; - } - - if (iTask.status != SCSI_STATUS_GOOD) { - /* in case the get_lba_status_callout fails (i.e. 
- * because the device is busy or the cmd is not - * supported) we pretend all blocks are allocated - * for backwards compatibility */ - goto out; - } - - lbas = scsi_datain_unmarshall(iTask.task); - if (lbas == NULL) { - ret = -EIO; - goto out; - } - - lbasd = &lbas->descriptors[0]; - - if (sector_qemu2lun(sector_num, iscsilun) != lbasd->lba) { - ret = -EIO; - goto out; - } - - *pnum = sector_lun2qemu(lbasd->num_blocks, iscsilun); - if (*pnum > nb_sectors) { - *pnum = nb_sectors; - } - - if (lbasd->provisioning == SCSI_PROVISIONING_TYPE_DEALLOCATED || - lbasd->provisioning == SCSI_PROVISIONING_TYPE_ANCHORED) { - ret &= ~BDRV_BLOCK_DATA; - if (iscsilun->lbprz) { - ret |= BDRV_BLOCK_ZERO; - } - } - -out: - if (iTask.task != NULL) { - scsi_free_scsi_task(iTask.task); - } - return ret; -} - -#endif /* LIBISCSI_FEATURE_IOVECTOR */ - static int coroutine_fn iscsi_co_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) @@ -791,6 +861,8 @@ retry: return -EIO; } + iscsi_allocationmap_clear(iscsilun, sector_num, nb_sectors); + return 0; } @@ -865,6 +937,12 @@ retry: return -EIO; } + if (flags & BDRV_REQ_MAY_UNMAP) { + iscsi_allocationmap_clear(iscsilun, sector_num, nb_sectors); + } else { + iscsi_allocationmap_set(iscsilun, sector_num, nb_sectors); + } + return 0; } @@ -1297,6 +1375,22 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags, timer_mod(iscsilun->nop_timer, qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + NOP_INTERVAL); #endif + /* Guess the internal cluster (page) size of the iscsi target by the means + * of opt_unmap_gran. 
Transfer the unmap granularity only if it has a + * reasonable size */ + if (iscsilun->bl.opt_unmap_gran * iscsilun->block_size >= 64 * 1024 && + iscsilun->bl.opt_unmap_gran * iscsilun->block_size <= 16 * 1024 * 1024) { + iscsilun->cluster_sectors = (iscsilun->bl.opt_unmap_gran * + iscsilun->block_size) >> BDRV_SECTOR_BITS; +#if defined(LIBISCSI_FEATURE_IOVECTOR) + if (iscsilun->lbprz && !(bs->open_flags & BDRV_O_NOCACHE)) { + iscsilun->allocationmap = + bitmap_new(DIV_ROUND_UP(bs->total_sectors, + iscsilun->cluster_sectors)); + } +#endif + } + out: qemu_opts_del(opts); if (initiator_name != NULL) { @@ -1330,6 +1424,7 @@ static void iscsi_close(BlockDriverState *bs) qemu_aio_set_fd_handler(iscsi_get_fd(iscsi), NULL, NULL, NULL); iscsi_destroy_context(iscsi); g_free(iscsilun->zeroblock); + g_free(iscsilun->allocationmap); memset(iscsilun, 0, sizeof(IscsiLun)); } @@ -1390,6 +1485,13 @@ static int iscsi_truncate(BlockDriverState *bs, int64_t offset) return -EINVAL; } + if (iscsilun->allocationmap != NULL) { + g_free(iscsilun->allocationmap); + iscsilun->allocationmap = + bitmap_new(DIV_ROUND_UP(bs->total_sectors, + iscsilun->cluster_sectors)); + } + return 0; } @@ -1452,13 +1554,7 @@ static int iscsi_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) IscsiLun *iscsilun = bs->opaque; bdi->unallocated_blocks_are_zero = !!iscsilun->lbprz; bdi->can_write_zeroes_with_unmap = iscsilun->lbprz && iscsilun->lbp.lbpws; - /* Guess the internal cluster (page) size of the iscsi target by the means - * of opt_unmap_gran. 
Transfer the unmap granularity only if it has a - * reasonable size for bdi->cluster_size */ - if (iscsilun->bl.opt_unmap_gran * iscsilun->block_size >= 64 * 1024 && - iscsilun->bl.opt_unmap_gran * iscsilun->block_size <= 16 * 1024 * 1024) { - bdi->cluster_size = iscsilun->bl.opt_unmap_gran * iscsilun->block_size; - } + bdi->cluster_size = iscsilun->cluster_sectors * BDRV_SECTOR_SIZE; return 0; } From 5917af812e9c4bd6500927b26efe8d3e2f267bd0 Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Mon, 28 Apr 2014 17:11:33 +0200 Subject: [PATCH 09/11] block/iscsi: clarify the meaning of ISCSI_CHECKALLOC_THRES Signed-off-by: Peter Lieven Signed-off-by: Paolo Bonzini --- block/iscsi.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/block/iscsi.c b/block/iscsi.c index 6b8a0a34f7..e8d26bb781 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -95,7 +95,15 @@ typedef struct IscsiAIOCB { #define NOP_INTERVAL 5000 #define MAX_NOP_FAILURES 3 #define ISCSI_CMD_RETRIES 5 -#define ISCSI_CHECKALLOC_THRES 63 + +/* this threshold is a trade-off knob to choose between + * the potential additional overhead of an extra GET_LBA_STATUS request + * vs. unnecessarily reading a lot of zero sectors over the wire. + * If a read request is greater than or equal to ISCSI_CHECKALLOC_THRES + * sectors we check the allocation status of the area covered by the + * request first if the allocationmap indicates that the area might be + * unallocated. 
*/ +#define ISCSI_CHECKALLOC_THRES 64 static void iscsi_bh_cb(void *p) @@ -505,7 +513,7 @@ static int coroutine_fn iscsi_co_readv(BlockDriverState *bs, } #if defined(LIBISCSI_FEATURE_IOVECTOR) - if (iscsilun->lbprz && nb_sectors > ISCSI_CHECKALLOC_THRES && + if (iscsilun->lbprz && nb_sectors >= ISCSI_CHECKALLOC_THRES && !iscsi_allocationmap_is_allocated(iscsilun, sector_num, nb_sectors)) { int64_t ret; int pnum; From 3d2acaa308bfab65329ef983654b302899bfb2b0 Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Mon, 28 Apr 2014 17:18:32 +0200 Subject: [PATCH 10/11] block/iscsi: allow cluster_size of 4K and greater Depending on the target, the opt_unmap_gran might be as low as 4K. As we now also use this as a knob to activate the allocationmap feature, lower the barrier. The limit of 4K (and not 512) is chosen to avoid a potentially too big allocationmap. Signed-off-by: Peter Lieven Signed-off-by: Paolo Bonzini --- block/iscsi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/iscsi.c b/block/iscsi.c index e8d26bb781..84bedfa8fc 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -1386,7 +1386,7 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags, /* Guess the internal cluster (page) size of the iscsi target by the means * of opt_unmap_gran. 
Transfer the unmap granularity only if it has a * reasonable size */ - if (iscsilun->bl.opt_unmap_gran * iscsilun->block_size >= 64 * 1024 && + if (iscsilun->bl.opt_unmap_gran * iscsilun->block_size >= 4 * 1024 && iscsilun->bl.opt_unmap_gran * iscsilun->block_size <= 16 * 1024 * 1024) { iscsilun->cluster_sectors = (iscsilun->bl.opt_unmap_gran * iscsilun->block_size) >> BDRV_SECTOR_BITS; From 6a86dec61921163b6ab582df988416a6f0ca0ed5 Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Sun, 4 May 2014 21:36:08 +0200 Subject: [PATCH 11/11] [PATCH] block/iscsi: bump year in copyright notice Signed-off-by: Peter Lieven --- block/iscsi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/iscsi.c b/block/iscsi.c index 84bedfa8fc..65bf97d674 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -2,7 +2,7 @@ * QEMU Block driver for iSCSI images * * Copyright (c) 2010-2011 Ronnie Sahlberg - * Copyright (c) 2012-2013 Peter Lieven + * Copyright (c) 2012-2014 Peter Lieven * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal