qemu/system/dma-helpers.c

348 lines
9.8 KiB
C
Raw Normal View History

/*
* DMA helper functions
*
* Copyright (c) 2009,2020 Red Hat
*
* This work is licensed under the terms of the GNU General Public License
* (GNU GPL), version 2 or later.
*/
#include "qemu/osdep.h"
#include "sysemu/block-backend.h"
#include "sysemu/dma.h"
#include "trace/trace-root.h"
#include "qemu/thread.h"
#include "qemu/main-loop.h"
#include "sysemu/cpu-timers.h"
#include "qemu/range.h"
/* #define DEBUG_IOMMU */
MemTxResult dma_memory_set(AddressSpace *as, dma_addr_t addr,
uint8_t c, dma_addr_t len, MemTxAttrs attrs)
iommu: Add universal DMA helper functions Not that long ago, every device implementation using DMA directly accessed guest memory using cpu_physical_memory_*(). This meant that adding support for a guest visible IOMMU would require changing every one of these devices to go through IOMMU translation. Shortly before qemu 1.0, I made a start on fixing this by providing helper functions for PCI DMA. These are currently just stubs which call the direct access functions, but mean that an IOMMU can be implemented in one place, rather than for every PCI device. Clearly, this doesn't help for non PCI devices, which could also be IOMMU translated on some platforms. It is also problematic for the devices which have both PCI and non-PCI version (e.g. OHCI, AHCI) - we cannot use the the pci_dma_*() functions, because they assume the presence of a PCIDevice, but we don't want to have to check between pci_dma_*() and cpu_physical_memory_*() every time we do a DMA in the device code. This patch makes the first step on addressing both these problems, by introducing new (stub) dma helper functions which can be used for any DMA capable device. These dma functions take a DMAContext *, a new (currently empty) variable describing the DMA address space in which the operation is to take place. NULL indicates untranslated DMA directly into guest physical address space. The intention is that in future non-NULL values will given information about any necessary IOMMU translation. DMA using devices must obtain a DMAContext (or, potentially, contexts) from their bus or platform. For now this patch just converts the PCI wrappers to be implemented in terms of the universal wrappers, converting other drivers can take place over time. Cc: Michael S. Tsirkin <mst@redhat.com> Cc: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro> Cc: Richard Henderson <rth@twiddle.net> Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org> Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
2012-06-27 08:50:38 +04:00
{
dma_barrier(as, DMA_DIRECTION_FROM_DEVICE);
return address_space_set(as, addr, c, len, attrs);
iommu: Add universal DMA helper functions Not that long ago, every device implementation using DMA directly accessed guest memory using cpu_physical_memory_*(). This meant that adding support for a guest visible IOMMU would require changing every one of these devices to go through IOMMU translation. Shortly before qemu 1.0, I made a start on fixing this by providing helper functions for PCI DMA. These are currently just stubs which call the direct access functions, but mean that an IOMMU can be implemented in one place, rather than for every PCI device. Clearly, this doesn't help for non PCI devices, which could also be IOMMU translated on some platforms. It is also problematic for the devices which have both PCI and non-PCI version (e.g. OHCI, AHCI) - we cannot use the the pci_dma_*() functions, because they assume the presence of a PCIDevice, but we don't want to have to check between pci_dma_*() and cpu_physical_memory_*() every time we do a DMA in the device code. This patch makes the first step on addressing both these problems, by introducing new (stub) dma helper functions which can be used for any DMA capable device. These dma functions take a DMAContext *, a new (currently empty) variable describing the DMA address space in which the operation is to take place. NULL indicates untranslated DMA directly into guest physical address space. The intention is that in future non-NULL values will given information about any necessary IOMMU translation. DMA using devices must obtain a DMAContext (or, potentially, contexts) from their bus or platform. For now this patch just converts the PCI wrappers to be implemented in terms of the universal wrappers, converting other drivers can take place over time. Cc: Michael S. Tsirkin <mst@redhat.com> Cc: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro> Cc: Richard Henderson <rth@twiddle.net> Signed-off-by: David Gibson <david@gibson.dropbear.id.au> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org> Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
2012-06-27 08:50:38 +04:00
}
void qemu_sglist_init(QEMUSGList *qsg, DeviceState *dev, int alloc_hint,
AddressSpace *as)
{
qsg->sg = g_new(ScatterGatherEntry, alloc_hint);
qsg->nsg = 0;
qsg->nalloc = alloc_hint;
qsg->size = 0;
qsg->as = as;
qsg->dev = dev;
object_ref(OBJECT(dev));
}
void qemu_sglist_add(QEMUSGList *qsg, dma_addr_t base, dma_addr_t len)
{
if (qsg->nsg == qsg->nalloc) {
qsg->nalloc = 2 * qsg->nalloc + 1;
qsg->sg = g_renew(ScatterGatherEntry, qsg->sg, qsg->nalloc);
}
qsg->sg[qsg->nsg].base = base;
qsg->sg[qsg->nsg].len = len;
qsg->size += len;
++qsg->nsg;
}
void qemu_sglist_destroy(QEMUSGList *qsg)
{
object_unref(OBJECT(qsg->dev));
g_free(qsg->sg);
memset(qsg, 0, sizeof(*qsg));
}
typedef struct {
BlockAIOCB common;
AioContext *ctx;
BlockAIOCB *acb;
QEMUSGList *sg;
uint32_t align;
uint64_t offset;
DMADirection dir;
int sg_cur_index;
dma_addr_t sg_cur_byte;
QEMUIOVector iov;
QEMUBH *bh;
DMAIOFunc *io_func;
void *io_func_opaque;
} DMAAIOCB;
static void dma_blk_cb(void *opaque, int ret);
static void reschedule_dma(void *opaque)
{
DMAAIOCB *dbs = (DMAAIOCB *)opaque;
assert(!dbs->acb && dbs->bh);
qemu_bh_delete(dbs->bh);
dbs->bh = NULL;
dma_blk_cb(dbs, 0);
}
static void dma_blk_unmap(DMAAIOCB *dbs)
{
int i;
for (i = 0; i < dbs->iov.niov; ++i) {
dma_memory_unmap(dbs->sg->as, dbs->iov.iov[i].iov_base,
dbs->iov.iov[i].iov_len, dbs->dir,
dbs->iov.iov[i].iov_len);
}
qemu_iovec_reset(&dbs->iov);
}
static void dma_complete(DMAAIOCB *dbs, int ret)
{
trace_dma_complete(dbs, ret, dbs->common.cb);
assert(!dbs->acb && !dbs->bh);
dma_blk_unmap(dbs);
if (dbs->common.cb) {
dbs->common.cb(dbs->common.opaque, ret);
}
qemu_iovec_destroy(&dbs->iov);
qemu_aio_unref(dbs);
}
static void dma_blk_cb(void *opaque, int ret)
{
DMAAIOCB *dbs = (DMAAIOCB *)opaque;
AioContext *ctx = dbs->ctx;
dma_addr_t cur_addr, cur_len;
void *mem;
trace_dma_blk_cb(dbs, ret);
/* DMAAIOCB is not thread-safe and must be accessed only from dbs->ctx */
assert(ctx == qemu_get_current_aio_context());
dbs->acb = NULL;
dbs->offset += dbs->iov.size;
if (dbs->sg_cur_index == dbs->sg->nsg || ret < 0) {
dma_complete(dbs, ret);
return;
}
dma_blk_unmap(dbs);
while (dbs->sg_cur_index < dbs->sg->nsg) {
cur_addr = dbs->sg->sg[dbs->sg_cur_index].base + dbs->sg_cur_byte;
cur_len = dbs->sg->sg[dbs->sg_cur_index].len - dbs->sg_cur_byte;
mem = dma_memory_map(dbs->sg->as, cur_addr, &cur_len, dbs->dir,
MEMTXATTRS_UNSPECIFIED);
/*
* Make reads deterministic in icount mode. Windows sometimes issues
* disk read requests with overlapping SGs. It leads
* to non-determinism, because resulting buffer contents may be mixed
* from several sectors. This code splits all SGs into several
* groups. SGs in every group do not overlap.
*/
if (mem && icount_enabled() && dbs->dir == DMA_DIRECTION_FROM_DEVICE) {
int i;
for (i = 0 ; i < dbs->iov.niov ; ++i) {
if (ranges_overlap((intptr_t)dbs->iov.iov[i].iov_base,
dbs->iov.iov[i].iov_len, (intptr_t)mem,
cur_len)) {
dma_memory_unmap(dbs->sg->as, mem, cur_len,
dbs->dir, cur_len);
mem = NULL;
break;
}
}
}
if (!mem)
break;
qemu_iovec_add(&dbs->iov, mem, cur_len);
dbs->sg_cur_byte += cur_len;
if (dbs->sg_cur_byte == dbs->sg->sg[dbs->sg_cur_index].len) {
dbs->sg_cur_byte = 0;
++dbs->sg_cur_index;
}
}
if (dbs->iov.size == 0) {
trace_dma_map_wait(dbs);
dbs->bh = aio_bh_new(ctx, reschedule_dma, dbs);
cpu_register_map_client(dbs->bh);
return;
}
if (!QEMU_IS_ALIGNED(dbs->iov.size, dbs->align)) {
qemu_iovec_discard_back(&dbs->iov,
QEMU_ALIGN_DOWN(dbs->iov.size, dbs->align));
}
dbs->acb = dbs->io_func(dbs->offset, &dbs->iov,
dma_blk_cb, dbs, dbs->io_func_opaque);
assert(dbs->acb);
}
static void dma_aio_cancel(BlockAIOCB *acb)
{
DMAAIOCB *dbs = container_of(acb, DMAAIOCB, common);
trace_dma_aio_cancel(dbs);
assert(!(dbs->acb && dbs->bh));
if (dbs->acb) {
/* This will invoke dma_blk_cb. */
blk_aio_cancel_async(dbs->acb);
return;
}
if (dbs->bh) {
cpu_unregister_map_client(dbs->bh);
qemu_bh_delete(dbs->bh);
dbs->bh = NULL;
}
if (dbs->common.cb) {
dbs->common.cb(dbs->common.opaque, -ECANCELED);
}
}
static const AIOCBInfo dma_aiocb_info = {
.aiocb_size = sizeof(DMAAIOCB),
.cancel_async = dma_aio_cancel,
};
BlockAIOCB *dma_blk_io(AioContext *ctx,
QEMUSGList *sg, uint64_t offset, uint32_t align,
DMAIOFunc *io_func, void *io_func_opaque,
BlockCompletionFunc *cb,
void *opaque, DMADirection dir)
{
DMAAIOCB *dbs = qemu_aio_get(&dma_aiocb_info, NULL, cb, opaque);
trace_dma_blk_io(dbs, io_func_opaque, offset, (dir == DMA_DIRECTION_TO_DEVICE));
dbs->acb = NULL;
dbs->sg = sg;
dbs->ctx = ctx;
dbs->offset = offset;
dbs->align = align;
dbs->sg_cur_index = 0;
dbs->sg_cur_byte = 0;
dbs->dir = dir;
dbs->io_func = io_func;
dbs->io_func_opaque = io_func_opaque;
dbs->bh = NULL;
qemu_iovec_init(&dbs->iov, sg->nsg);
dma_blk_cb(dbs, 0);
return &dbs->common;
}
static
BlockAIOCB *dma_blk_read_io_func(int64_t offset, QEMUIOVector *iov,
BlockCompletionFunc *cb, void *cb_opaque,
void *opaque)
{
BlockBackend *blk = opaque;
return blk_aio_preadv(blk, offset, iov, 0, cb, cb_opaque);
}
BlockAIOCB *dma_blk_read(BlockBackend *blk,
QEMUSGList *sg, uint64_t offset, uint32_t align,
void (*cb)(void *opaque, int ret), void *opaque)
{
return dma_blk_io(blk_get_aio_context(blk), sg, offset, align,
dma_blk_read_io_func, blk, cb, opaque,
DMA_DIRECTION_FROM_DEVICE);
}
static
BlockAIOCB *dma_blk_write_io_func(int64_t offset, QEMUIOVector *iov,
BlockCompletionFunc *cb, void *cb_opaque,
void *opaque)
{
BlockBackend *blk = opaque;
return blk_aio_pwritev(blk, offset, iov, 0, cb, cb_opaque);
}
BlockAIOCB *dma_blk_write(BlockBackend *blk,
QEMUSGList *sg, uint64_t offset, uint32_t align,
void (*cb)(void *opaque, int ret), void *opaque)
{
return dma_blk_io(blk_get_aio_context(blk), sg, offset, align,
dma_blk_write_io_func, blk, cb, opaque,
DMA_DIRECTION_TO_DEVICE);
}
static MemTxResult dma_buf_rw(void *buf, dma_addr_t len, dma_addr_t *residual,
QEMUSGList *sg, DMADirection dir,
MemTxAttrs attrs)
{
uint8_t *ptr = buf;
dma_addr_t xresidual;
int sg_cur_index;
MemTxResult res = MEMTX_OK;
xresidual = sg->size;
sg_cur_index = 0;
len = MIN(len, xresidual);
while (len > 0) {
ScatterGatherEntry entry = sg->sg[sg_cur_index++];
dma_addr_t xfer = MIN(len, entry.len);
res |= dma_memory_rw(sg->as, entry.base, ptr, xfer, dir, attrs);
ptr += xfer;
len -= xfer;
xresidual -= xfer;
}
if (residual) {
*residual = xresidual;
}
return res;
}
MemTxResult dma_buf_read(void *ptr, dma_addr_t len, dma_addr_t *residual,
QEMUSGList *sg, MemTxAttrs attrs)
{
return dma_buf_rw(ptr, len, residual, sg, DMA_DIRECTION_FROM_DEVICE, attrs);
}
MemTxResult dma_buf_write(void *ptr, dma_addr_t len, dma_addr_t *residual,
QEMUSGList *sg, MemTxAttrs attrs)
{
return dma_buf_rw(ptr, len, residual, sg, DMA_DIRECTION_TO_DEVICE, attrs);
}
void dma_acct_start(BlockBackend *blk, BlockAcctCookie *cookie,
QEMUSGList *sg, enum BlockAcctType type)
{
block_acct_start(blk_get_stats(blk), cookie, sg->size, type);
}
uint64_t dma_aligned_pow2_mask(uint64_t start, uint64_t end, int max_addr_bits)
{
uint64_t max_mask = UINT64_MAX, addr_mask = end - start;
uint64_t alignment_mask, size_mask;
if (max_addr_bits != 64) {
max_mask = (1ULL << max_addr_bits) - 1;
}
alignment_mask = start ? (start & -start) - 1 : max_mask;
alignment_mask = MIN(alignment_mask, max_mask);
size_mask = MIN(addr_mask, max_mask);
if (alignment_mask <= size_mask) {
/* Increase the alignment of start */
return alignment_mask;
} else {
/* Find the largest page mask from size */
if (addr_mask == UINT64_MAX) {
return UINT64_MAX;
}
return (1ULL << (63 - clz64(addr_mask + 1))) - 1;
}
}