convert the (still disabled) 'direct write' for pipes to use the

experimental PMAP_DIRECT if available; the direct code paths now survive
longer than the pmap_enter() variant, but still trigger a panic during a
build.sh tools run; remove some obsolete sysctls
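
For orientation, the new read path condenses to roughly the following
(lifted from the sys_pipe.c hunks below, with locking and error handling
elided; this is a sketch, not a drop-in copy): instead of entering the
loaned pages into pipe kva with pmap_kenter_pa(), the reader hands them to
uvm_direct_process(), which runs the callback on each page through the
pmap's direct map.

    /* Condensed sketch of the new read-side logic (see the diff below). */
    static int
    pipe_direct_process_read(void *va, size_t len, void *arg)
    {
        struct uio *uio = (struct uio *)arg;

        /* va points into the direct map; copy straight to the reader. */
        return uiomove(va, len, uio);
    }

        /* ... in pipe_read(), when a direct write is pending: */
        size = MIN(rmap->cnt, uio->uio_resid);
        pgst = rmap->pos >> PAGE_SHIFT;
        pgoff = rmap->pos & PAGE_MASK;
        npages = (size + pgoff + PAGE_SIZE - 1) >> PAGE_SHIFT;
        error = uvm_direct_process(&rmap->pgs[pgst], npages, pgoff, size,
            pipe_direct_process_read, uio);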

add some XXXs to mark places which need attention to make this more stable

Note: the loan case is now actually significantly slower than the
non-loan case on MP systems, due to the synchronous IPIs triggered when
uvm_loan() marks the pages read-only; this is being discussed in the
email thread
https://mail-index.netbsd.org/tech-kern/2018/05/21/msg023441.html

This is basically the same issue that led to loaning being disabled
for sosend().
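
For context, the write side now only loans the pages and unloans them once
the reader has drained the data (condensed from pipe_direct_write() in the
diff below); the slowdown noted above comes from uvm_loan() write-protecting
the source pages, which on MP machines means a TLB shootdown per direct write:

        /* Loan the write buffer pages from the writer's address space.
         * uvm_loan() marks them read-only, which is what triggers the
         * synchronous IPIs (TLB shootdown) on MP systems. */
        error = uvm_loan(&uio->uio_vmspace->vm_map, base, blen,
            wmap->pgs, UVM_LOAN_TOPAGE);

        /* Once the reader is done, give the pages back. */
        uvm_unloan(wmap->pgs, npages, UVM_LOAN_TOPAGE);
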
jdolecek 2018-06-10 17:54:51 +00:00
parent b8cc3839ab
commit 3d8c9e20d7
2 changed files with 83 additions and 110 deletions

sys_pipe.c

@@ -1,4 +1,4 @@
-/* $NetBSD: sys_pipe.c,v 1.145 2018/05/19 11:39:37 jdolecek Exp $ */
+/* $NetBSD: sys_pipe.c,v 1.146 2018/06/10 17:54:51 jdolecek Exp $ */
 
 /*-
  * Copyright (c) 2003, 2007, 2008, 2009 The NetBSD Foundation, Inc.
@@ -68,7 +68,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: sys_pipe.c,v 1.145 2018/05/19 11:39:37 jdolecek Exp $");
+__KERNEL_RCSID(0, "$NetBSD: sys_pipe.c,v 1.146 2018/06/10 17:54:51 jdolecek Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -102,6 +102,12 @@ __KERNEL_RCSID(0, "$NetBSD: sys_pipe.c,v 1.145 2018/05/19 11:39:37 jdolecek Exp
 #ifndef PIPE_NODIRECT
 #include <uvm/uvm.h>
+#if !defined(PMAP_DIRECT)
+# define PIPE_NODIRECT	/* Direct map interface not available */
+#endif
+
+bool pipe_direct = true;
+
 #endif
 
 static int pipe_read(file_t *, off_t *, struct uio *, kauth_cred_t, int);
@@ -135,20 +141,6 @@ static const struct fileops pipeops = {
 #define MINPIPESIZE (PIPE_SIZE / 3)
 #define MAXPIPESIZE (2 * PIPE_SIZE / 3)
 
-/*
- * Maximum amount of kva for pipes -- this is kind-of a soft limit, but
- * is there so that on large systems, we don't exhaust it.
- */
-#define MAXPIPEKVA (8 * 1024 * 1024)
-static u_int maxpipekva = MAXPIPEKVA;
-
-/*
- * Limit for direct transfers, we cannot, of course limit
- * the amount of kva for pipes in general though.
- */
-#define LIMITPIPEKVA (16 * 1024 * 1024)
-static u_int limitpipekva = LIMITPIPEKVA;
-
 /*
  * Limit the number of "big" pipes
  */
@@ -177,6 +169,7 @@ static void pipe_dtor(void *, void *);
 #ifndef PIPE_NODIRECT
 static int pipe_loan_alloc(struct pipe *, int);
 static void pipe_loan_free(struct pipe *);
+static int pipe_direct_process_read(void *, size_t, void *);
 #endif /* PIPE_NODIRECT */
 
 static pool_cache_t pipe_wr_cache;
@@ -446,6 +439,16 @@ pipeselwakeup(struct pipe *selp, struct pipe *sigp, int code)
     fownsignal(sigp->pipe_pgid, SIGIO, code, band, selp);
 }
 
+#ifndef PIPE_NODIRECT
+static int
+pipe_direct_process_read(void *va, size_t len, void *arg)
+{
+    struct uio *uio = (struct uio *)arg;
+
+    return uiomove(va, len, uio);
+}
+#endif
+
 static int
 pipe_read(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred,
     int flags)
@@ -507,30 +510,39 @@ again:
 #ifndef PIPE_NODIRECT
         if ((rpipe->pipe_state & PIPE_DIRECTR) != 0) {
             struct pipemapping * const rmap = &rpipe->pipe_map;
+            voff_t pgoff;
+            u_int pgst, npages;
             /*
              * Direct copy, bypassing a kernel buffer.
              */
-            void *va;
             KASSERT(rpipe->pipe_state & PIPE_DIRECTW);
-            size = rmap->cnt;
-            if (size > uio->uio_resid)
-                size = uio->uio_resid;
-
-            va = (char *)rmap->kva + rmap->pos;
-            mutex_exit(lock);
-            error = uiomove(va, size, uio);
-            mutex_enter(lock);
-            if (error)
-                break;
-
-            nread += size;
-            rmap->pos += size;
-            rmap->cnt -= size;
+            size = MIN(rmap->cnt, uio->uio_resid);
+
+            if (size > 0) {
+                KASSERT(size > 0);
+                mutex_exit(lock);
+
+                pgst = rmap->pos >> PAGE_SHIFT;
+                pgoff = rmap->pos & PAGE_MASK;
+                npages = (size + pgoff + PAGE_SIZE - 1) >> PAGE_SHIFT;
+                KASSERTMSG(npages > 0 && (pgst + npages) <= rmap->npages, "npages %u pgst %u rmap->npages %u", npages, pgst, rmap->npages);
+                error = uvm_direct_process(&rmap->pgs[pgst], npages,
+                    pgoff, size, pipe_direct_process_read, uio);
+                mutex_enter(lock);
+
+                nread += size;
+                rmap->pos += size;
+                rmap->cnt -= size;
+            }
 
             if (rmap->cnt == 0) {
                 rpipe->pipe_state &= ~PIPE_DIRECTR;
                 cv_broadcast(&rpipe->pipe_wcv);
             }
 
             continue;
         }
 #endif
@@ -630,18 +642,20 @@ static int
 pipe_loan_alloc(struct pipe *wpipe, int npages)
 {
     struct pipemapping * const wmap = &wpipe->pipe_map;
-    const vsize_t len = ptoa(npages);
 
-    atomic_add_int(&amountpipekva, len);
-    wmap->kva = uvm_km_alloc(kernel_map, len, 0,
-        UVM_KMF_COLORMATCH | UVM_KMF_VAONLY | UVM_KMF_WAITVA);
-    if (wmap->kva == 0) {
-        atomic_add_int(&amountpipekva, -len);
-        return (ENOMEM);
+    KASSERT(wmap->npages == 0);
+
+    if (npages > wmap->maxpages) {
+        pipe_loan_free(wpipe);
+
+        wmap->pgs = kmem_alloc(npages * sizeof(struct vm_page *), KM_NOSLEEP);
+        if (wmap->pgs == NULL)
+            return ENOMEM;
+        wmap->maxpages = npages;
     }
 
     wmap->npages = npages;
-    wmap->pgs = kmem_alloc(npages * sizeof(struct vm_page *), KM_SLEEP);
     return (0);
 }
@@ -652,18 +666,16 @@ static void
 pipe_loan_free(struct pipe *wpipe)
 {
     struct pipemapping * const wmap = &wpipe->pipe_map;
-    const vsize_t len = ptoa(wmap->npages);
 
-    uvm_km_free(kernel_map, wmap->kva, len, UVM_KMF_VAONLY);
-    wmap->kva = 0;
-    atomic_add_int(&amountpipekva, -len);
-    kmem_free(wmap->pgs, wmap->npages * sizeof(struct vm_page *));
-    wmap->pgs = NULL;
-#if 0
+    if (wmap->maxpages > 0) {
+        kmem_free(wmap->pgs, wmap->maxpages * sizeof(struct vm_page *));
+        wmap->pgs = NULL;
+        wmap->maxpages = 0;
+    }
+
     wmap->npages = 0;
     wmap->pos = 0;
     wmap->cnt = 0;
-#endif
 }
 
 /*
@@ -681,20 +693,18 @@ pipe_direct_write(file_t *fp, struct pipe *wpipe, struct uio *uio)
 {
     struct pipemapping * const wmap = &wpipe->pipe_map;
     kmutex_t * const lock = wpipe->pipe_lock;
-    struct vm_page **pgs;
     vaddr_t bbase, base, bend;
     vsize_t blen, bcnt;
     int error, npages;
     voff_t bpos;
-    u_int starting_color;
 
-    KASSERT(mutex_owned(wpipe->pipe_lock));
+    KASSERT(mutex_owned(lock));
     KASSERT(wmap->cnt == 0);
 
     mutex_exit(lock);
 
     /*
-     * Handle first PIPE_CHUNK_SIZE bytes of buffer. Deal with buffers
+     * Handle first PIPE_DIRECT_CHUNK bytes of buffer. Deal with buffers
      * not aligned to PAGE_SIZE.
      */
     bbase = (vaddr_t)uio->uio_iov->iov_base;
@@ -711,43 +721,28 @@ pipe_direct_write(file_t *fp, struct pipe *wpipe, struct uio *uio)
         bcnt = uio->uio_iov->iov_len;
     }
     npages = atop(blen);
-    starting_color = atop(base) & uvmexp.colormask;
 
-    /*
-     * Free the old kva if we need more pages than we have
-     * allocated.
-     */
-    if (wmap->kva != 0 && starting_color + npages > wmap->npages)
-        pipe_loan_free(wpipe);
+    KASSERT((wpipe->pipe_state & (PIPE_DIRECTW | PIPE_DIRECTR)) == 0);
+    KASSERT(wmap->npages == 0);
 
-    /* Allocate new kva. */
-    if (wmap->kva == 0) {
-        error = pipe_loan_alloc(wpipe, starting_color + npages);
-        if (error) {
-            mutex_enter(lock);
-            return (error);
-        }
+    /* Make sure page array is big enough */
+    error = pipe_loan_alloc(wpipe, npages);
+    if (error) {
+        mutex_enter(lock);
+        return (error);
     }
 
     /* Loan the write buffer memory from writer process */
-    pgs = wmap->pgs + starting_color;
     error = uvm_loan(&uio->uio_vmspace->vm_map, base, blen,
-        pgs, UVM_LOAN_TOPAGE);
+        wmap->pgs, UVM_LOAN_TOPAGE);
     if (error) {
         pipe_loan_free(wpipe);
         mutex_enter(lock);
         return (ENOMEM); /* so that caller fallback to ordinary write */
     }
 
-    /* Enter the loaned pages to kva */
-    vaddr_t kva = wpipe->pipe_map.kva;
-    for (int j = 0; j < npages; j++, kva += PAGE_SIZE) {
-        pmap_kenter_pa(kva, VM_PAGE_TO_PHYS(pgs[j]), VM_PROT_READ, 0);
-    }
-    pmap_update(pmap_kernel());
-
     /* Now we can put the pipe in direct write mode */
-    wmap->pos = bpos + ptoa(starting_color);
+    wmap->pos = bpos;
     wmap->cnt = bcnt;
 
     /*
@@ -783,17 +778,13 @@ pipe_direct_write(file_t *fp, struct pipe *wpipe, struct uio *uio)
     /* Acquire the pipe lock and cleanup */
     (void)pipelock(wpipe, false);
     mutex_exit(lock);
 
-    if (pgs != NULL) {
-        pmap_kremove(wpipe->pipe_map.kva, blen);
-        pmap_update(pmap_kernel());
-        uvm_unloan(pgs, npages, UVM_LOAN_TOPAGE);
-    }
-    if (error || amountpipekva > maxpipekva)
-        pipe_loan_free(wpipe);
+    /* XXX what happens if the writer process exits without waiting for reader?
+     * XXX FreeBSD does a clone in this case */
+    uvm_unloan(wmap->pgs, npages, UVM_LOAN_TOPAGE);
 
     mutex_enter(lock);
     if (error) {
         pipeselwakeup(wpipe, wpipe, POLL_ERR);
@@ -914,7 +905,7 @@ pipe_write(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred,
      */
     if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
         (fp->f_flag & FNONBLOCK) == 0 &&
-        (wpipe->pipe_map.kva || (amountpipekva < limitpipekva))) {
+        pipe_direct) {
         error = pipe_direct_write(fp, wpipe, uio);
 
         /*
@@ -1271,12 +1262,8 @@ pipe_free_kmem(struct pipe *pipe)
         pipe->pipe_buffer.buffer = NULL;
     }
 #ifndef PIPE_NODIRECT
-    if (pipe->pipe_map.kva != 0) {
+    if (pipe->pipe_map.npages > 0)
         pipe_loan_free(pipe);
-        pipe->pipe_map.cnt = 0;
-        pipe->pipe_map.pos = 0;
-        pipe->pipe_map.npages = 0;
-    }
 #endif /* !PIPE_NODIRECT */
 }
@@ -1507,19 +1494,6 @@ SYSCTL_SETUP(sysctl_kern_pipe_setup, "sysctl kern.pipe subtree setup")
                NULL, 0, NULL, 0,
                CTL_KERN, KERN_PIPE, CTL_EOL);
 
-    sysctl_createv(clog, 0, NULL, NULL,
-               CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
-               CTLTYPE_INT, "maxkvasz",
-               SYSCTL_DESCR("Maximum amount of kernel memory to be "
-                    "used for pipes"),
-               NULL, 0, &maxpipekva, 0,
-               CTL_KERN, KERN_PIPE, KERN_PIPE_MAXKVASZ, CTL_EOL);
-    sysctl_createv(clog, 0, NULL, NULL,
-               CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
-               CTLTYPE_INT, "maxloankvasz",
-               SYSCTL_DESCR("Limit for direct transfers via page loan"),
-               NULL, 0, &limitpipekva, 0,
-               CTL_KERN, KERN_PIPE, KERN_PIPE_LIMITKVA, CTL_EOL);
     sysctl_createv(clog, 0, NULL, NULL,
                CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
                CTLTYPE_INT, "maxbigpipes",

pipe.h

@@ -1,4 +1,4 @@
-/* $NetBSD: pipe.h,v 1.34 2018/05/19 11:39:37 jdolecek Exp $ */
+/* $NetBSD: pipe.h,v 1.35 2018/06/10 17:54:51 jdolecek Exp $ */
 
 /*
  * Copyright (c) 1996 John S. Dyson
@@ -43,10 +43,9 @@
 #endif
 
 /*
- * Maximum size of kva for direct write transfer. If the amount
+ * Maximum size of transfer for direct write transfer. If the amount
  * of data in buffer is larger, it would be transferred in chunks of this
- * size. This kva memory is freed after use if amount of pipe kva memory
- * is bigger than limitpipekva.
+ * size.
  */
 #ifndef PIPE_DIRECT_CHUNK
 #define PIPE_DIRECT_CHUNK (1*1024*1024)
@@ -77,10 +76,10 @@ struct pipebuf {
  * Information to support direct transfers between processes for pipes.
  */
 struct pipemapping {
-    vaddr_t     kva;        /* kernel virtual address */
     vsize_t     cnt;        /* number of chars in buffer */
     voff_t      pos;        /* current position within page */
-    int         npages;     /* how many pages allocated */
+    u_int       npages;     /* how many pages available */
+    u_int       maxpages;   /* how many pages allocated */
     struct vm_page **pgs;   /* pointers to the pages */
 };
@@ -124,8 +123,8 @@ struct pipe {
 /*
  * KERN_PIPE subtypes
  */
-#define KERN_PIPE_MAXKVASZ      1   /* maximum kva size */
-#define KERN_PIPE_LIMITKVA      2   /* */
+#define KERN_PIPE_MAXKVASZ      1   /* maximum kva size (obsolete) */
+#define KERN_PIPE_LIMITKVA      2   /* limit kva for loans (obsolete) */
 #define KERN_PIPE_MAXBIGPIPES   3   /* maximum # of "big" pipes */
 #define KERN_PIPE_NBIGPIPES     4   /* current number of "big" p. */
 #define KERN_PIPE_KVASIZE       5   /* current pipe kva size */