Module Name:    src
Committed By:   jdolecek
Date:           Sun Jun 10 17:54:51 UTC 2018

Modified Files:
        src/sys/kern: sys_pipe.c
        src/sys/sys: pipe.h

Log Message:
convert the (still disabled) 'direct write' for pipes to use the
experimental PMAP_DIRECT if available; the direct code paths now survive
longer than the pmap_enter() variant, but still triggers panic during
build.sh tools run; remove some obsolete sysctls

add some XXXs to mark places which need attention to make this more stable

Note: the loan case is now actually significantly slower than the
non-loan case on MP systems, due to synchronous IPIs triggered by
marking the page read-only by uvm_loan(); this is being discussed
in the email thread
https://mail-index.netbsd.org/tech-kern/2018/05/21/msg023441.html

that is basically the same issue due to which loaning was disabled
for sosend()


To generate a diff of this commit:
cvs rdiff -u -r1.145 -r1.146 src/sys/kern/sys_pipe.c
cvs rdiff -u -r1.34 -r1.35 src/sys/sys/pipe.h

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/kern/sys_pipe.c
diff -u src/sys/kern/sys_pipe.c:1.145 src/sys/kern/sys_pipe.c:1.146
--- src/sys/kern/sys_pipe.c:1.145	Sat May 19 11:39:37 2018
+++ src/sys/kern/sys_pipe.c	Sun Jun 10 17:54:51 2018
@@ -1,4 +1,4 @@
-/*	$NetBSD: sys_pipe.c,v 1.145 2018/05/19 11:39:37 jdolecek Exp $	*/
+/*	$NetBSD: sys_pipe.c,v 1.146 2018/06/10 17:54:51 jdolecek Exp $	*/
 
 /*-
  * Copyright (c) 2003, 2007, 2008, 2009 The NetBSD Foundation, Inc.
@@ -68,7 +68,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: sys_pipe.c,v 1.145 2018/05/19 11:39:37 jdolecek Exp $");
+__KERNEL_RCSID(0, "$NetBSD: sys_pipe.c,v 1.146 2018/06/10 17:54:51 jdolecek Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -102,6 +102,12 @@ __KERNEL_RCSID(0, "$NetBSD: sys_pipe.c,v
 
 #ifndef PIPE_NODIRECT
 #include <uvm/uvm.h>
+
+#if !defined(PMAP_DIRECT)
+#  define PIPE_NODIRECT		/* Direct map interface not available */
+#endif
+
+bool pipe_direct = true;
 #endif
 
 static int	pipe_read(file_t *, off_t *, struct uio *, kauth_cred_t, int);
@@ -136,20 +142,6 @@ static const struct fileops pipeops = {
 #define	MAXPIPESIZE	(2 * PIPE_SIZE / 3)
 
 /*
- * Maximum amount of kva for pipes -- this is kind-of a soft limit, but
- * is there so that on large systems, we don't exhaust it.
- */
-#define	MAXPIPEKVA	(8 * 1024 * 1024)
-static u_int	maxpipekva = MAXPIPEKVA;
-
-/*
- * Limit for direct transfers, we cannot, of course limit
- * the amount of kva for pipes in general though.
- */
-#define	LIMITPIPEKVA	(16 * 1024 * 1024)
-static u_int	limitpipekva = LIMITPIPEKVA;
-
-/*
  * Limit the number of "big" pipes
  */
 #define	LIMITBIGPIPES	32
@@ -177,6 +169,7 @@ static void	pipe_dtor(void *, void *);
 #ifndef PIPE_NODIRECT
 static int	pipe_loan_alloc(struct pipe *, int);
 static void	pipe_loan_free(struct pipe *);
+static int	pipe_direct_process_read(void *, size_t, void *);
 #endif /* PIPE_NODIRECT */
 
 static pool_cache_t	pipe_wr_cache;
@@ -446,6 +439,16 @@ pipeselwakeup(struct pipe *selp, struct 
 	fownsignal(sigp->pipe_pgid, SIGIO, code, band, selp);
 }
 
+#ifndef PIPE_NODIRECT
+static int
+pipe_direct_process_read(void *va, size_t len, void *arg)
+{
+	struct uio *uio = (struct uio *)arg;
+
+	return uiomove(va, len, uio);
+}
+#endif
+
 static int
 pipe_read(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred,
     int flags)
@@ -507,30 +510,39 @@ again:
 #ifndef PIPE_NODIRECT
 		if ((rpipe->pipe_state & PIPE_DIRECTR) != 0) {
 			struct pipemapping * const rmap = &rpipe->pipe_map;
+			voff_t pgoff;
+			u_int pgst, npages;
+
 			/*
 			 * Direct copy, bypassing a kernel buffer.
 			 */
-			void *va;
-
 			KASSERT(rpipe->pipe_state & PIPE_DIRECTW);
 
-			size = rmap->cnt;
-			if (size > uio->uio_resid)
-				size = uio->uio_resid;
+			size = MIN(rmap->cnt, uio->uio_resid);
+
+			if (size > 0) {
+				KASSERT(size > 0);
+				mutex_exit(lock);
+
+				pgst = rmap->pos >> PAGE_SHIFT;
+				pgoff = rmap->pos & PAGE_MASK;
+				npages = (size + pgoff + PAGE_SIZE - 1) >> PAGE_SHIFT;
+				KASSERTMSG(npages > 0 && (pgst + npages) <= rmap->npages, "npages %u pgst %u rmap->npages %u", npages, pgst, rmap->npages);
+				
+				error = uvm_direct_process(&rmap->pgs[pgst], npages,
+				    pgoff, size, pipe_direct_process_read, uio);
+				mutex_enter(lock);
+
+				nread += size;
+				rmap->pos += size;
+				rmap->cnt -= size;
+			}
 
-			va = (char *)rmap->kva + rmap->pos;
-			mutex_exit(lock);
-			error = uiomove(va, size, uio);
-			mutex_enter(lock);
-			if (error)
-				break;
-			nread += size;
-			rmap->pos += size;
-			rmap->cnt -= size;
 			if (rmap->cnt == 0) {
 				rpipe->pipe_state &= ~PIPE_DIRECTR;
 				cv_broadcast(&rpipe->pipe_wcv);
 			}
+
 			continue;
 		}
 #endif
@@ -630,18 +642,20 @@ static int
 pipe_loan_alloc(struct pipe *wpipe, int npages)
 {
 	struct pipemapping * const wmap = &wpipe->pipe_map;
-	const vsize_t len = ptoa(npages);
 
-	atomic_add_int(&amountpipekva, len);
-	wmap->kva = uvm_km_alloc(kernel_map, len, 0,
-	    UVM_KMF_COLORMATCH | UVM_KMF_VAONLY | UVM_KMF_WAITVA);
-	if (wmap->kva == 0) {
-		atomic_add_int(&amountpipekva, -len);
-		return (ENOMEM);
+	KASSERT(wmap->npages == 0);
+
+	if (npages > wmap->maxpages) {
+		pipe_loan_free(wpipe);
+
+		wmap->pgs = kmem_alloc(npages * sizeof(struct vm_page *), KM_NOSLEEP);
+		if (wmap->pgs == NULL)
+			return ENOMEM;
+		wmap->maxpages = npages;
 	}
 
 	wmap->npages = npages;
-	wmap->pgs = kmem_alloc(npages * sizeof(struct vm_page *), KM_SLEEP);
+
 	return (0);
 }
 
@@ -652,18 +666,16 @@ static void
 pipe_loan_free(struct pipe *wpipe)
 {
 	struct pipemapping * const wmap = &wpipe->pipe_map;
-	const vsize_t len = ptoa(wmap->npages);
 
-	uvm_km_free(kernel_map, wmap->kva, len, UVM_KMF_VAONLY);
-	wmap->kva = 0;
-	atomic_add_int(&amountpipekva, -len);
-	kmem_free(wmap->pgs, wmap->npages * sizeof(struct vm_page *));
-	wmap->pgs = NULL;
-#if 0
+	if (wmap->maxpages > 0) {
+		kmem_free(wmap->pgs, wmap->maxpages * sizeof(struct vm_page *));
+		wmap->pgs = NULL;
+		wmap->maxpages = 0;
+	}
+
 	wmap->npages = 0;
 	wmap->pos = 0;
 	wmap->cnt = 0;
-#endif
 }
 
 /*
@@ -681,20 +693,18 @@ pipe_direct_write(file_t *fp, struct pip
 {
 	struct pipemapping * const wmap = &wpipe->pipe_map;
 	kmutex_t * const lock = wpipe->pipe_lock;
-	struct vm_page **pgs;
 	vaddr_t bbase, base, bend;
 	vsize_t blen, bcnt;
 	int error, npages;
 	voff_t bpos;
-	u_int starting_color;
 
-	KASSERT(mutex_owned(wpipe->pipe_lock));
+	KASSERT(mutex_owned(lock));
 	KASSERT(wmap->cnt == 0);
 
 	mutex_exit(lock);
 
 	/*
-	 * Handle first PIPE_CHUNK_SIZE bytes of buffer. Deal with buffers
+	 * Handle first PIPE_DIRECT_CHUNK bytes of buffer. Deal with buffers
 	 * not aligned to PAGE_SIZE.
 	 */
 	bbase = (vaddr_t)uio->uio_iov->iov_base;
@@ -711,43 +721,28 @@ pipe_direct_write(file_t *fp, struct pip
 		bcnt = uio->uio_iov->iov_len;
 	}
 	npages = atop(blen);
-	starting_color = atop(base) & uvmexp.colormask;
 
-	/*
-	 * Free the old kva if we need more pages than we have
-	 * allocated.
-	 */
-	if (wmap->kva != 0 && starting_color + npages > wmap->npages)
-		pipe_loan_free(wpipe);
+	KASSERT((wpipe->pipe_state & (PIPE_DIRECTW | PIPE_DIRECTR)) == 0);
+	KASSERT(wmap->npages == 0);
 
-	/* Allocate new kva. */
-	if (wmap->kva == 0) {
-		error = pipe_loan_alloc(wpipe, starting_color + npages);
-		if (error) {
-			mutex_enter(lock);
-			return (error);
-		}
+	/* Make sure page array is big enough */
+	error = pipe_loan_alloc(wpipe, npages);
+	if (error) {
+		mutex_enter(lock);
+		return (error);
 	}
 
 	/* Loan the write buffer memory from writer process */
-	pgs = wmap->pgs + starting_color;
 	error = uvm_loan(&uio->uio_vmspace->vm_map, base, blen,
-			 pgs, UVM_LOAN_TOPAGE);
+			 wmap->pgs, UVM_LOAN_TOPAGE);
 	if (error) {
 		pipe_loan_free(wpipe);
 		mutex_enter(lock);
 		return (ENOMEM); /* so that caller fallback to ordinary write */
 	}
 
- 	/* Enter the loaned pages to kva */
- 	vaddr_t kva = wpipe->pipe_map.kva;
- 	for (int j = 0; j < npages; j++, kva += PAGE_SIZE) {
- 		pmap_kenter_pa(kva, VM_PAGE_TO_PHYS(pgs[j]), VM_PROT_READ, 0);
- 	}
- 	pmap_update(pmap_kernel());
-
 	/* Now we can put the pipe in direct write mode */
-	wmap->pos = bpos + ptoa(starting_color);
+	wmap->pos = bpos;
 	wmap->cnt = bcnt;
 
 	/*
@@ -783,17 +778,13 @@ pipe_direct_write(file_t *fp, struct pip
 
 	/* Acquire the pipe lock and cleanup */
 	(void)pipelock(wpipe, false);
-	mutex_exit(lock);
-
-	if (pgs != NULL) {
-		pmap_kremove(wpipe->pipe_map.kva, blen);
-		pmap_update(pmap_kernel());
-		uvm_unloan(pgs, npages, UVM_LOAN_TOPAGE);
-	}
-	if (error || amountpipekva > maxpipekva)
-		pipe_loan_free(wpipe);
 
+	mutex_exit(lock);
+	/* XXX what happens if the writer process exits without waiting for reader?
+	 * XXX FreeBSD does a clone in this case */
+	uvm_unloan(wmap->pgs, npages, UVM_LOAN_TOPAGE);
 	mutex_enter(lock);
+
 	if (error) {
 		pipeselwakeup(wpipe, wpipe, POLL_ERR);
 
@@ -914,7 +905,7 @@ pipe_write(file_t *fp, off_t *offset, st
 		 */
 		if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
 		    (fp->f_flag & FNONBLOCK) == 0 &&
-		    (wpipe->pipe_map.kva || (amountpipekva < limitpipekva))) {
+		    pipe_direct) {
 			error = pipe_direct_write(fp, wpipe, uio);
 
 			/*
@@ -1271,12 +1262,8 @@ pipe_free_kmem(struct pipe *pipe)
 		pipe->pipe_buffer.buffer = NULL;
 	}
 #ifndef PIPE_NODIRECT
-	if (pipe->pipe_map.kva != 0) {
+	if (pipe->pipe_map.npages > 0)
 		pipe_loan_free(pipe);
-		pipe->pipe_map.cnt = 0;
-		pipe->pipe_map.pos = 0;
-		pipe->pipe_map.npages = 0;
-	}
 #endif /* !PIPE_NODIRECT */
 }
 
@@ -1509,19 +1496,6 @@ SYSCTL_SETUP(sysctl_kern_pipe_setup, "sy
 
 	sysctl_createv(clog, 0, NULL, NULL,
 		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
-		       CTLTYPE_INT, "maxkvasz",
-		       SYSCTL_DESCR("Maximum amount of kernel memory to be "
-				    "used for pipes"),
-		       NULL, 0, &maxpipekva, 0,
-		       CTL_KERN, KERN_PIPE, KERN_PIPE_MAXKVASZ, CTL_EOL);
-	sysctl_createv(clog, 0, NULL, NULL,
-		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
-		       CTLTYPE_INT, "maxloankvasz",
-		       SYSCTL_DESCR("Limit for direct transfers via page loan"),
-		       NULL, 0, &limitpipekva, 0,
-		       CTL_KERN, KERN_PIPE, KERN_PIPE_LIMITKVA, CTL_EOL);
-	sysctl_createv(clog, 0, NULL, NULL,
-		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
 		       CTLTYPE_INT, "maxbigpipes",
 		       SYSCTL_DESCR("Maximum number of \"big\" pipes"),
 		       NULL, 0, &maxbigpipes, 0,

Index: src/sys/sys/pipe.h
diff -u src/sys/sys/pipe.h:1.34 src/sys/sys/pipe.h:1.35
--- src/sys/sys/pipe.h:1.34	Sat May 19 11:39:37 2018
+++ src/sys/sys/pipe.h	Sun Jun 10 17:54:51 2018
@@ -1,4 +1,4 @@
-/* $NetBSD: pipe.h,v 1.34 2018/05/19 11:39:37 jdolecek Exp $ */
+/* $NetBSD: pipe.h,v 1.35 2018/06/10 17:54:51 jdolecek Exp $ */
 
 /*
  * Copyright (c) 1996 John S. Dyson
@@ -43,10 +43,9 @@
 #endif
 
 /*
- * Maximum size of kva for direct write transfer. If the amount
+ * Maximum size of transfer for direct write transfer. If the amount
  * of data in buffer is larger, it would be transferred in chunks of this
- * size. This kva memory is freed after use if amount of pipe kva memory
- * is bigger than limitpipekva.
+ * size.
  */
 #ifndef PIPE_DIRECT_CHUNK
 #define PIPE_DIRECT_CHUNK	(1*1024*1024)
@@ -77,10 +76,10 @@ struct pipebuf {
  * Information to support direct transfers between processes for pipes.
  */
 struct pipemapping {
-	vaddr_t		kva;		/* kernel virtual address */
 	vsize_t		cnt;		/* number of chars in buffer */
 	voff_t		pos;		/* current position within page */
-	int		npages;		/* how many pages allocated */
+	u_int		npages;		/* how many pages available */
+	u_int		maxpages;	/* how many pages allocated */
 	struct vm_page	**pgs;		/* pointers to the pages */
 };
 
@@ -124,8 +123,8 @@ struct pipe {
 /*
  * KERN_PIPE subtypes
  */
-#define	KERN_PIPE_MAXKVASZ		1	/* maximum kva size */
-#define	KERN_PIPE_LIMITKVA		2	/* */
+#define	KERN_PIPE_MAXKVASZ		1	/* maximum kva size (obsolete) */
+#define	KERN_PIPE_LIMITKVA		2	/* limit kva for laons (obsolete) */
 #define	KERN_PIPE_MAXBIGPIPES		3	/* maximum # of "big" pipes */
 #define	KERN_PIPE_NBIGPIPES		4	/* current number of "big" p. */
 #define	KERN_PIPE_KVASIZE		5	/* current pipe kva size */

Reply via email to