Several changes, developed and tested concurrently:

* Provide POSIX 1003.1b mlockall(2) and munlockall(2) system calls.
  MCL_CURRENT is presently implemented.  MCL_FUTURE is not fully
  implemented.  Also, the same one-unlock-for-every-lock caveat
  currently applies here as it does to mlock(2).  This will be
  addressed in a future commit.
* Provide the mincore(2) system call, with the same semantics as
  Solaris.
* Clean up the error recovery in uvm_map_pageable().
* Fix a bug where a process would hang if attempting to mlock a
  zero-fill region where none of the pages in that region are resident.
  [ This fix has been submitted for inclusion in 1.4.1 ]
Author: thorpej
Date:   1999-06-15 23:27:47 +00:00
Commit: c5a43ae10c (parent 10b0c75443)
5 changed files with 463 additions and 35 deletions
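
For context (not part of the commit), a minimal userland sketch of the new POSIX 1003.1b interface described above. It assumes the process's RLIMIT_MEMLOCK limit (or superuser privilege, on ports without pmap_wired_count) permits the wiring; both calls return 0 on success and -1 with errno set on failure.

#include <sys/mman.h>
#include <stdio.h>

int
main(void)
{
	/*
	 * Lock every page currently mapped into the process.  Per the
	 * commit message, MCL_FUTURE is not yet fully implemented, so
	 * only MCL_CURRENT is requested here.
	 */
	if (mlockall(MCL_CURRENT) == -1) {
		perror("mlockall");
		return (1);
	}

	/* ... latency-sensitive work runs here without page faults ... */

	if (munlockall() == -1) {
		perror("munlockall");
		return (1);
	}
	return (0);
}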

File: mman.h

@@ -1,4 +1,4 @@
/* $NetBSD: mman.h,v 1.21 1999/04/27 20:13:06 cgd Exp $ */
/* $NetBSD: mman.h,v 1.22 1999/06/15 23:27:48 thorpej Exp $ */
/*-
* Copyright (c) 1982, 1986, 1993
@@ -91,6 +91,12 @@ typedef _BSD_SIZE_T_ size_t;
#define MS_INVALIDATE 0x02 /* invalidate cached data */
#define MS_SYNC 0x04 /* perform synchronous writes */
/*
* Flags to mlockall
*/
#define MCL_CURRENT 0x01 /* lock all pages currently mapped */
#define MCL_FUTURE 0x02 /* lock all pages mapped in the future */
#if !defined(_POSIX_C_SOURCE) && !defined(_XOPEN_SOURCE)
/*
* Advice to madvise
@@ -119,8 +125,11 @@ int msync __P((void *, size_t, int)) __RENAME(__msync13);
#endif
int mlock __P((const void *, size_t));
int munlock __P((const void *, size_t));
int mlockall __P((int));
int munlockall __P((void));
#if !defined(_POSIX_C_SOURCE) && !defined(_XOPEN_SOURCE)
int madvise __P((void *, size_t, int));
int mincore __P((void *, size_t, char *));
int minherit __P((void *, size_t, int));
#endif
__END_DECLS
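
A usage illustration (not part of the diff): sys_mlockall(), added later in this commit, rejects a zero flags argument and any bits outside MCL_CURRENT|MCL_FUTURE with EINVAL. A hypothetical test of that validation, which makes no assumption about whether the locking itself would succeed:

#include <sys/mman.h>
#include <errno.h>
#include <stdio.h>

int
main(void)
{
	/*
	 * Both calls should fail flag validation before any wiring is
	 * attempted; 0x10 is simply a bit outside MCL_CURRENT|MCL_FUTURE.
	 */
	if (mlockall(0) == -1 && errno == EINVAL)
		printf("mlockall(0): EINVAL, as expected\n");
	if (mlockall(MCL_CURRENT | 0x10) == -1 && errno == EINVAL)
		printf("mlockall(MCL_CURRENT|0x10): EINVAL, as expected\n");
	return (0);
}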

File: uvm_extern.h

@@ -1,4 +1,4 @@
/* $NetBSD: uvm_extern.h,v 1.27 1999/05/26 19:16:36 thorpej Exp $ */
/* $NetBSD: uvm_extern.h,v 1.28 1999/06/15 23:27:47 thorpej Exp $ */
/*
*
@@ -319,6 +319,7 @@ int uvm_map __P((vm_map_t, vaddr_t *, vsize_t,
struct uvm_object *, vaddr_t, uvm_flag_t));
int uvm_map_pageable __P((vm_map_t, vaddr_t,
vaddr_t, boolean_t));
int uvm_map_pageable_all __P((vm_map_t, int, vsize_t));
boolean_t uvm_map_checkprot __P((vm_map_t, vaddr_t,
vaddr_t, vm_prot_t));
int uvm_map_protect __P((vm_map_t, vaddr_t,

File: uvm_map.c

@@ -1,4 +1,4 @@
/* $NetBSD: uvm_map.c,v 1.53 1999/06/07 16:31:42 thorpej Exp $ */
/* $NetBSD: uvm_map.c,v 1.54 1999/06/15 23:27:47 thorpej Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
@@ -1990,8 +1990,7 @@ uvm_map_pageable(map, start, end, new_pageable)
vaddr_t start, end;
boolean_t new_pageable;
{
vm_map_entry_t entry, start_entry;
vaddr_t failed = 0;
vm_map_entry_t entry, start_entry, failed_entry;
int rv;
UVMHIST_FUNC("uvm_map_pageable"); UVMHIST_CALLED(maphist);
UVMHIST_LOG(maphist,"(map=0x%x,start=0x%x,end=0x%x,new_pageable=0x%x)",
@@ -2025,7 +2024,7 @@ uvm_map_pageable(map, start, end, new_pageable)
* handle wiring and unwiring seperately.
*/
if (new_pageable) { /* unwire */
if (new_pageable) { /* unwire */
UVM_MAP_CLIP_START(map, entry, start);
@@ -2060,11 +2059,9 @@ uvm_map_pageable(map, start, end, new_pageable)
entry = start_entry;
while ((entry != &map->header) && (entry->start < end)) {
UVM_MAP_CLIP_END(map, entry, end);
entry->wired_count--;
if (entry->wired_count == 0)
uvm_map_entry_unwire(map, entry);
entry = entry->next;
}
vm_map_unlock(map);
@@ -2100,7 +2097,7 @@ uvm_map_pageable(map, start, end, new_pageable)
while ((entry != &map->header) && (entry->start < end)) {
if (entry->wired_count == 0) { /* not already wired? */
/*
* perform actions of vm_map_lookup that need the
* write lock on the map: create an anonymous map
@@ -2108,22 +2105,17 @@ uvm_map_pageable(map, start, end, new_pageable)
* for a zero-fill region. (XXXCDC: submap case
* ok?)
*/
if (!UVM_ET_ISSUBMAP(entry)) { /* not submap */
/*
* XXXCDC: protection vs. max_protection??
* (wirefault uses max?)
* XXXCDC: used to do it always if
* uvm_obj == NULL (wrong?)
*/
if ( UVM_ET_ISNEEDSCOPY(entry) &&
(entry->protection & VM_PROT_WRITE) != 0) {
if (UVM_ET_ISNEEDSCOPY(entry) &&
((entry->protection & VM_PROT_WRITE) ||
(entry->object.uvm_obj == NULL))) {
amap_copy(map, entry, M_WAITOK, TRUE,
start, end);
/* XXXCDC: wait OK? */
}
}
} /* wired_count == 0 */
} /* wired_count == 0 */
UVM_MAP_CLIP_START(map, entry, start);
UVM_MAP_CLIP_END(map, entry, end);
entry->wired_count++;
@@ -2131,8 +2123,10 @@ uvm_map_pageable(map, start, end, new_pageable)
/*
* Check for holes
*/
if (entry->end < end && (entry->next == &map->header ||
entry->next->start > entry->end)) {
if (entry->protection == VM_PROT_NONE ||
(entry->end < end &&
(entry->next == &map->header ||
entry->next->start > entry->end))) {
/*
* found one. amap creation actions do not need to
* be undone, but the wired counts need to be restored.
@@ -2182,16 +2176,24 @@ uvm_map_pageable(map, start, end, new_pageable)
* first drop the wiring count on all the entries
* which haven't actually been wired yet.
*/
failed = entry->start;
while (entry != &map->header && entry->start < end)
failed_entry = entry;
while (entry != &map->header && entry->start < end) {
entry->wired_count--;
entry = entry->next;
}
/*
* now, unlock the map, and unwire all the pages that
* were successfully wired above.
* now, unwire all the entries that were successfully
* wired above.
*/
entry = start_entry;
while (entry != failed_entry) {
entry->wired_count--;
if (entry->wired_count == 0)
uvm_map_entry_unwire(map, entry);
entry = entry->next;
}
vm_map_unlock(map);
(void) uvm_map_pageable(map, start, failed, TRUE);
UVMHIST_LOG(maphist, "<- done (RV=%d)", rv,0,0,0);
return(rv);
}
@@ -2203,6 +2205,214 @@ uvm_map_pageable(map, start, end, new_pageable)
return(KERN_SUCCESS);
}
/*
* uvm_map_pageable_all: special case of uvm_map_pageable - affects
* all mapped regions.
*
* => map must not be locked.
* => if no flags are specified, all regions are unwired.
* => XXXJRT: has some of the same problems as uvm_map_pageable() above.
*/
int
uvm_map_pageable_all(map, flags, limit)
vm_map_t map;
int flags;
vsize_t limit;
{
vm_map_entry_t entry, failed_entry;
vsize_t size;
int rv;
UVMHIST_FUNC("uvm_map_pageable_all"); UVMHIST_CALLED(maphist);
UVMHIST_LOG(maphist,"(map=0x%x,flags=0x%x)", map, flags, 0, 0);
#ifdef DIAGNOSTIC
if ((map->flags & VM_MAP_PAGEABLE) == 0)
panic("uvm_map_pageable_all: map %p not pageable", map);
#endif
vm_map_lock(map);
/*
* handle wiring and unwiring separately.
*/
if (flags == 0) { /* unwire */
/*
* Decrement the wiring count on the entries. If they
* reach zero, unwire them.
*
* Note, uvm_fault_unwire() (called via uvm_map_entry_unwire())
* does not lock the map, so we don't have to do anything
* special regarding locking here.
*/
for (entry = map->header.next; entry != &map->header;
entry = entry->next) {
if (entry->wired_count) {
if (--entry->wired_count == 0)
uvm_map_entry_unwire(map, entry);
}
}
map->flags &= ~VM_MAP_WIREFUTURE;
vm_map_unlock(map);
UVMHIST_LOG(maphist,"<- done (OK UNWIRE)",0,0,0,0);
return (KERN_SUCCESS);
/*
* end of unwire case!
*/
}
if (flags & MCL_FUTURE) {
/*
* must wire all future mappings; remember this.
*/
map->flags |= VM_MAP_WIREFUTURE;
}
if ((flags & MCL_CURRENT) == 0) {
/*
* no more work to do!
*/
UVMHIST_LOG(maphist,"<- done (OK no wire)",0,0,0,0);
vm_map_unlock(map);
return (KERN_SUCCESS);
}
/*
* wire case: in three passes [XXXCDC: ugly block of code here]
*
* 1: holding the write lock, count all pages mapped by non-wired
* entries. if this would cause us to go over our limit, we fail.
*
* 2: still holding the write lock, we create any anonymous maps that
* need to be created. then we increment its wiring count.
*
* 3: we downgrade to a read lock, and call uvm_fault_wire to fault
* in the pages for any newly wired area (wired count is 1).
*
* downgrading to a read lock for uvm_fault_wire avoids a possible
* deadlock with another thread that may have faulted on one of
* the pages to be wired (it would mark the page busy, blocking
* us, then in turn block on the map lock that we hold). because
* of problems in the recursive lock package, we cannot upgrade
* to a write lock in vm_map_lookup. thus, any actions that
* require the write lock must be done beforehand. because we
* keep the read lock on the map, the copy-on-write status of the
* entries we modify here cannot change.
*/
for (size = 0, entry = map->header.next; entry != &map->header;
entry = entry->next) {
if (entry->protection != VM_PROT_NONE &&
entry->wired_count == 0) { /* not already wired? */
size += entry->end - entry->start;
}
}
if (atop(size) + uvmexp.wired > uvmexp.wiredmax) {
vm_map_unlock(map);
return (KERN_NO_SPACE); /* XXX overloaded */
}
/* XXX non-pmap_wired_count case must be handled by caller */
#ifdef pmap_wired_count
if (limit != 0 &&
(size + ptoa(pmap_wired_count(vm_map_pmap(map))) > limit)) {
vm_map_unlock(map);
return (KERN_NO_SPACE); /* XXX overloaded */
}
#endif
/*
* Pass 2.
*/
for (entry = map->header.next; entry != &map->header;
entry = entry->next) {
if (entry->protection == VM_PROT_NONE)
continue;
if (entry->wired_count == 0) { /* not already wired? */
/*
* perform actions of vm_map_lookup that need the
* write lock on the map: create an anonymous map
* for a copy-on-write region, or an anonymous map
* for a zero-fill region. (XXXCDC: submap case
* ok?)
*/
if (!UVM_ET_ISSUBMAP(entry)) { /* not submap */
if (UVM_ET_ISNEEDSCOPY(entry) &&
((entry->protection & VM_PROT_WRITE) ||
(entry->object.uvm_obj == NULL))) {
amap_copy(map, entry, M_WAITOK, TRUE,
entry->start, entry->end);
/* XXXCDC: wait OK? */
}
}
} /* wired_count == 0 */
entry->wired_count++;
}
/*
* Pass 3.
*/
vm_map_downgrade(map);
rv = KERN_SUCCESS;
for (entry = map->header.next; entry != &map->header;
entry = entry->next) {
if (entry->wired_count == 1) {
rv = uvm_fault_wire(map, entry->start, entry->end,
entry->protection);
if (rv) {
/*
* wiring failed. break out of the loop.
* we'll clean up the map below, once we
* have a write lock again.
*/
break;
}
}
}
if (rv) { /* failed? */
/*
* Get back an exclusive (write) lock.
*/
vm_map_upgrade(map);
/*
* first drop the wiring count on all the entries
* which haven't actually been wired yet.
*/
failed_entry = entry;
for (/* nothing */; entry != &map->header;
entry = entry->next)
entry->wired_count--;
/*
* now, unwire all the entries that were successfully
* wired above.
*/
for (entry = map->header.next; entry != failed_entry;
entry = entry->next) {
entry->wired_count--;
if (entry->wired_count == 0)
uvm_map_entry_unwire(map, entry);
}
vm_map_unlock(map);
UVMHIST_LOG(maphist,"<- done (RV=%d)", rv,0,0,0);
return (rv);
}
/* We are holding a read lock here. */
vm_map_unlock_read(map);
UVMHIST_LOG(maphist,"<- done (OK WIRE)",0,0,0,0);
return (KERN_SUCCESS);
}
/*
* uvm_map_clean: push dirty pages off to backing store.
*
@@ -2480,6 +2690,14 @@ uvmspace_exec(p)
shmexit(ovm);
#endif
/*
* POSIX 1003.1b -- "lock future mappings" is revoked
* when a process execs another program image.
*/
vm_map_lock(map);
map->flags &= ~VM_MAP_WIREFUTURE;
vm_map_unlock(map);
/*
* now unmap the old program
*/
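
The hang fix called out in the commit message corresponds to the amap_copy() change above, which now also covers zero-fill regions (uvm_obj == NULL). A minimal userland sketch of the previously hanging case (not part of the commit; it assumes RLIMIT_MEMLOCK or privilege allows mlock(2) on the region):

#include <sys/types.h>
#include <sys/mman.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	size_t len = 4 * (size_t)sysconf(_SC_PAGESIZE);
	void *p;

	/* A fresh anonymous (zero-fill) mapping: none of its pages
	 * are resident yet. */
	p = mmap(NULL, len, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE, -1, (off_t)0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return (1);
	}

	/* Per the commit message, wiring such a region used to hang
	 * the process; with this change the pages are faulted in and
	 * mlock() returns. */
	if (mlock(p, len) == -1) {
		perror("mlock");
		return (1);
	}

	(void)munlock(p, len);
	(void)munmap(p, len);
	return (0);
}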

File: uvm_mmap.c

@@ -1,4 +1,4 @@
/* $NetBSD: uvm_mmap.c,v 1.21 1999/05/23 06:27:13 mrg Exp $ */
/* $NetBSD: uvm_mmap.c,v 1.22 1999/06/15 23:27:47 thorpej Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
@@ -130,15 +130,140 @@ sys_mincore(p, v, retval)
void *v;
register_t *retval;
{
#if 0
struct sys_mincore_args /* {
syscallarg(caddr_t) addr;
syscallarg(void *) addr;
syscallarg(size_t) len;
syscallarg(char *) vec;
} */ *uap = v;
#endif
vm_page_t m;
char *vec, pgi;
struct uvm_object *uobj;
struct vm_amap *amap;
struct vm_anon *anon;
vm_map_entry_t entry;
vaddr_t start, end, lim;
vm_map_t map;
vsize_t len;
int error = 0, npgs;
return (ENOSYS);
map = &p->p_vmspace->vm_map;
start = (vaddr_t)SCARG(uap, addr);
len = SCARG(uap, len);
vec = SCARG(uap, vec);
if (start & PAGE_MASK)
return (EINVAL);
len = round_page(len);
end = start + len;
if (end <= start)
return (EINVAL);
npgs = len >> PAGE_SHIFT;
if (uvm_useracc(vec, npgs, B_WRITE) == FALSE)
return (EFAULT);
/*
* Lock down vec, so our returned status isn't outdated by
* storing the status byte for a page.
*/
uvm_vslock(p, vec, npgs, VM_PROT_WRITE);
vm_map_lock_read(map);
if (uvm_map_lookup_entry(map, start, &entry) == FALSE) {
error = ENOMEM;
goto out;
}
for (/* nothing */;
entry != &map->header && entry->start < end;
entry = entry->next) {
#ifdef DIAGNOSTIC
if (UVM_ET_ISSUBMAP(entry))
panic("mincore: user map has submap");
if (start < entry->start)
panic("mincore: hole");
#endif
/* Make sure there are no holes. */
if (entry->end < end &&
(entry->next == &map->header ||
entry->next->start > entry->end)) {
error = ENOMEM;
goto out;
}
lim = end < entry->end ? end : entry->end;
/*
* Special case for mapped devices; these are always
* considered resident.
*/
if (UVM_ET_ISOBJ(entry)) {
extern struct uvm_pagerops uvm_deviceops; /* XXX */
#ifdef DIAGNOSTIC
if (UVM_OBJ_IS_KERN_OBJECT(entry->object.uvm_obj))
panic("mincore: user map has kernel object");
#endif
if (entry->object.uvm_obj->pgops == &uvm_deviceops) {
for (/* nothing */; start < lim;
start += PAGE_SIZE, vec++)
subyte(vec, 1);
continue;
}
}
uobj = entry->object.uvm_obj; /* top layer */
amap = entry->aref.ar_amap; /* bottom layer */
if (amap != NULL)
amap_lock(amap);
if (uobj != NULL)
simple_lock(&uobj->vmobjlock);
for (/* nothing */; start < lim; start += PAGE_SIZE, vec++) {
pgi = 0;
if (amap != NULL) {
/* Check the top layer first. */
anon = amap_lookup(&entry->aref,
start - entry->start);
/* Don't need to lock anon here. */
if (anon != NULL && anon->u.an_page != NULL) {
/*
* Anon has the page for this entry
* offset.
*/
pgi = 1;
}
}
if (uobj != NULL && pgi == 0) {
/* Check the bottom layer. */
m = uvm_pagelookup(uobj,
entry->offset + (start - entry->start));
if (m != NULL) {
/*
* Object has the page for this entry
* offset.
*/
pgi = 1;
}
}
(void) subyte(vec, pgi);
}
if (uobj != NULL)
simple_unlock(&uobj->vmobjlock);
if (amap != NULL)
amap_unlock(amap);
}
out:
vm_map_unlock_read(map);
uvm_vsunlock(p, SCARG(uap, vec), npgs);
return (error);
}
#if 0
@@ -816,6 +941,73 @@ sys_munlock(p, v, retval)
return (error == KERN_SUCCESS ? 0 : ENOMEM);
}
/*
* sys_mlockall: lock all pages mapped into an address space.
*/
int
sys_mlockall(p, v, retval)
struct proc *p;
void *v;
register_t *retval;
{
struct sys_mlockall_args /* {
syscallarg(int) flags;
} */ *uap = v;
vsize_t limit;
int error, flags;
flags = SCARG(uap, flags);
if (flags == 0 ||
(flags & ~(MCL_CURRENT|MCL_FUTURE)) != 0)
return (EINVAL);
#ifdef pmap_wired_count
/* Actually checked in uvm_map_pageable_all() */
limit = p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur;
#else
limit = 0;
if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
return (error);
#endif
error = uvm_map_pageable_all(&p->p_vmspace->vm_map, flags, limit);
switch (error) {
case KERN_SUCCESS:
error = 0;
break;
case KERN_NO_SPACE: /* XXX overloaded */
error = ENOMEM;
break;
default:
/*
* "Some or all of the memory could not be locked when
* the call was made."
*/
error = EAGAIN;
}
return (error);
}
/*
* sys_munlockall: unlock all pages mapped into an address space.
*/
int
sys_munlockall(p, v, retval)
struct proc *p;
void *v;
register_t *retval;
{
(void) uvm_map_pageable_all(&p->p_vmspace->vm_map, 0, 0);
return (0);
}
/*
* uvm_mmap: internal version of mmap
*
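
A minimal userland sketch of the new mincore(2) call (not part of the commit). Matching the Solaris semantics mentioned in the commit message and the sys_mincore() loop above, it reports one status byte per page, nonzero meaning the page is resident:

#include <sys/types.h>
#include <sys/mman.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	size_t pgsz = (size_t)sysconf(_SC_PAGESIZE);
	size_t npgs = 8, i, resident = 0;
	size_t len = npgs * pgsz;
	char *p, *vec;

	p = mmap(NULL, len, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE, -1, (off_t)0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return (1);
	}
	/* Touch the first half so those pages become resident. */
	memset(p, 0xa5, len / 2);

	vec = malloc(npgs);
	if (vec == NULL)
		return (1);

	/* One status byte per page; nonzero means resident. */
	if (mincore(p, len, vec) == -1) {
		perror("mincore");
		return (1);
	}
	for (i = 0; i < npgs; i++)
		if (vec[i])
			resident++;
	printf("%lu of %lu pages resident\n",
	    (unsigned long)resident, (unsigned long)npgs);
	return (0);
}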

File: vm_map.h

@@ -1,4 +1,4 @@
/* $NetBSD: vm_map.h,v 1.29 1999/06/07 16:34:04 thorpej Exp $ */
/* $NetBSD: vm_map.h,v 1.30 1999/06/15 23:27:48 thorpej Exp $ */
/*
* Copyright (c) 1991, 1993
@@ -140,15 +140,23 @@ struct vm_map {
vm_map_entry_t hint; /* hint for quick lookups */
simple_lock_data_t hint_lock; /* lock for hint storage */
vm_map_entry_t first_free; /* First free space hint */
int flags; /* flags (read-only) */
/*
* Locking note: read-only flags need not be locked to read
* them; they are set once at map creation time, and never
* changed again. Only read-write flags require that the
* appropriate map lock be acquired before reading or writing
* the flag.
*/
int flags; /* flags */
unsigned int timestamp; /* Version number */
#define min_offset header.start
#define max_offset header.end
};
/* vm_map flags */
#define VM_MAP_PAGEABLE 0x01 /* entries are pageable */
#define VM_MAP_INTRSAFE 0x02 /* interrupt safe map */
#define VM_MAP_PAGEABLE 0x01 /* ro: entries are pageable */
#define VM_MAP_INTRSAFE 0x02 /* ro: interrupt safe map */
#define VM_MAP_WIREFUTURE 0x04 /* rw: wire future mappings */
/*
* Interrupt-safe maps must also be kept on a special list,