Index: kern/vfs_bio.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/vfs_bio.c,v
retrieving revision 1.242.2.17
diff -u -r1.242.2.17 vfs_bio.c
--- kern/vfs_bio.c	29 Jun 2002 16:38:27 -0000	1.242.2.17
+++ kern/vfs_bio.c	6 Sep 2002 21:41:06 -0000
@@ -442,11 +442,13 @@
 {
 	if (bp->b_kvasize) {
 		++buffreekvacnt;
+		vm_map_lock(buffer_map);
 		bufspace -= bp->b_kvasize;
 		vm_map_delete(buffer_map,
 		    (vm_offset_t) bp->b_kvabase,
 		    (vm_offset_t) bp->b_kvabase + bp->b_kvasize
 		);
+		vm_map_unlock(buffer_map);
 		bp->b_kvasize = 0;
 		bufspacewakeup();
 	}
@@ -1740,12 +1742,15 @@
 
 			bfreekva(bp);
 
+			vm_map_lock(buffer_map);
+
 			if (vm_map_findspace(buffer_map,
 				vm_map_min(buffer_map), maxsize, &addr)) {
 				/*
 				 * Uh oh.  Buffer map is to fragmented.  We
 				 * must defragment the map.
 				 */
+				vm_map_unlock(buffer_map);
 				++bufdefragcnt;
 				defrag = 1;
 				bp->b_flags |= B_INVAL;
@@ -1762,6 +1767,7 @@
 				bufspace += bp->b_kvasize;
 				++bufreusecnt;
 			}
+			vm_map_unlock(buffer_map);
 		}
 		bp->b_data = bp->b_kvabase;
 	}
Index: sys/vmmeter.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/vmmeter.h,v
retrieving revision 1.21.2.1
diff -u -r1.21.2.1 vmmeter.h
--- sys/vmmeter.h	18 Feb 2001 15:41:11 -0000	1.21.2.1
+++ sys/vmmeter.h	8 Sep 2002 01:29:22 -0000
@@ -103,6 +103,8 @@
 	u_int v_vforkpages;	/* number of VM pages affected by vfork() */
 	u_int v_rforkpages;	/* number of VM pages affected by rfork() */
 	u_int v_kthreadpages;	/* number of VM pages affected by fork() by kernel */
+	u_int v_intrans_coll;	/* intransit map collisions (total) */
+	u_int v_intrans_wait;	/* intransit map collisions which blocked */
 };
 #ifdef _KERNEL
 
Index: vm/vm_map.c
===================================================================
RCS file: /home/ncvs/src/sys/vm/vm_map.c,v
retrieving revision 1.187.2.14
diff -u -r1.187.2.14 vm_map.c
--- vm/vm_map.c	2 Jul 2002 20:06:18 -0000	1.187.2.14
+++ vm/vm_map.c	8 Sep 2002 19:38:39 -0000
@@ -146,6 +146,7 @@
 static void vm_map_copy_entry __P((vm_map_t, vm_map_t, vm_map_entry_t,
 		vm_map_entry_t));
 static void vm_map_split __P((vm_map_entry_t));
+static void vm_map_unclip_range __P((vm_map_t map, vm_map_entry_t start_entry, vm_offset_t start, vm_offset_t end, int flags));
 
 void
 vm_map_startup()
@@ -355,9 +356,13 @@
 vm_map_entry_unlink(vm_map_t map,
 		    vm_map_entry_t entry)
 {
-	vm_map_entry_t prev = entry->prev;
-	vm_map_entry_t next = entry->next;
+	vm_map_entry_t prev;
+	vm_map_entry_t next;
 
+	if (entry->eflags & MAP_ENTRY_IN_TRANSITION)
+		panic("vm_map_entry_unlink: attempt to mess with locked entry! %p", entry);
+	prev = entry->prev;
+	next = entry->next;
 	next->prev = prev;
 	prev->next = next;
 	map->nentries--;
@@ -734,7 +739,8 @@
  *
  *	This routine guarentees that the passed entry remains valid (though
  *	possibly extended).  When merging, this routine may delete one or
- *	both neighbors.
+ *	both neighbors.  No action is taken on entries which have their
+ *	in-transition flag set.
  */
 void
 vm_map_simplify_entry(map, entry)
@@ -744,8 +750,10 @@
 	vm_map_entry_t next, prev;
 	vm_size_t prevsize, esize;
 
-	if (entry->eflags & MAP_ENTRY_IS_SUB_MAP)
+	if (entry->eflags & (MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_IS_SUB_MAP)) {
+		++cnt.v_intrans_coll;
 		return;
+	}
 
 	prev = entry->prev;
 	if (prev != &map->header) {
@@ -935,6 +943,208 @@
 		}
 
 /*
+ *	vm_map_transition_wait:	[ kernel use only ]
+ *
+ *	Used to block when an in-transition collision occurs.  The map
+ *	is unlocked for the sleep and relocked before the return.
+ */
+static
+void
+vm_map_transition_wait(vm_map_t map)
+{
+	vm_map_unlock(map);
+	tsleep(map, PVM, "vment", 0);	/* paired with wakeup(map) in vm_map_unclip_range() */
+	vm_map_lock(map);
+}
+
+/*
+ * CLIP_CHECK_BACK
+ * CLIP_CHECK_FWD
+ *
+ *	A blocking operation may allow our in-transit entry to be
+ *	clipped, requiring an adjustment to the entry in our loop.
+ *	These macros help the pageable and clip_range code deal with
+ *	the case; they cost virtually nothing if no clipping occurred.
+ *	Both expand to code that uses a variable named 'map'.
+ */
+
+#define CLIP_CHECK_BACK(entry, save_start)		\
+    do {						\
+	    while ((entry)->start != (save_start)) {	\
+		    (entry) = (entry)->prev;		\
+		    KASSERT((entry) != &map->header, ("bad entry clip")); \
+	    }						\
+    } while(0)
+
+#define CLIP_CHECK_FWD(entry, save_end)			\
+    do {						\
+	    while ((entry)->end != (save_end)) {	\
+		    (entry) = (entry)->next;		\
+		    KASSERT((entry) != &map->header, ("bad entry clip")); \
+	    }						\
+    } while(0)
+
+
+/*
+ *	vm_map_clip_range:	[ kernel use only ]
+ *
+ *	Clip the specified range and return the base entry.  The
+ *	range may cover several entries starting at the returned base
+ *	and the first and last entry in the covering sequence will be
+ *	properly clipped to the requested start and end address.
+ *
+ *	If no holes are allowed you should pass the MAP_CLIP_NO_HOLES
+ *	flag.
+ *
+ *	The MAP_ENTRY_IN_TRANSITION flag will be set for the entries
+ *	covered by the requested range.
+ *
+ *	The map must be exclusively locked on entry and will remain locked
+ *	on return. If no range exists or the range contains holes and you
+ *	specified that no holes were allowed, NULL will be returned (any
+ *	entries already marked are unclipped first).  This routine may
+ *	temporarily unlock the map in order to avoid a deadlock when sleeping.
+ */
+static
+vm_map_entry_t
+vm_map_clip_range(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags)
+{
+	vm_map_entry_t start_entry;
+	vm_map_entry_t entry;
+
+	/*
+	 * Locate the entry and effect initial clipping.  The in-transition
+	 * case does not occur very often so do not try to optimize it.
+	 */
+again:
+	if (vm_map_lookup_entry(map, start, &start_entry) == FALSE)
+		return (NULL);
+	entry = start_entry;
+	if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
+		entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
+		++cnt.v_intrans_coll;
+		++cnt.v_intrans_wait;
+		vm_map_transition_wait(map);
+		/*
+		 * entry and/or start_entry may have been clipped while
+		 * we slept, or may have gone away entirely.  We have
+		 * to restart from the lookup.
+		 */
+		goto again;
+	}
+	/*
+	 * Since we hold an exclusive map lock we do not have to restart
+	 * after clipping, even though clipping may block in zalloc.
+	 */
+	vm_map_clip_start(map, entry, start);
+	vm_map_clip_end(map, entry, end);
+	entry->eflags |= MAP_ENTRY_IN_TRANSITION;
+
+	/*
+	 * Scan entries covered by the range.  When working on the next
+	 * entry a restart need only re-loop on the current entry which
+	 * we have already locked, since 'next' may have changed.  Also,
+	 * even though entry is safe, it may have been clipped so we
+	 * have to iterate forwards through the clip after sleeping.
+	 */
+	while (entry->next != &map->header && entry->next->start < end) {
+		vm_map_entry_t next = entry->next;
+
+		if (flags & MAP_CLIP_NO_HOLES) {
+			if (next->start > entry->end) {
+				vm_map_unclip_range(map, start_entry,
+					start, entry->end, flags);
+				return(NULL);
+			}
+		}
+
+		if (next->eflags & MAP_ENTRY_IN_TRANSITION) {
+			vm_offset_t save_end = entry->end;
+			next->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
+			++cnt.v_intrans_coll;
+			++cnt.v_intrans_wait;
+			vm_map_transition_wait(map);
+
+			/*
+			 * clips might have occurred while we blocked.
+			 */
+			CLIP_CHECK_FWD(entry, save_end);
+			CLIP_CHECK_BACK(start_entry, start);
+			continue;
+		}
+		/*
+		 * No restart necessary even though clip_end may block, we
+		 * are holding the map lock.
+		 */
+		vm_map_clip_end(map, next, end);
+		next->eflags |= MAP_ENTRY_IN_TRANSITION;
+		entry = next;
+	}
+	if (flags & MAP_CLIP_NO_HOLES) {
+		if (entry->end != end) {
+			vm_map_unclip_range(map, start_entry,
+				start, entry->end, flags);
+			return(NULL);
+		}
+	}
+	return(start_entry);
+}
+
+/*
+ *	vm_map_unclip_range:	[ kernel use only ]
+ *
+ *	Reverse vm_map_clip_range(): every entry in [start, end) has its
+ *	in-transition flag cleared, sleepers on the map are woken if an
+ *	entry requested it, and each entry is then handed to
+ *	vm_map_simplify_entry() so the sequence can be re-merged.  You
+ *	should pass the same range and flags given to vm_map_clip_range().
+ *
+ *	The map must be locked on entry and will remain locked on return.
+ *
+ *	start_entry must be the entry whose start address equals 'start';
+ *	this is KASSERTed.  Normally it is the value vm_map_clip_range()
+ *	returned, but if the caller blocked with the map unlocked in
+ *	between, that entry may have been clipped and the caller is
+ *	responsible for scanning backwards to the entry that begins at
+ *	'start' before calling here.  'flags' is not examined here.
+ */
+static
+void
+vm_map_unclip_range(
+	vm_map_t map,
+	vm_map_entry_t start_entry,
+	vm_offset_t start,
+	vm_offset_t end,
+	int flags)
+{
+	vm_map_entry_t cur;
+
+	KASSERT(start_entry->start == start, ("unclip_range: illegal base entry"));
+
+	/*
+	 * Pass 1: release every entry in the range and wake any waiters.
+	 */
+	for (cur = start_entry; cur != &map->header && cur->start < end;
+	    cur = cur->next) {
+		KASSERT(cur->eflags & MAP_ENTRY_IN_TRANSITION, ("in-transition flag not set during unclip on: %p", cur));
+		KASSERT(cur->end <= end, ("unclip_range: tail wasn't clipped"));
+		cur->eflags &= ~MAP_ENTRY_IN_TRANSITION;
+		if ((cur->eflags & MAP_ENTRY_NEEDS_WAKEUP) == 0)
+			continue;
+		cur->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
+		wakeup(map);
+	}
+
+	/*
+	 * Pass 2: simplification does not block so there is no restart case.
+	 */
+	for (cur = start_entry; cur != &map->header && cur->start < end;
+	    cur = cur->next)
+		vm_map_simplify_entry(map, cur);
+}
+
+/*
  *	vm_map_submap:		[ kernel use only ]
  *
  *	Mark the given range as handled by a subordinate map.
@@ -968,8 +1178,9 @@
 
 	if (vm_map_lookup_entry(map, start, &entry)) {
 		vm_map_clip_start(map, entry, start);
-	} else
+	} else {
 		entry = entry->next;
+	}
 
 	vm_map_clip_end(map, entry, end);
 
@@ -1279,58 +1490,40 @@
  * Implement the semantics of mlock
  */
 int
-vm_map_user_pageable(map, start, end, new_pageable)
+vm_map_user_pageable(map, start, real_end, new_pageable)
 	vm_map_t map;
 	vm_offset_t start;
-	vm_offset_t end;
+	vm_offset_t real_end;
 	boolean_t new_pageable;
 {
 	vm_map_entry_t entry;
 	vm_map_entry_t start_entry;
-	vm_offset_t estart;
-	vm_offset_t eend;
-	int rv;
+	vm_offset_t end;
+	int rv = KERN_SUCCESS;
 
 	vm_map_lock(map);
-	VM_MAP_RANGE_CHECK(map, start, end);
+	VM_MAP_RANGE_CHECK(map, start, real_end);
+	end = real_end;
 
-	if (vm_map_lookup_entry(map, start, &start_entry) == FALSE) {
+	start_entry = vm_map_clip_range(map, start, end, MAP_CLIP_NO_HOLES);
+	if (start_entry == NULL) {
 		vm_map_unlock(map);
 		return (KERN_INVALID_ADDRESS);
 	}
 
-	if (new_pageable) {
-
+	if (new_pageable == 0) {
 		entry = start_entry;
-		vm_map_clip_start(map, entry, start);
-
-		/*
-		 * Now decrement the wiring count for each region. If a region
-		 * becomes completely unwired, unwire its physical pages and
-		 * mappings.
-		 */
-		while ((entry != &map->header) && (entry->start < end)) {
-			if (entry->eflags & MAP_ENTRY_USER_WIRED) {
-				vm_map_clip_end(map, entry, end);
-				entry->eflags &= ~MAP_ENTRY_USER_WIRED;
-				entry->wired_count--;
-				if (entry->wired_count == 0)
-					vm_fault_unwire(map, entry->start, entry->end);
-			}
-			vm_map_simplify_entry(map,entry);
-			entry = entry->next;
-		}
-	} else {
-
-		entry = start_entry;
-
 		while ((entry != &map->header) && (entry->start < end)) {
+			vm_offset_t save_start;
+			vm_offset_t save_end;
 
+			/*
+			 * Already user wired or hard wired (trivial cases)
+			 */
 			if (entry->eflags & MAP_ENTRY_USER_WIRED) {
 				entry = entry->next;
 				continue;
 			}
-			
 			if (entry->wired_count != 0) {
 				entry->wired_count++;
 				entry->eflags |= MAP_ENTRY_USER_WIRED;
@@ -1338,8 +1531,11 @@
 				continue;
 			}
 
-			/* Here on entry being newly wired */
-
+			/*
+			 * A new wiring requires instantiation of appropriate
+			 * management structures and the faulting in of the
+			 * page.
+			 */
 			if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
 				int copyflag = entry->eflags & MAP_ENTRY_NEEDS_COPY;
 				if (copyflag && ((entry->protection & VM_PROT_WRITE) != 0)) {
@@ -1359,65 +1555,98 @@
 
 				}
 			}
-
-			vm_map_clip_start(map, entry, start);
-			vm_map_clip_end(map, entry, end);
-
 			entry->wired_count++;
 			entry->eflags |= MAP_ENTRY_USER_WIRED;
-			estart = entry->start;
-			eend = entry->end;
 
-			/* First we need to allow map modifications */
-			vm_map_set_recursive(map);
-			vm_map_lock_downgrade(map);
+			/*
+			 * Now fault in the area.  The map lock needs to be
+			 * manipulated to avoid deadlocks.  The in-transition
+			 * flag protects the entries. 
+			 */
+			save_start = entry->start;
+			save_end = entry->end;
+			vm_map_unlock(map);
 			map->timestamp++;
-
-			rv = vm_fault_user_wire(map, entry->start, entry->end);
+			rv = vm_fault_user_wire(map, save_start, save_end);
+			vm_map_lock(map);
 			if (rv) {
+				CLIP_CHECK_BACK(entry, save_start);
+				for (;;) {
+					KASSERT(entry->wired_count == 1, ("bad wired_count on entry"));
+					entry->eflags &= ~MAP_ENTRY_USER_WIRED;
+					entry->wired_count = 0;
+					if (entry->end == save_end)
+						break;
+					entry = entry->next;
+					KASSERT(entry != &map->header, ("bad entry clip during backout"));
+				}
+				end = save_start;	/* unwire the rest */
+				break;
+			}
+			/*
+			 * note that even though the entry might have been
+			 * clipped, the USER_WIRED flag we set prevents
+			 * duplication so we do not have to do a 
+			 * clip check.
+			 */
+			entry = entry->next;
+		}
 
-				entry->wired_count--;
-				entry->eflags &= ~MAP_ENTRY_USER_WIRED;
+		/*
+		 * If we failed fall through to the unwiring section to
+		 * unwire what we had wired so far.  'end' has already
+		 * been adjusted.
+		 */
+		if (rv)
+			new_pageable = 1;
 
-				vm_map_clear_recursive(map);
-				vm_map_unlock(map);
-			
-				/*
-				 * At this point, the map is unlocked, and
-				 * entry might no longer be valid.  Use copy
-				 * of entry start value obtained while entry
-				 * was valid.
-				 */
-				(void) vm_map_user_pageable(map, start, estart,
-							    TRUE);
-				return rv;
-			}
-
-			vm_map_clear_recursive(map);
-			if (vm_map_lock_upgrade(map)) {
-				vm_map_lock(map);
-				if (vm_map_lookup_entry(map, estart, &entry) 
-				    == FALSE) {
-					vm_map_unlock(map);
-					/* 
-					 * vm_fault_user_wire succeded, thus
-					 * the area between start and eend
-					 * is wired and has to be unwired
-					 * here as part of the cleanup.
-					 */
-					(void) vm_map_user_pageable(map,
-								    start,
-								    eend,
-								    TRUE);
-					return (KERN_INVALID_ADDRESS);
-				}
+		/*
+		 * start_entry might have been clipped if we unlocked the
+		 * map and blocked.  No matter how clipped it has gotten
+		 * there should be a fragment that is on our start boundary.
+		 */
+		CLIP_CHECK_BACK(start_entry, start);
+	}
+
+	/*
+	 * Deal with the unwiring case.
+	 */
+	if (new_pageable) {
+		/*
+		 * This is the unwiring case.  We must first ensure that the
+		 * range to be unwired is really wired down.  We know there
+		 * are no holes.
+		 */
+		entry = start_entry;
+		while ((entry != &map->header) && (entry->start < end)) {
+			if ((entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
+				rv = KERN_INVALID_ARGUMENT;
+				goto done;
 			}
-			vm_map_simplify_entry(map,entry);
+			KASSERT(entry->wired_count != 0, ("wired count was 0 with USER_WIRED set! %p", entry));
+			entry = entry->next;
+		}
+
+		/*
+		 * Rewind: the check loop above consumed 'entry'.  Then drop
+		 * each wiring, unwiring regions whose count reaches zero.
+		 */
+		entry = start_entry;
+		while ((entry != &map->header) && (entry->start < end)) {
+			KASSERT(entry->eflags & MAP_ENTRY_USER_WIRED, ("expected USER_WIRED on entry %p", entry));
+			entry->eflags &= ~MAP_ENTRY_USER_WIRED;
+			entry->wired_count--;
+			if (entry->wired_count == 0)
+				vm_fault_unwire(map, entry->start, entry->end);
+			entry = entry->next;
 		}
 	}
+done:
+	vm_map_unclip_range(map, start_entry, start, real_end, 
+		MAP_CLIP_NO_HOLES);
 	map->timestamp++;
 	vm_map_unlock(map);
-	return KERN_SUCCESS;
+	return (rv);
 }
 
 /*
@@ -1432,80 +1661,30 @@
  *	must remain to the map throughout the call.
  */
 int
-vm_map_pageable(map, start, end, new_pageable)
+vm_map_pageable(map, start, real_end, new_pageable)
 	vm_map_t map;
 	vm_offset_t start;
-	vm_offset_t end;
+	vm_offset_t real_end;
 	boolean_t new_pageable;
 {
 	vm_map_entry_t entry;
 	vm_map_entry_t start_entry;
-	vm_offset_t failed = 0;
-	int rv;
+	vm_offset_t end;
+	int rv = KERN_SUCCESS;
+	int s;
 
 	vm_map_lock(map);
+	VM_MAP_RANGE_CHECK(map, start, real_end);
+	end = real_end;
 
-	VM_MAP_RANGE_CHECK(map, start, end);
-
-	/*
-	 * Only one pageability change may take place at one time, since
-	 * vm_fault assumes it will be called only once for each
-	 * wiring/unwiring.  Therefore, we have to make sure we're actually
-	 * changing the pageability for the entire region.  We do so before
-	 * making any changes.
-	 */
-
-	if (vm_map_lookup_entry(map, start, &start_entry) == FALSE) {
+	start_entry = vm_map_clip_range(map, start, end, MAP_CLIP_NO_HOLES);
+	if (start_entry == NULL) {
 		vm_map_unlock(map);
 		return (KERN_INVALID_ADDRESS);
 	}
-	entry = start_entry;
-
-	/*
-	 * Actions are rather different for wiring and unwiring, so we have
-	 * two separate cases.
-	 */
-
-	if (new_pageable) {
-
-		vm_map_clip_start(map, entry, start);
-
-		/*
-		 * Unwiring.  First ensure that the range to be unwired is
-		 * really wired down and that there are no holes.
-		 */
-		while ((entry != &map->header) && (entry->start < end)) {
-
-			if (entry->wired_count == 0 ||
-			    (entry->end < end &&
-				(entry->next == &map->header ||
-				    entry->next->start > entry->end))) {
-				vm_map_unlock(map);
-				return (KERN_INVALID_ARGUMENT);
-			}
-			entry = entry->next;
-		}
-
-		/*
-		 * Now decrement the wiring count for each region. If a region
-		 * becomes completely unwired, unwire its physical pages and
-		 * mappings.
-		 */
-		entry = start_entry;
-		while ((entry != &map->header) && (entry->start < end)) {
-			vm_map_clip_end(map, entry, end);
-
-			entry->wired_count--;
-			if (entry->wired_count == 0)
-				vm_fault_unwire(map, entry->start, entry->end);
-
-			vm_map_simplify_entry(map, entry);
-
-			entry = entry->next;
-		}
-	} else {
+	if (new_pageable == 0) {
 		/*
-		 * Wiring.  We must do this in two passes:
+		 * Wiring.  
 		 *
 		 * 1.  Holding the write lock, we create any shadow or zero-fill
 		 * objects that need to be created. Then we clip each map
@@ -1517,9 +1696,9 @@
 		 * fault in the pages for any newly wired area (wired_count is
 		 * 1).
 		 *
-		 * Downgrading to a read lock for vm_fault_wire avoids a possible
-		 * deadlock with another process that may have faulted on one
-		 * of the pages to be wired (it would mark the page busy,
+		 * Downgrading to a read lock for vm_fault_wire avoids a 
+		 * possible deadlock with another process that may have faulted
+		 * on one of the pages to be wired (it would mark the page busy,
 		 * blocking us, then in turn block on the map lock that we
 		 * hold).  Because of problems in the recursive lock package,
 		 * we cannot upgrade to a write lock in vm_map_lookup.  Thus,
@@ -1529,62 +1708,44 @@
 		 * change.
 		 */
 
-		/*
-		 * Pass 1.
-		 */
+		entry = start_entry;
 		while ((entry != &map->header) && (entry->start < end)) {
-			if (entry->wired_count == 0) {
-
-				/*
-				 * Perform actions of vm_map_lookup that need
-				 * the write lock on the map: create a shadow
-				 * object for a copy-on-write region, or an
-				 * object for a zero-fill region.
-				 *
-				 * We don't have to do this for entries that
-				 * point to sub maps, because we won't
-				 * hold the lock on the sub map.
-				 */
-				if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
-					int copyflag = entry->eflags & MAP_ENTRY_NEEDS_COPY;
-					if (copyflag &&
-					    ((entry->protection & VM_PROT_WRITE) != 0)) {
-
-						vm_object_shadow(&entry->object.vm_object,
-						    &entry->offset,
-						    atop(entry->end - entry->start));
-						entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
-					} else if (entry->object.vm_object == NULL &&
-						   !map->system_map) {
-						entry->object.vm_object =
-						    vm_object_allocate(OBJT_DEFAULT,
-							atop(entry->end - entry->start));
-						entry->offset = (vm_offset_t) 0;
-					}
-				}
+			/*
+			 * Trivial case if the entry is already wired
+			 */
+			if (entry->wired_count) {
+				entry->wired_count++;
+				entry = entry->next;
+				continue;
 			}
-			vm_map_clip_start(map, entry, start);
-			vm_map_clip_end(map, entry, end);
-			entry->wired_count++;
 
 			/*
-			 * Check for holes
+			 * The entry is being newly wired, we have to setup
+			 * appropriate management structures.  A shadow 
+			 * object is required for a copy-on-write region,
+			 * or a normal object for a zero-fill region.  We
+			 * do not have to do this for entries that point to sub
+			 * maps because we won't hold the lock on the sub map.
 			 */
-			if (entry->end < end &&
-			    (entry->next == &map->header ||
-				entry->next->start > entry->end)) {
-				/*
-				 * Found one.  Object creation actions do not
-				 * need to be undone, but the wired counts
-				 * need to be restored.
-				 */
-				while (entry != &map->header && entry->end > start) {
-					entry->wired_count--;
-					entry = entry->prev;
+			if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
+				int copyflag = entry->eflags & MAP_ENTRY_NEEDS_COPY;
+				if (copyflag &&
+				    ((entry->protection & VM_PROT_WRITE) != 0)) {
+
+					vm_object_shadow(&entry->object.vm_object,
+					    &entry->offset,
+					    atop(entry->end - entry->start));
+					entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
+				} else if (entry->object.vm_object == NULL &&
+					   !map->system_map) {
+					entry->object.vm_object =
+					    vm_object_allocate(OBJT_DEFAULT,
+						atop(entry->end - entry->start));
+					entry->offset = (vm_offset_t) 0;
 				}
-				vm_map_unlock(map);
-				return (KERN_INVALID_ARGUMENT);
 			}
+
+			entry->wired_count++;
 			entry = entry->next;
 		}
 
@@ -1595,22 +1756,22 @@
 		/*
 		 * HACK HACK HACK HACK
 		 *
-		 * If we are wiring in the kernel map or a submap of it,
-		 * unlock the map to avoid deadlocks.  We trust that the
-		 * kernel is well-behaved, and therefore will not do
-		 * anything destructive to this region of the map while
-		 * we have it unlocked.  We cannot trust user processes
-		 * to do the same.
+		 * Unlock the map to avoid deadlocks.  The in-transit flag
+		 * protects us from most changes but note that
+		 * clipping may still occur.  To prevent clipping from
+		 * occurring after the unlock, except for when we are
+		 * blocking in vm_fault_wire, we must run at splvm().
+		 * Otherwise our accesses to entry->start and entry->end
+		 * could be corrupted.  We have to set splvm() prior to
+		 * unlocking so start_entry does not change out from
+		 * under us at the very beginning of the loop.
 		 *
 		 * HACK HACK HACK HACK
 		 */
-		if (vm_map_pmap(map) == kernel_pmap) {
-			vm_map_unlock(map);	/* trust me ... */
-		} else {
-			vm_map_lock_downgrade(map);
-		}
 
-		rv = 0;
+		s = splvm();
+		vm_map_unlock(map);
+
 		entry = start_entry;
 		while (entry != &map->header && entry->start < end) {
 			/*
@@ -1618,50 +1779,87 @@
 			 * what has been done.  We decrement the wiring count
 			 * for those pages which have not yet been wired (now)
 			 * and unwire those that have (later).
-			 *
-			 * XXX this violates the locking protocol on the map,
-			 * needs to be fixed.
 			 */
-			if (rv)
-				entry->wired_count--;
-			else if (entry->wired_count == 1) {
+			vm_offset_t save_start = entry->start;
+			vm_offset_t save_end = entry->end;
+
+			if (entry->wired_count == 1)
 				rv = vm_fault_wire(map, entry->start, entry->end);
-				if (rv) {
-					failed = entry->start;
-					entry->wired_count--;
+			if (rv) {
+				CLIP_CHECK_BACK(entry, save_start);
+				for (;;) {
+					KASSERT(entry->wired_count == 1, ("wired_count changed unexpectedly"));
+					entry->wired_count = 0;
+					if (entry->end == save_end)
+						break;
+					entry = entry->next;
+					KASSERT(entry != &map->header, ("bad entry clip during backout"));
 				}
+				end = save_start;
+				break;
 			}
+			CLIP_CHECK_FWD(entry, save_end);
 			entry = entry->next;
 		}
+		splx(s);
 
-		if (vm_map_pmap(map) == kernel_pmap) {
-			vm_map_lock(map);
-		}
-		if (rv) {
-			vm_map_unlock(map);
-			(void) vm_map_pageable(map, start, failed, TRUE);
-			return (rv);
-		}
 		/*
-		 * An exclusive lock on the map is needed in order to call
-		 * vm_map_simplify_entry().  If the current lock on the map
-		 * is only a shared lock, an upgrade is needed.
+		 * relock.  start_entry is still IN_TRANSITION and must
+		 * still exist, but may have been clipped (handled just
+		 * below).
 		 */
-		if (vm_map_pmap(map) != kernel_pmap &&
-		    vm_map_lock_upgrade(map)) {
-			vm_map_lock(map);
-			if (vm_map_lookup_entry(map, start, &start_entry) ==
-			    FALSE) {
-				vm_map_unlock(map);
-				return KERN_SUCCESS;
+		vm_map_lock(map);
+
+		/*
+		 * If a failure occurred undo everything by falling through
+		 * to the unwiring code.  'end' has already been adjusted
+		 * appropriately.
+		 */
+		if (rv)
+			new_pageable = 1;
+
+		/*
+		 * start_entry might have been clipped if we unlocked the
+		 * map and blocked.  No matter how clipped it has gotten
+		 * there should be a fragment that is on our start boundary.
+		 */
+		CLIP_CHECK_BACK(start_entry, start);
+	}
+
+	if (new_pageable) {
+		/*
+		 * This is the unwiring case.  We must first ensure that the
+		 * range to be unwired is really wired down.  We know there
+		 * are no holes.
+		 */
+		entry = start_entry;
+		while ((entry != &map->header) && (entry->start < end)) {
+			if (entry->wired_count == 0) {
+				rv = KERN_INVALID_ARGUMENT;
+				goto done;
 			}
+			entry = entry->next;
 		}
-		vm_map_simplify_entry(map, start_entry);
-	}
 
+		/*
+		 * Now decrement the wiring count for each region. If a region
+		 * becomes completely unwired, unwire its physical pages and
+		 * mappings.
+		 */
+		entry = start_entry;
+		while ((entry != &map->header) && (entry->start < end)) {
+			entry->wired_count--;
+			if (entry->wired_count == 0)
+				vm_fault_unwire(map, entry->start, entry->end);
+			entry = entry->next;
+		}
+	}
+done:
+	vm_map_unclip_range(map, start_entry, start, real_end, 
+		MAP_CLIP_NO_HOLES);
+	map->timestamp++;
 	vm_map_unlock(map);
-
-	return (KERN_SUCCESS);
+	return (rv);
 }
 
 /*
@@ -1849,6 +2047,7 @@
 	 * Find the start of the region, and clip it
 	 */
 
+again:
 	if (!vm_map_lookup_entry(map, start, &first_entry))
 		entry = first_entry->next;
 	else {
@@ -1880,6 +2079,22 @@
 		vm_offset_t s, e;
 		vm_pindex_t offidxstart, offidxend, count;
 
+		/*
+		 * If we hit an in-transition entry we have to sleep and
+		 * retry.  It's easier (and not really slower) to just retry
+		 * since this case occurs so rarely and the hint is already
+		 * pointing at the right place.  We have to reset the
+		 * start offset so as not to accidently delete an entry
+		 * another process just created in vacated space.
+		 */
+		if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
+			entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
+			start = entry->start;
+			++cnt.v_intrans_coll;
+			++cnt.v_intrans_wait;
+			vm_map_transition_wait(map);
+			goto again;
+		}
 		vm_map_clip_end(map, entry, end);
 
 		s = entry->start;
@@ -2377,21 +2592,22 @@
 	vm_map_t map = &vm->vm_map;
 	vm_offset_t    end;
 	int      grow_amount;
-	int      rv;
+	int      rv = KERN_SUCCESS;
 	int      is_procstack;
+	int	 use_read_lock = 1;
+
 Retry:
-	vm_map_lock_read(map);
+	if (use_read_lock)
+		vm_map_lock_read(map);
+	else
+		vm_map_lock(map);
 
 	/* If addr is already in the entry range, no need to grow.*/
-	if (vm_map_lookup_entry(map, addr, &prev_entry)) {
-		vm_map_unlock_read(map);
-		return (KERN_SUCCESS);
-	}
+	if (vm_map_lookup_entry(map, addr, &prev_entry))
+		goto done;
 
-	if ((stack_entry = prev_entry->next) == &map->header) {
-		vm_map_unlock_read(map);
-		return (KERN_SUCCESS);
-	} 
+	if ((stack_entry = prev_entry->next) == &map->header)
+		goto done;
 	if (prev_entry == &map->header) 
 		end = stack_entry->start - stack_entry->avail_ssize;
 	else
@@ -2407,15 +2623,14 @@
 	if (stack_entry->avail_ssize < 1 ||
 	    addr >= stack_entry->start ||
 	    addr <  stack_entry->start - stack_entry->avail_ssize) {
-		vm_map_unlock_read(map);
-		return (KERN_SUCCESS);
+		goto done;
 	} 
 	
 	/* Find the minimum grow amount */
 	grow_amount = roundup (stack_entry->start - addr, PAGE_SIZE);
 	if (grow_amount > stack_entry->avail_ssize) {
-		vm_map_unlock_read(map);
-		return (KERN_NO_SPACE);
+		rv = KERN_NO_SPACE;
+		goto done;
 	}
 
 	/* If there is no longer enough space between the entries
@@ -2428,13 +2643,14 @@
 	 * might have intended by limiting the stack size.
 	 */
 	if (grow_amount > stack_entry->start - end) {
-		if (vm_map_lock_upgrade(map))
+		if (use_read_lock && vm_map_lock_upgrade(map)) {
+			use_read_lock = 0;
 			goto Retry;
-
+		}
+		use_read_lock = 0;
 		stack_entry->avail_ssize = stack_entry->start - end;
-
-		vm_map_unlock(map);
-		return (KERN_NO_SPACE);
+		rv = KERN_NO_SPACE;
+		goto done;
 	}
 
 	is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr;
@@ -2444,8 +2660,8 @@
 	 */
 	if (is_procstack && (ctob(vm->vm_ssize) + grow_amount >
 			     p->p_rlimit[RLIMIT_STACK].rlim_cur)) {
-		vm_map_unlock_read(map);
-		return (KERN_NO_SPACE);
+		rv = KERN_NO_SPACE;
+		goto done;
 	}
 
 	/* Round up the grow amount modulo SGROWSIZ */
@@ -2462,12 +2678,15 @@
 	/* If we would blow our VMEM resource limit, no go */
 	if (map->size + grow_amount >
 	    curproc->p_rlimit[RLIMIT_VMEM].rlim_cur) {
-		vm_map_unlock_read(map);
-		return (KERN_NO_SPACE);
+		rv = KERN_NO_SPACE;
+		goto done;
 	}
 
-	if (vm_map_lock_upgrade(map))
+	if (use_read_lock && vm_map_lock_upgrade(map)) {
+		use_read_lock = 0;
 		goto Retry;
+	}
+	use_read_lock = 0;
 
 	/* Get the preliminary new entry start value */
 	addr = stack_entry->start - grow_amount;
@@ -2503,9 +2722,12 @@
 		}
 	}
 
-	vm_map_unlock(map);
+done:
+	if (use_read_lock)
+		vm_map_unlock_read(map);
+	else
+		vm_map_unlock(map);
 	return (rv);
-
 }
 
 /*
@@ -2593,28 +2815,20 @@
 	vm_map_t map = *var_map;
 	vm_prot_t prot;
 	vm_prot_t fault_type = fault_typea;
+	int use_read_lock = 1;
+	int rv = KERN_SUCCESS;
 
-RetryLookup:;
-
-	/*
-	 * Lookup the faulting address.
-	 */
-
-	vm_map_lock_read(map);
-
-#define	RETURN(why) \
-		{ \
-		vm_map_unlock_read(map); \
-		return(why); \
-		}
+RetryLookup:
+	if (use_read_lock)
+		vm_map_lock_read(map);
+	else
+		vm_map_lock(map);
 
 	/*
 	 * If the map has an interesting hint, try it before calling full
 	 * blown lookup routine.
 	 */
-
 	entry = map->hint;
-
 	*out_entry = entry;
 
 	if ((entry == &map->header) ||
@@ -2625,8 +2839,10 @@
 		 * Entry was either not a valid hint, or the vaddr was not
 		 * contained in the entry, so do a full lookup.
 		 */
-		if (!vm_map_lookup_entry(map, vaddr, &tmp_entry))
-			RETURN(KERN_INVALID_ADDRESS);
+		if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) {
+			rv = KERN_INVALID_ADDRESS;
+			goto done;
+		}
 
 		entry = tmp_entry;
 		*out_entry = entry;
@@ -2640,7 +2856,11 @@
 		vm_map_t old_map = map;
 
 		*var_map = map = entry->object.sub_map;
-		vm_map_unlock_read(old_map);
+		if (use_read_lock)
+			vm_map_unlock_read(old_map);
+		else
+			vm_map_unlock(old_map);
+		use_read_lock = 1;
 		goto RetryLookup;
 	}
 
@@ -2658,14 +2878,16 @@
 
 	fault_type &= (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
 	if ((fault_type & prot) != fault_type) {
-			RETURN(KERN_PROTECTION_FAILURE);
+		rv = KERN_PROTECTION_FAILURE;
+		goto done;
 	}
 
 	if ((entry->eflags & MAP_ENTRY_USER_WIRED) &&
 	    (entry->eflags & MAP_ENTRY_COW) &&
 	    (fault_type & VM_PROT_WRITE) &&
 	    (fault_typea & VM_PROT_OVERRIDE_WRITE) == 0) {
-		RETURN(KERN_PROTECTION_FAILURE);
+		rv = KERN_PROTECTION_FAILURE;
+		goto done;
 	}
 
 	/*
@@ -2698,8 +2920,11 @@
 			 * object.
 			 */
 
-			if (vm_map_lock_upgrade(map))
+			if (use_read_lock && vm_map_lock_upgrade(map)) {
+				use_read_lock = 0;
 				goto RetryLookup;
+			}
+			use_read_lock = 0;
 
 			vm_object_shadow(
 			    &entry->object.vm_object,
@@ -2707,7 +2932,6 @@
 			    atop(entry->end - entry->start));
 
 			entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
-			vm_map_lock_downgrade(map);
 		} else {
 			/*
 			 * We're attempting to read a copy-on-write page --
@@ -2723,13 +2947,14 @@
 	 */
 	if (entry->object.vm_object == NULL &&
 	    !map->system_map) {
-		if (vm_map_lock_upgrade(map)) 
+		if (use_read_lock && vm_map_lock_upgrade(map))  {
+			use_read_lock = 0;
 			goto RetryLookup;
-
+		}
+		use_read_lock = 0;
 		entry->object.vm_object = vm_object_allocate(OBJT_DEFAULT,
 		    atop(entry->end - entry->start));
 		entry->offset = 0;
-		vm_map_lock_downgrade(map);
 	}
 
 	/*
@@ -2741,13 +2966,21 @@
 	*object = entry->object.vm_object;
 
 	/*
-	 * Return whether this is the only map sharing this data.
+	 * Return whether this is the only map sharing this data.  On
+	 * success we return with a read lock held on the map (downgrading
+	 * any exclusive lock taken above); on failure the map is unlocked.
 	 */
-
 	*out_prot = prot;
-	return (KERN_SUCCESS);
-
-#undef	RETURN
+done:
+	if (rv == KERN_SUCCESS) {
+		if (use_read_lock == 0)
+			vm_map_lock_downgrade(map);
+	} else if (use_read_lock) {
+		vm_map_unlock_read(map);
+	} else {
+		vm_map_unlock(map);
+	}
+	return (rv);
 }
 
 /*
Index: vm/vm_map.h
===================================================================
RCS file: /home/ncvs/src/sys/vm/vm_map.h,v
retrieving revision 1.54.2.3
diff -u -r1.54.2.3 vm_map.h
--- vm/vm_map.h	3 Nov 2001 00:59:15 -0000	1.54.2.3
+++ vm/vm_map.h	8 Sep 2002 05:58:16 -0000
@@ -128,7 +128,14 @@
 
 #define MAP_ENTRY_BEHAV_MASK		0x00C0
 
+#define MAP_ENTRY_IN_TRANSITION		0x0100	/* entry being changed */
+#define MAP_ENTRY_NEEDS_WAKEUP		0x0200	/* waiters in transition */
 #define MAP_ENTRY_NOCOREDUMP		0x0400	/* don't include in a core */
+
+/*
+ * flags for vm_map_[un]clip_range()
+ */
+#define MAP_CLIP_NO_HOLES		0x0001
 
 static __inline u_char   
 vm_map_entry_behavior(struct vm_map_entry *entry)
Index: vm/vm_meter.c
===================================================================
RCS file: /home/ncvs/src/sys/vm/vm_meter.c,v
retrieving revision 1.34.2.6
diff -u -r1.34.2.6 vm_meter.c
--- vm/vm_meter.c	14 Nov 2001 17:22:53 -0000	1.34.2.6
+++ vm/vm_meter.c	8 Sep 2002 01:30:08 -0000
@@ -71,6 +71,10 @@
 	CTLFLAG_RW, &cnt.v_pageout_free_min, 0, "");
 SYSCTL_UINT(_vm, OID_AUTO, v_free_severe,
 	CTLFLAG_RW, &cnt.v_free_severe, 0, "");
+SYSCTL_UINT(_vm, OID_AUTO, v_intrans_coll,
+	CTLFLAG_RW, &cnt.v_intrans_coll, 0, "");
+SYSCTL_UINT(_vm, OID_AUTO, v_intrans_wait,
+	CTLFLAG_RW, &cnt.v_intrans_wait, 0, "");
 
 SYSCTL_STRUCT(_vm, VM_LOADAVG, loadavg, CTLFLAG_RD, 
     &averunnable, loadavg, "Machine loadaverage history");
Index: vm/vm_zone.c
===================================================================
RCS file: /home/ncvs/src/sys/vm/Attic/vm_zone.c,v
retrieving revision 1.30.2.5
diff -u -r1.30.2.5 vm_zone.c
--- vm/vm_zone.c	12 Aug 2002 23:39:08 -0000	1.30.2.5
+++ vm/vm_zone.c	16 Aug 2002 21:55:56 -0000
@@ -56,12 +56,21 @@
 		zerror(ZONE_ERROR_INVALID);
 #endif
 
-	if (z->zfreecnt <= z->zfreemin)
-		return _zget(z);
+	if (z->zfreecnt <= z->zfreemin) {
+		item = _zget(z);
+		/*
+		 * ZONE_PANICFAIL allows the caller to assume that zalloc()
+		 * will always succeed.  If it doesn't, we panic here.
+		 */
+		if (item == NULL && (z->zflags & ZONE_PANICFAIL))
+			panic("zalloc(%s) failed", z->zname);
+		return(item);
+	}
 
 	item = z->zitems;
 	z->zitems = ((void **) item)[0];
 #ifdef INVARIANTS
+	KASSERT(item != NULL, ("zitems unexpectedly NULL"));
 	if (((void **) item)[1] != (void *) ZENTRY_FREE)
 		zerror(ZONE_ERROR_NOTFREE);
 	((void **) item)[1] = 0;