Index: kern/vfs_bio.c =================================================================== RCS file: /home/ncvs/src/sys/kern/vfs_bio.c,v retrieving revision 1.242.2.17 diff -u -r1.242.2.17 vfs_bio.c --- kern/vfs_bio.c 29 Jun 2002 16:38:27 -0000 1.242.2.17 +++ kern/vfs_bio.c 6 Sep 2002 21:41:06 -0000 @@ -442,11 +442,13 @@ { if (bp->b_kvasize) { ++buffreekvacnt; + vm_map_lock(buffer_map); bufspace -= bp->b_kvasize; vm_map_delete(buffer_map, (vm_offset_t) bp->b_kvabase, (vm_offset_t) bp->b_kvabase + bp->b_kvasize ); + vm_map_unlock(buffer_map); bp->b_kvasize = 0; bufspacewakeup(); } @@ -1740,12 +1742,15 @@ bfreekva(bp); + vm_map_lock(buffer_map); + if (vm_map_findspace(buffer_map, vm_map_min(buffer_map), maxsize, &addr)) { /* * Uh oh. Buffer map is to fragmented. We * must defragment the map. */ + vm_map_unlock(buffer_map); ++bufdefragcnt; defrag = 1; bp->b_flags |= B_INVAL; @@ -1762,6 +1767,7 @@ bufspace += bp->b_kvasize; ++bufreusecnt; } + vm_map_unlock(buffer_map); } bp->b_data = bp->b_kvabase; } Index: sys/vmmeter.h =================================================================== RCS file: /home/ncvs/src/sys/sys/vmmeter.h,v retrieving revision 1.21.2.1 diff -u -r1.21.2.1 vmmeter.h --- sys/vmmeter.h 18 Feb 2001 15:41:11 -0000 1.21.2.1 +++ sys/vmmeter.h 8 Sep 2002 01:29:22 -0000 @@ -103,6 +103,8 @@ u_int v_vforkpages; /* number of VM pages affected by vfork() */ u_int v_rforkpages; /* number of VM pages affected by rfork() */ u_int v_kthreadpages; /* number of VM pages affected by fork() by kernel */ + u_int v_intrans_coll; /* intransit map collisions (total) */ + u_int v_intrans_wait; /* intransit map collisions which blocked */ }; #ifdef _KERNEL Index: vm/vm_map.c =================================================================== RCS file: /home/ncvs/src/sys/vm/vm_map.c,v retrieving revision 1.187.2.14 diff -u -r1.187.2.14 vm_map.c --- vm/vm_map.c 2 Jul 2002 20:06:18 -0000 1.187.2.14 +++ vm/vm_map.c 8 Sep 2002 19:38:39 -0000 @@ -146,6 +146,7 @@ static void 
vm_map_copy_entry __P((vm_map_t, vm_map_t, vm_map_entry_t, vm_map_entry_t)); static void vm_map_split __P((vm_map_entry_t)); +static void vm_map_unclip_range __P((vm_map_t map, vm_map_entry_t start_entry, vm_offset_t start, vm_offset_t end, int flags)); void vm_map_startup() @@ -355,9 +356,13 @@ vm_map_entry_unlink(vm_map_t map, vm_map_entry_t entry) { - vm_map_entry_t prev = entry->prev; - vm_map_entry_t next = entry->next; + vm_map_entry_t prev; + vm_map_entry_t next; + if (entry->eflags & MAP_ENTRY_IN_TRANSITION) + panic("vm_map_entry_unlink: attempt to mess with locked entry! %p", entry); + prev = entry->prev; + next = entry->next; next->prev = prev; prev->next = next; map->nentries--; @@ -734,7 +739,8 @@ * * This routine guarentees that the passed entry remains valid (though * possibly extended). When merging, this routine may delete one or - * both neighbors. + * both neighbors. No action is taken on entries which have their + * in-transition flag set. */ void vm_map_simplify_entry(map, entry) @@ -744,8 +750,10 @@ vm_map_entry_t next, prev; vm_size_t prevsize, esize; - if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) + if (entry->eflags & (MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_IS_SUB_MAP)) { + ++cnt.v_intrans_coll; return; + } prev = entry->prev; if (prev != &map->header) { @@ -935,6 +943,208 @@ } /* + * vm_map_transition_wait: [ kernel use only ] + * + * Used to block when an in-transition collison occurs. The map + * is unlocked for the sleep and relocked before the return. + */ +static +void +vm_map_transition_wait(vm_map_t map) +{ + vm_map_unlock(map); + tsleep(map, PVM, "vment", 0); + vm_map_lock(map); +} + +/* + * CLIP_CHECK_BACK + * CLIP_CHECK_FWD + * + * When we do blocking operations with the map lock held it is + * possible that a clip might have occured on our in-transit entry, + * requiring an adjustment to the entry in our loop. These macros + * help the pageable and clip_range code deal with the case. 
The + * conditional costs virtually nothing if no clipping has occured. + */ + +#define CLIP_CHECK_BACK(entry, save_start) \ + do { \ + while (entry->start != save_start) { \ + entry = entry->prev; \ + KASSERT(entry != &map->header, ("bad entry clip")); \ + } \ + } while(0) + +#define CLIP_CHECK_FWD(entry, save_end) \ + do { \ + while (entry->end != save_end) { \ + entry = entry->next; \ + KASSERT(entry != &map->header, ("bad entry clip")); \ + } \ + } while(0) + + +/* + * vm_map_clip_range: [ kernel use only ] + * + * Clip the specified range and return the base entry. The + * range may cover several entries starting at the returned base + * and the first and last entry in the covering sequence will be + * properly clipped to the requested start and end address. + * + * If no holes are allowed you should pass the MAP_CLIP_NO_HOLES + * flag. + * + * The MAP_ENTRY_IN_TRANSITION flag will be set for the entries + * covered by the requested range. + * + * The map must be exclusively locked on entry and will remain locked + * on return. If no range exists or the range contains holes and you + * specified that no holes were allowed, NULL will be returned. This + * routine may temporarily unlock the map in order avoid a deadlock when + * sleeping. + */ +static +vm_map_entry_t +vm_map_clip_range(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags) +{ + vm_map_entry_t start_entry; + vm_map_entry_t entry; + + /* + * Locate the entry and effect initial clipping. The in-transition + * case does not occur very often so do not try to optimize it. + */ +again: + if (vm_map_lookup_entry(map, start, &start_entry) == FALSE) + return (NULL); + entry = start_entry; + if (entry->eflags & MAP_ENTRY_IN_TRANSITION) { + entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; + ++cnt.v_intrans_coll; + ++cnt.v_intrans_wait; + vm_map_transition_wait(map); + /* + * entry and/or start_entry may have been clipped while + * we slept, or may have gone away entirely. 
We have + * to restart from the lookup. + */ + goto again; + } + /* + * Since we hold an exclusive map lock we do not have to restart + * after clipping, even though clipping may block in zalloc. + */ + vm_map_clip_start(map, entry, start); + vm_map_clip_end(map, entry, end); + entry->eflags |= MAP_ENTRY_IN_TRANSITION; + + /* + * Scan entries covered by the range. When working on the next + * entry a restart need only re-loop on the current entry which + * we have already locked, since 'next' may have changed. Also, + * even though entry is safe, it may have been clipped so we + * have to iterate forwards through the clip after sleeping. + */ + while (entry->next != &map->header && entry->next->start < end) { + vm_map_entry_t next = entry->next; + + if (flags & MAP_CLIP_NO_HOLES) { + if (next->start > entry->end) { + vm_map_unclip_range(map, start_entry, + start, entry->end, flags); + return(NULL); + } + } + + if (next->eflags & MAP_ENTRY_IN_TRANSITION) { + vm_offset_t save_end = entry->end; + next->eflags |= MAP_ENTRY_NEEDS_WAKEUP; + ++cnt.v_intrans_coll; + ++cnt.v_intrans_wait; + vm_map_transition_wait(map); + + /* + * clips might have occured while we blocked. + */ + CLIP_CHECK_FWD(entry, save_end); + CLIP_CHECK_BACK(start_entry, start); + continue; + } + /* + * No restart necessary even though clip_end may block, we + * are holding the map lock. + */ + vm_map_clip_end(map, next, end); + next->eflags |= MAP_ENTRY_IN_TRANSITION; + entry = next; + } + if (flags & MAP_CLIP_NO_HOLES) { + if (entry->end != end) { + vm_map_unclip_range(map, start_entry, + start, entry->end, flags); + return(NULL); + } + } + return(start_entry); +} + +/* + * vm_map_unclip_range: [ kernel use only ] + * + * Undo the effect of vm_map_clip_range(). You should pass the same + * flags and the same range that you passed to vm_map_clip_range(). + * This code will clear the in-transition flag on the entries and + * wake up anyone waiting. 
This code will also simplify the sequence + * and attempt to merge it with entries before and after the sequence. + * + * The map must be locked on entry and will remain locked on return. + * + * Note that you should also pass the start_entry returned by + * vm_map_clip_range(). However, if you block between the two calls + * with the map unlocked please be aware that the start_entry may + * have been clipped and you may need to scan it backwards to find + * the entry corresponding with the original start address. You are + * responsible for this, vm_map_unclip_range() expects the correct + * start_entry to be passed to it and will KASSERT otherwise. + */ +static +void +vm_map_unclip_range( + vm_map_t map, + vm_map_entry_t start_entry, + vm_offset_t start, + vm_offset_t end, + int flags) +{ + vm_map_entry_t entry; + + entry = start_entry; + + KASSERT(entry->start == start, ("unclip_range: illegal base entry")); + while (entry != &map->header && entry->start < end) { + KASSERT(entry->eflags & MAP_ENTRY_IN_TRANSITION, ("in-transition flag not set during unclip on: %p", entry)); + KASSERT(entry->end <= end, ("unclip_range: tail wasn't clipped")); + entry->eflags &= ~MAP_ENTRY_IN_TRANSITION; + if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) { + entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP; + wakeup(map); + } + entry = entry->next; + } + + /* + * Simplification does not block so there is no restart case. + */ + entry = start_entry; + while (entry != &map->header && entry->start < end) { + vm_map_simplify_entry(map, entry); + entry = entry->next; + } +} + +/* * vm_map_submap: [ kernel use only ] * * Mark the given range as handled by a subordinate map. 
@@ -968,8 +1178,9 @@ if (vm_map_lookup_entry(map, start, &entry)) { vm_map_clip_start(map, entry, start); - } else + } else { entry = entry->next; + } vm_map_clip_end(map, entry, end); @@ -1279,58 +1490,40 @@ * Implement the semantics of mlock */ int -vm_map_user_pageable(map, start, end, new_pageable) +vm_map_user_pageable(map, start, real_end, new_pageable) vm_map_t map; vm_offset_t start; - vm_offset_t end; + vm_offset_t real_end; boolean_t new_pageable; { vm_map_entry_t entry; vm_map_entry_t start_entry; - vm_offset_t estart; - vm_offset_t eend; - int rv; + vm_offset_t end; + int rv = KERN_SUCCESS; vm_map_lock(map); - VM_MAP_RANGE_CHECK(map, start, end); + VM_MAP_RANGE_CHECK(map, start, real_end); + end = real_end; - if (vm_map_lookup_entry(map, start, &start_entry) == FALSE) { + start_entry = vm_map_clip_range(map, start, end, MAP_CLIP_NO_HOLES); + if (start_entry == NULL) { vm_map_unlock(map); return (KERN_INVALID_ADDRESS); } - if (new_pageable) { - + if (new_pageable == 0) { entry = start_entry; - vm_map_clip_start(map, entry, start); - - /* - * Now decrement the wiring count for each region. If a region - * becomes completely unwired, unwire its physical pages and - * mappings. 
- */ - while ((entry != &map->header) && (entry->start < end)) { - if (entry->eflags & MAP_ENTRY_USER_WIRED) { - vm_map_clip_end(map, entry, end); - entry->eflags &= ~MAP_ENTRY_USER_WIRED; - entry->wired_count--; - if (entry->wired_count == 0) - vm_fault_unwire(map, entry->start, entry->end); - } - vm_map_simplify_entry(map,entry); - entry = entry->next; - } - } else { - - entry = start_entry; - while ((entry != &map->header) && (entry->start < end)) { + vm_offset_t save_start; + vm_offset_t save_end; + /* + * Already user wired or hard wired (trivial cases) + */ if (entry->eflags & MAP_ENTRY_USER_WIRED) { entry = entry->next; continue; } - if (entry->wired_count != 0) { entry->wired_count++; entry->eflags |= MAP_ENTRY_USER_WIRED; @@ -1338,8 +1531,11 @@ continue; } - /* Here on entry being newly wired */ - + /* + * A new wiring requires instantiation of appropriate + * management structures and the faulting in of the + * page. + */ if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { int copyflag = entry->eflags & MAP_ENTRY_NEEDS_COPY; if (copyflag && ((entry->protection & VM_PROT_WRITE) != 0)) { @@ -1359,65 +1555,98 @@ } } - - vm_map_clip_start(map, entry, start); - vm_map_clip_end(map, entry, end); - entry->wired_count++; entry->eflags |= MAP_ENTRY_USER_WIRED; - estart = entry->start; - eend = entry->end; - /* First we need to allow map modifications */ - vm_map_set_recursive(map); - vm_map_lock_downgrade(map); + /* + * Now fault in the area. The map lock needs to be + * manipulated to avoid deadlocks. The in-transition + * flag protects the entries. 
+ */ + save_start = entry->start; + save_end = entry->end; + vm_map_unlock(map); map->timestamp++; - - rv = vm_fault_user_wire(map, entry->start, entry->end); + rv = vm_fault_user_wire(map, save_start, save_end); + vm_map_lock(map); if (rv) { + CLIP_CHECK_BACK(entry, save_start); + for (;;) { + KASSERT(entry->wired_count == 1, ("bad wired_count on entry")); + entry->eflags &= ~MAP_ENTRY_USER_WIRED; + entry->wired_count = 0; + if (entry->end == save_end) + break; + entry = entry->next; + KASSERT(entry != &map->header, ("bad entry clip during backout")); + } + end = save_start; /* unwire the rest */ + break; + } + /* + * note that even though the entry might have been + * clipped, the USER_WIRED flag we set prevents + * duplication so we do not have to do a + * clip check. + */ + entry = entry->next; + } - entry->wired_count--; - entry->eflags &= ~MAP_ENTRY_USER_WIRED; + /* + * If we failed fall through to the unwiring section to + * unwire what we had wired so far. 'end' has already + * been adjusted. + */ + if (rv) + new_pageable = 1; - vm_map_clear_recursive(map); - vm_map_unlock(map); - - /* - * At this point, the map is unlocked, and - * entry might no longer be valid. Use copy - * of entry start value obtained while entry - * was valid. - */ - (void) vm_map_user_pageable(map, start, estart, - TRUE); - return rv; - } - - vm_map_clear_recursive(map); - if (vm_map_lock_upgrade(map)) { - vm_map_lock(map); - if (vm_map_lookup_entry(map, estart, &entry) - == FALSE) { - vm_map_unlock(map); - /* - * vm_fault_user_wire succeded, thus - * the area between start and eend - * is wired and has to be unwired - * here as part of the cleanup. - */ - (void) vm_map_user_pageable(map, - start, - eend, - TRUE); - return (KERN_INVALID_ADDRESS); - } + /* + * start_entry might have been clipped if we unlocked the + * map and blocked. No matter how clipped it has gotten + * there should be a fragment that is on our start boundary. 
+ */ + CLIP_CHECK_BACK(start_entry, start); + } + + /* + * Deal with the unwiring case. + */ + if (new_pageable) { + /* + * This is the unwiring case. We must first ensure that the + * range to be unwired is really wired down. We know there + * are no holes. + */ + entry = start_entry; + while ((entry != &map->header) && (entry->start < end)) { + if ((entry->eflags & MAP_ENTRY_USER_WIRED) == 0) { + rv = KERN_INVALID_ARGUMENT; + goto done; } - vm_map_simplify_entry(map,entry); + KASSERT(entry->wired_count != 0, ("wired count was 0 with USER_WIRED set! %p", entry)); + entry = entry->next; + } + + /* + * Now decrement the wiring count for each region. If a region + * becomes completely unwired, unwire its physical pages and + * mappings. The scan must restart at start_entry because the + * verification loop above left 'entry' past the end of the range. + */ + entry = start_entry; + while ((entry != &map->header) && (entry->start < end)) { + KASSERT(entry->eflags & MAP_ENTRY_USER_WIRED, ("expected USER_WIRED on entry %p", entry)); + entry->eflags &= ~MAP_ENTRY_USER_WIRED; + entry->wired_count--; + if (entry->wired_count == 0) + vm_fault_unwire(map, entry->start, entry->end); + entry = entry->next; } } +done: + vm_map_unclip_range(map, start_entry, start, real_end, + MAP_CLIP_NO_HOLES); map->timestamp++; vm_map_unlock(map); - return KERN_SUCCESS; + return (rv); } /* @@ -1432,80 +1661,30 @@ * must remain to the map throughout the call. */ int -vm_map_pageable(map, start, end, new_pageable) +vm_map_pageable(map, start, real_end, new_pageable) vm_map_t map; vm_offset_t start; - vm_offset_t end; + vm_offset_t real_end; boolean_t new_pageable; { vm_map_entry_t entry; vm_map_entry_t start_entry; - vm_offset_t failed = 0; - int rv; + vm_offset_t end; + int rv = KERN_SUCCESS; + int s; vm_map_lock(map); + VM_MAP_RANGE_CHECK(map, start, real_end); + end = real_end; - VM_MAP_RANGE_CHECK(map, start, end); - - /* - * Only one pageability change may take place at one time, since - * vm_fault assumes it will be called only once for each - * wiring/unwiring.
Therefore, we have to make sure we're actually - * changing the pageability for the entire region. We do so before - * making any changes. - */ - - if (vm_map_lookup_entry(map, start, &start_entry) == FALSE) { + start_entry = vm_map_clip_range(map, start, end, MAP_CLIP_NO_HOLES); + if (start_entry == NULL) { vm_map_unlock(map); return (KERN_INVALID_ADDRESS); } - entry = start_entry; - - /* - * Actions are rather different for wiring and unwiring, so we have - * two separate cases. - */ - - if (new_pageable) { - - vm_map_clip_start(map, entry, start); - - /* - * Unwiring. First ensure that the range to be unwired is - * really wired down and that there are no holes. - */ - while ((entry != &map->header) && (entry->start < end)) { - - if (entry->wired_count == 0 || - (entry->end < end && - (entry->next == &map->header || - entry->next->start > entry->end))) { - vm_map_unlock(map); - return (KERN_INVALID_ARGUMENT); - } - entry = entry->next; - } - - /* - * Now decrement the wiring count for each region. If a region - * becomes completely unwired, unwire its physical pages and - * mappings. - */ - entry = start_entry; - while ((entry != &map->header) && (entry->start < end)) { - vm_map_clip_end(map, entry, end); - - entry->wired_count--; - if (entry->wired_count == 0) - vm_fault_unwire(map, entry->start, entry->end); - - vm_map_simplify_entry(map, entry); - - entry = entry->next; - } - } else { + if (new_pageable == 0) { /* - * Wiring. We must do this in two passes: + * Wiring. * * 1. Holding the write lock, we create any shadow or zero-fill * objects that need to be created. Then we clip each map @@ -1517,9 +1696,9 @@ * fault in the pages for any newly wired area (wired_count is * 1). 
* - * Downgrading to a read lock for vm_fault_wire avoids a possible - * deadlock with another process that may have faulted on one - * of the pages to be wired (it would mark the page busy, + * Downgrading to a read lock for vm_fault_wire avoids a + * possible deadlock with another process that may have faulted + * on one of the pages to be wired (it would mark the page busy, * blocking us, then in turn block on the map lock that we * hold). Because of problems in the recursive lock package, * we cannot upgrade to a write lock in vm_map_lookup. Thus, @@ -1529,62 +1708,44 @@ * change. */ - /* - * Pass 1. - */ + entry = start_entry; while ((entry != &map->header) && (entry->start < end)) { - if (entry->wired_count == 0) { - - /* - * Perform actions of vm_map_lookup that need - * the write lock on the map: create a shadow - * object for a copy-on-write region, or an - * object for a zero-fill region. - * - * We don't have to do this for entries that - * point to sub maps, because we won't - * hold the lock on the sub map. - */ - if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { - int copyflag = entry->eflags & MAP_ENTRY_NEEDS_COPY; - if (copyflag && - ((entry->protection & VM_PROT_WRITE) != 0)) { - - vm_object_shadow(&entry->object.vm_object, - &entry->offset, - atop(entry->end - entry->start)); - entry->eflags &= ~MAP_ENTRY_NEEDS_COPY; - } else if (entry->object.vm_object == NULL && - !map->system_map) { - entry->object.vm_object = - vm_object_allocate(OBJT_DEFAULT, - atop(entry->end - entry->start)); - entry->offset = (vm_offset_t) 0; - } - } + /* + * Trivial case if the entry is already wired + */ + if (entry->wired_count) { + entry->wired_count++; + entry = entry->next; + continue; } - vm_map_clip_start(map, entry, start); - vm_map_clip_end(map, entry, end); - entry->wired_count++; /* - * Check for holes + * The entry is being newly wired, we have to setup + * appropriate management structures. 
A shadow + * object is required for a copy-on-write region, + * or a normal object for a zero-fill region. We + * do not have to do this for entries that point to sub + * maps because we won't hold the lock on the sub map. */ - if (entry->end < end && - (entry->next == &map->header || - entry->next->start > entry->end)) { - /* - * Found one. Object creation actions do not - * need to be undone, but the wired counts - * need to be restored. - */ - while (entry != &map->header && entry->end > start) { - entry->wired_count--; - entry = entry->prev; + if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { + int copyflag = entry->eflags & MAP_ENTRY_NEEDS_COPY; + if (copyflag && + ((entry->protection & VM_PROT_WRITE) != 0)) { + + vm_object_shadow(&entry->object.vm_object, + &entry->offset, + atop(entry->end - entry->start)); + entry->eflags &= ~MAP_ENTRY_NEEDS_COPY; + } else if (entry->object.vm_object == NULL && + !map->system_map) { + entry->object.vm_object = + vm_object_allocate(OBJT_DEFAULT, + atop(entry->end - entry->start)); + entry->offset = (vm_offset_t) 0; } - vm_map_unlock(map); - return (KERN_INVALID_ARGUMENT); } + + entry->wired_count++; entry = entry->next; } @@ -1595,22 +1756,22 @@ /* * HACK HACK HACK HACK * - * If we are wiring in the kernel map or a submap of it, - * unlock the map to avoid deadlocks. We trust that the - * kernel is well-behaved, and therefore will not do - * anything destructive to this region of the map while - * we have it unlocked. We cannot trust user processes - * to do the same. + * Unlock the map to avoid deadlocks. The in-transit flag + * protects us from most changes but note that + * clipping may still occur. To prevent clipping from + * occuring after the unlock, except for when we are + * blocking in vm_fault_wire, we must run at splvm(). + * Otherwise our accesses to entry->start and entry->end + * could be corrupted. 
We have to set splvm() prior to + * unlocking so start_entry does not change out from + * under us at the very beginning of the loop. * * HACK HACK HACK HACK */ - if (vm_map_pmap(map) == kernel_pmap) { - vm_map_unlock(map); /* trust me ... */ - } else { - vm_map_lock_downgrade(map); - } - rv = 0; + s = splvm(); + vm_map_unlock(map); + entry = start_entry; while (entry != &map->header && entry->start < end) { /* @@ -1618,50 +1779,87 @@ * what has been done. We decrement the wiring count * for those pages which have not yet been wired (now) * and unwire those that have (later). - * - * XXX this violates the locking protocol on the map, - * needs to be fixed. */ - if (rv) - entry->wired_count--; - else if (entry->wired_count == 1) { + vm_offset_t save_start = entry->start; + vm_offset_t save_end = entry->end; + + if (entry->wired_count == 1) rv = vm_fault_wire(map, entry->start, entry->end); - if (rv) { - failed = entry->start; - entry->wired_count--; + if (rv) { + CLIP_CHECK_BACK(entry, save_start); + for (;;) { + KASSERT(entry->wired_count == 1, ("wired_count changed unexpectedly")); + entry->wired_count = 0; + if (entry->end == save_end) + break; + entry = entry->next; + KASSERT(entry != &map->header, ("bad entry clip during backout")); } + end = save_start; + break; } + CLIP_CHECK_FWD(entry, save_end); entry = entry->next; } + splx(s); - if (vm_map_pmap(map) == kernel_pmap) { - vm_map_lock(map); - } - if (rv) { - vm_map_unlock(map); - (void) vm_map_pageable(map, start, failed, TRUE); - return (rv); - } /* - * An exclusive lock on the map is needed in order to call - * vm_map_simplify_entry(). If the current lock on the map - * is only a shared lock, an upgrade is needed. + * relock. start_entry is still IN_TRANSITION and must + * still exist, but may have been clipped (handled just + * below). 
*/ - if (vm_map_pmap(map) != kernel_pmap && - vm_map_lock_upgrade(map)) { - vm_map_lock(map); - if (vm_map_lookup_entry(map, start, &start_entry) == - FALSE) { - vm_map_unlock(map); - return KERN_SUCCESS; + vm_map_lock(map); + + /* + * If a failure occured undo everything by falling through + * to the unwiring code. 'end' has already been adjusted + * appropriately. + */ + if (rv) + new_pageable = 1; + + /* + * start_entry might have been clipped if we unlocked the + * map and blocked. No matter how clipped it has gotten + * there should be a fragment that is on our start boundary. + */ + CLIP_CHECK_BACK(start_entry, start); + } + + if (new_pageable) { + /* + * This is the unwiring case. We must first ensure that the + * range to be unwired is really wired down. We know there + * are no holes. + */ + entry = start_entry; + while ((entry != &map->header) && (entry->start < end)) { + if (entry->wired_count == 0) { + rv = KERN_INVALID_ARGUMENT; + goto done; } + entry = entry->next; } - vm_map_simplify_entry(map, start_entry); - } + /* + * Now decrement the wiring count for each region. If a region + * becomes completely unwired, unwire its physical pages and + * mappings. + */ + entry = start_entry; + while ((entry != &map->header) && (entry->start < end)) { + entry->wired_count--; + if (entry->wired_count == 0) + vm_fault_unwire(map, entry->start, entry->end); + entry = entry->next; + } + } +done: + vm_map_unclip_range(map, start_entry, start, real_end, + MAP_CLIP_NO_HOLES); + map->timestamp++; vm_map_unlock(map); - - return (KERN_SUCCESS); + return (rv); } /* @@ -1849,6 +2047,7 @@ * Find the start of the region, and clip it */ +again: if (!vm_map_lookup_entry(map, start, &first_entry)) entry = first_entry->next; else { @@ -1880,6 +2079,22 @@ vm_offset_t s, e; vm_pindex_t offidxstart, offidxend, count; + /* + * If we hit an in-transition entry we have to sleep and + * retry. 
It's easier (and not really slower) to just retry + * since this case occurs so rarely and the hint is already + * pointing at the right place. We have to reset the + * start offset so as not to accidently delete an entry + * another process just created in vacated space. + */ + if (entry->eflags & MAP_ENTRY_IN_TRANSITION) { + entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; + start = entry->start; + ++cnt.v_intrans_coll; + ++cnt.v_intrans_wait; + vm_map_transition_wait(map); + goto again; + } vm_map_clip_end(map, entry, end); s = entry->start; @@ -2377,21 +2592,22 @@ vm_map_t map = &vm->vm_map; vm_offset_t end; int grow_amount; - int rv; + int rv = KERN_SUCCESS; int is_procstack; + int use_read_lock = 1; + Retry: - vm_map_lock_read(map); + if (use_read_lock) + vm_map_lock_read(map); + else + vm_map_lock(map); /* If addr is already in the entry range, no need to grow.*/ - if (vm_map_lookup_entry(map, addr, &prev_entry)) { - vm_map_unlock_read(map); - return (KERN_SUCCESS); - } + if (vm_map_lookup_entry(map, addr, &prev_entry)) + goto done; - if ((stack_entry = prev_entry->next) == &map->header) { - vm_map_unlock_read(map); - return (KERN_SUCCESS); - } + if ((stack_entry = prev_entry->next) == &map->header) + goto done; if (prev_entry == &map->header) end = stack_entry->start - stack_entry->avail_ssize; else @@ -2407,15 +2623,14 @@ if (stack_entry->avail_ssize < 1 || addr >= stack_entry->start || addr < stack_entry->start - stack_entry->avail_ssize) { - vm_map_unlock_read(map); - return (KERN_SUCCESS); + goto done; } /* Find the minimum grow amount */ grow_amount = roundup (stack_entry->start - addr, PAGE_SIZE); if (grow_amount > stack_entry->avail_ssize) { - vm_map_unlock_read(map); - return (KERN_NO_SPACE); + rv = KERN_NO_SPACE; + goto done; } /* If there is no longer enough space between the entries @@ -2428,13 +2643,14 @@ * might have intended by limiting the stack size. 
*/ if (grow_amount > stack_entry->start - end) { - if (vm_map_lock_upgrade(map)) + if (use_read_lock && vm_map_lock_upgrade(map)) { + use_read_lock = 0; goto Retry; - + } + use_read_lock = 0; stack_entry->avail_ssize = stack_entry->start - end; - - vm_map_unlock(map); - return (KERN_NO_SPACE); + rv = KERN_NO_SPACE; + goto done; } is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr; @@ -2444,8 +2660,8 @@ */ if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > p->p_rlimit[RLIMIT_STACK].rlim_cur)) { - vm_map_unlock_read(map); - return (KERN_NO_SPACE); + rv = KERN_NO_SPACE; + goto done; } /* Round up the grow amount modulo SGROWSIZ */ @@ -2462,12 +2678,15 @@ /* If we would blow our VMEM resource limit, no go */ if (map->size + grow_amount > curproc->p_rlimit[RLIMIT_VMEM].rlim_cur) { - vm_map_unlock_read(map); - return (KERN_NO_SPACE); + rv = KERN_NO_SPACE; + goto done; } - if (vm_map_lock_upgrade(map)) + if (use_read_lock && vm_map_lock_upgrade(map)) { + use_read_lock = 0; goto Retry; + } + use_read_lock = 0; /* Get the preliminary new entry start value */ addr = stack_entry->start - grow_amount; @@ -2503,9 +2722,12 @@ } } - vm_map_unlock(map); +done: + if (use_read_lock) + vm_map_unlock_read(map); + else + vm_map_unlock(map); return (rv); - } /* @@ -2593,28 +2815,20 @@ vm_map_t map = *var_map; vm_prot_t prot; vm_prot_t fault_type = fault_typea; + int use_read_lock = 1; + int rv = KERN_SUCCESS; -RetryLookup:; - - /* - * Lookup the faulting address. - */ - - vm_map_lock_read(map); - -#define RETURN(why) \ - { \ - vm_map_unlock_read(map); \ - return(why); \ - } +RetryLookup: + if (use_read_lock) + vm_map_lock_read(map); + else + vm_map_lock(map); /* * If the map has an interesting hint, try it before calling full * blown lookup routine. */ - entry = map->hint; - *out_entry = entry; if ((entry == &map->header) || @@ -2625,8 +2839,10 @@ * Entry was either not a valid hint, or the vaddr was not * contained in the entry, so do a full lookup. 
*/ - if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) - RETURN(KERN_INVALID_ADDRESS); + if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) { + rv = KERN_INVALID_ADDRESS; + goto done; + } entry = tmp_entry; *out_entry = entry; @@ -2640,7 +2856,11 @@ vm_map_t old_map = map; *var_map = map = entry->object.sub_map; - vm_map_unlock_read(old_map); + if (use_read_lock) + vm_map_unlock_read(old_map); + else + vm_map_unlock(old_map); + use_read_lock = 1; goto RetryLookup; } @@ -2658,14 +2878,16 @@ fault_type &= (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE); if ((fault_type & prot) != fault_type) { - RETURN(KERN_PROTECTION_FAILURE); + rv = KERN_PROTECTION_FAILURE; + goto done; } if ((entry->eflags & MAP_ENTRY_USER_WIRED) && (entry->eflags & MAP_ENTRY_COW) && (fault_type & VM_PROT_WRITE) && (fault_typea & VM_PROT_OVERRIDE_WRITE) == 0) { - RETURN(KERN_PROTECTION_FAILURE); + rv = KERN_PROTECTION_FAILURE; + goto done; } /* @@ -2698,8 +2920,11 @@ * object. */ - if (vm_map_lock_upgrade(map)) + if (use_read_lock && vm_map_lock_upgrade(map)) { + use_read_lock = 0; goto RetryLookup; + } + use_read_lock = 0; vm_object_shadow( &entry->object.vm_object, @@ -2707,7 +2932,6 @@ atop(entry->end - entry->start)); entry->eflags &= ~MAP_ENTRY_NEEDS_COPY; - vm_map_lock_downgrade(map); } else { /* * We're attempting to read a copy-on-write page -- @@ -2723,13 +2947,14 @@ */ if (entry->object.vm_object == NULL && !map->system_map) { - if (vm_map_lock_upgrade(map)) + if (use_read_lock && vm_map_lock_upgrade(map)) { + use_read_lock = 0; goto RetryLookup; - + } + use_read_lock = 0; entry->object.vm_object = vm_object_allocate(OBJT_DEFAULT, atop(entry->end - entry->start)); entry->offset = 0; - vm_map_lock_downgrade(map); } /* @@ -2741,13 +2966,21 @@ *object = entry->object.vm_object; /* - * Return whether this is the only map sharing this data. + * Return whether this is the only map sharing this data. On + * success we return with a read lock held on the map. 
On failure + * we return with the map unlocked. */ - *out_prot = prot; - return (KERN_SUCCESS); - -#undef RETURN +done: + if (rv == KERN_SUCCESS) { + if (use_read_lock == 0) + vm_map_lock_downgrade(map); + } else if (use_read_lock) { + vm_map_unlock_read(map); + } else { + vm_map_unlock(map); + } + return (rv); } /* Index: vm/vm_map.h =================================================================== RCS file: /home/ncvs/src/sys/vm/vm_map.h,v retrieving revision 1.54.2.3 diff -u -r1.54.2.3 vm_map.h --- vm/vm_map.h 3 Nov 2001 00:59:15 -0000 1.54.2.3 +++ vm/vm_map.h 8 Sep 2002 05:58:16 -0000 @@ -128,7 +128,14 @@ #define MAP_ENTRY_BEHAV_MASK 0x00C0 +#define MAP_ENTRY_IN_TRANSITION 0x0100 /* entry being changed */ +#define MAP_ENTRY_NEEDS_WAKEUP 0x0200 /* waiter's in transition */ #define MAP_ENTRY_NOCOREDUMP 0x0400 /* don't include in a core */ + +/* + * flags for vm_map_[un]clip_range() + */ +#define MAP_CLIP_NO_HOLES 0x0001 static __inline u_char vm_map_entry_behavior(struct vm_map_entry *entry) Index: vm/vm_meter.c =================================================================== RCS file: /home/ncvs/src/sys/vm/vm_meter.c,v retrieving revision 1.34.2.6 diff -u -r1.34.2.6 vm_meter.c --- vm/vm_meter.c 14 Nov 2001 17:22:53 -0000 1.34.2.6 +++ vm/vm_meter.c 8 Sep 2002 01:30:08 -0000 @@ -71,6 +71,10 @@ CTLFLAG_RW, &cnt.v_pageout_free_min, 0, ""); SYSCTL_UINT(_vm, OID_AUTO, v_free_severe, CTLFLAG_RW, &cnt.v_free_severe, 0, ""); +SYSCTL_UINT(_vm, OID_AUTO, v_intrans_coll, + CTLFLAG_RW, &cnt.v_intrans_coll, 0, ""); +SYSCTL_UINT(_vm, OID_AUTO, v_intrans_wait, + CTLFLAG_RW, &cnt.v_intrans_wait, 0, ""); SYSCTL_STRUCT(_vm, VM_LOADAVG, loadavg, CTLFLAG_RD, &averunnable, loadavg, "Machine loadaverage history"); Index: vm/vm_zone.c =================================================================== RCS file: /home/ncvs/src/sys/vm/Attic/vm_zone.c,v retrieving revision 1.30.2.5 diff -u -r1.30.2.5 vm_zone.c --- vm/vm_zone.c 12 Aug 2002 23:39:08 -0000 1.30.2.5 +++ vm/vm_zone.c 16 
Aug 2002 21:55:56 -0000 @@ -56,12 +56,22 @@ zerror(ZONE_ERROR_INVALID); #endif - if (z->zfreecnt <= z->zfreemin) - return _zget(z); + if (z->zfreecnt <= z->zfreemin) { + item = _zget(z); + /* + * PANICFAIL allows the caller to assume that the zalloc() + * will always succeed. If it doesn't, we panic here. + */ + if (item == NULL && (z->zflags & ZONE_PANICFAIL)) + panic("zalloc(%s) failed", z->zname); + return(item); + } item = z->zitems; + /* assert before item is first dereferenced, so a NULL is reported rather than faulting */ + KASSERT(item != NULL, ("zitems unexpectedly NULL")); z->zitems = ((void **) item)[0]; #ifdef INVARIANTS if (((void **) item)[1] != (void *) ZENTRY_FREE) zerror(ZONE_ERROR_NOTFREE); ((void **) item)[1] = 0;