diff --git a/sys/vfs/hammer/hammer.h b/sys/vfs/hammer/hammer.h index 415a4b5..6c698fb 100644 --- a/sys/vfs/hammer/hammer.h +++ b/sys/vfs/hammer/hammer.h @@ -295,6 +295,7 @@ struct hammer_inode { struct hammer_btree_leaf_elm ino_leaf; /* in-memory cache */ struct hammer_inode_data ino_data; /* in-memory cache */ struct hammer_rec_rb_tree rec_tree; /* in-memory cache */ + int rec_generation; struct hammer_node_cache cache[2]; /* search initiate cache */ /* @@ -1060,8 +1061,6 @@ hammer_off_t hammer_blockmap_alloc(hammer_transaction_t trans, int zone, int bytes, int *errorp); hammer_reserve_t hammer_blockmap_reserve(hammer_mount_t hmp, int zone, int bytes, hammer_off_t *zone_offp, int *errorp); -void hammer_blockmap_reserve_undo(hammer_mount_t hmp, hammer_reserve_t resv, - hammer_off_t zone_offset, int bytes); void hammer_blockmap_reserve_complete(hammer_mount_t hmp, hammer_reserve_t resv); void hammer_reserve_clrdelay(hammer_mount_t hmp, hammer_reserve_t resv); diff --git a/sys/vfs/hammer/hammer_blockmap.c b/sys/vfs/hammer/hammer_blockmap.c index d88d11f..069fff6 100644 --- a/sys/vfs/hammer/hammer_blockmap.c +++ b/sys/vfs/hammer/hammer_blockmap.c @@ -565,19 +565,6 @@ failed: return(resv); } -#if 0 -/* - * Backend function - undo a portion of a reservation. - */ -void -hammer_blockmap_reserve_undo(hammer_mount_t hmp, hammer_reserve_t resv, - hammer_off_t zone_offset, int bytes) -{ - resv->bytes_freed += bytes; -} - -#endif - /* * Dereference a reservation structure. Upon the final release the * underlying big-block is checked and if it is entirely free we delete diff --git a/sys/vfs/hammer/hammer_btree.c b/sys/vfs/hammer/hammer_btree.c index e51f7f7..6cbfaa8 100644 --- a/sys/vfs/hammer/hammer_btree.c +++ b/sys/vfs/hammer/hammer_btree.c @@ -614,6 +614,10 @@ hammer_btree_lookup(hammer_cursor_t cursor) * Execute the logic required to start an iteration. The first record * located within the specified range is returned and iteration control * flags are adjusted for successive hammer_btree_iterate() calls. + * + * Set ATEDISK so a low-level caller can call btree_first/btree_iterate + * in a loop without worrying about it. Higher-level merged searches will + * adjust the flag appropriately. */ int hammer_btree_first(hammer_cursor_t cursor) @@ -634,6 +638,10 @@ hammer_btree_first(hammer_cursor_t cursor) * * Set ATEDISK when iterating backwards to skip the current entry, * which after an ENOENT lookup will be pointing beyond our end point. + * + * Set ATEDISK so a low-level caller can call btree_last/btree_iterate_reverse + * in a loop without worrying about it. Higher-level merged searches will + * adjust the flag appropriately. */ int hammer_btree_last(hammer_cursor_t cursor) @@ -2154,16 +2162,22 @@ btree_remove(hammer_cursor_t cursor) hammer_flush_node(node); hammer_delete_node(cursor->trans, node); } else { - kprintf("Warning: BTREE_REMOVE: Defering " - "parent removal1 @ %016llx, skipping\n", - node->node_offset); + /* + * Defer parent removal because we could not + * get the lock, just let the leaf remain + * empty. + */ + /**/ } hammer_unlock(&node->lock); hammer_rel_node(node); } else { - kprintf("Warning: BTREE_REMOVE: Defering parent " - "removal2 @ %016llx, skipping\n", - node->node_offset); + /* + * Defer parent removal because we could not + * get the lock, just let the leaf remain + * empty. + */ + /**/ } } else { KKASSERT(parent->ondisk->count > 1); diff --git a/sys/vfs/hammer/hammer_cursor.h b/sys/vfs/hammer/hammer_cursor.h index 332fa4a..20e9721 100644 --- a/sys/vfs/hammer/hammer_cursor.h +++ b/sys/vfs/hammer/hammer_cursor.h @@ -107,6 +107,7 @@ struct hammer_cursor { * Iteration and extraction control variables */ int flags; + int rec_generation; /* * Merged in-memory/on-disk iterations also use these fields. @@ -138,6 +139,7 @@ typedef struct hammer_cursor *hammer_cursor_t; #define HAMMER_CURSOR_REBLOCKING 0x00020000 #define HAMMER_CURSOR_TRACKED 0x00040000 #define HAMMER_CURSOR_TRACKED_RIPOUT 0x00080000 +#define HAMMER_CURSOR_LASTWASMEM 0x00100000 /* hammer_ip_next logic */ /* * Flags we can clear when reusing a cursor (we can clear all of them) diff --git a/sys/vfs/hammer/hammer_inode.c b/sys/vfs/hammer/hammer_inode.c index 73c75de..3998b39 100644 --- a/sys/vfs/hammer/hammer_inode.c +++ b/sys/vfs/hammer/hammer_inode.c @@ -811,6 +811,9 @@ retry: cursor.asof = HAMMER_MAX_TID; cursor.flags |= HAMMER_CURSOR_ASOF; + /* + * Replace any in-memory version of the record. + */ error = hammer_ip_lookup(&cursor); if (error == 0 && hammer_cursor_inmem(&cursor)) { record = cursor.iprec; @@ -824,6 +827,11 @@ retry: error = 0; } } + + /* + * Allocate replacement general record. The backend flush will + * delete any on-disk version of the record. + */ if (error == 0 || error == ENOENT) { record = hammer_alloc_mem_record(ip, sizeof(pfsm->pfsd)); record->type = HAMMER_MEM_RECORD_GENERAL; @@ -1046,12 +1054,13 @@ retry: } /* - * The record isn't managed by the inode's record tree, - * destroy it whether we succeed or fail. + * Note: The record was never on the inode's record tree + * so just wave our hands importantly and destroy it. */ + record->flags |= HAMMER_RECF_COMMITTED; record->flags &= ~HAMMER_RECF_INTERLOCK_BE; - record->flags |= HAMMER_RECF_DELETED_FE | HAMMER_RECF_COMMITTED; record->flush_state = HAMMER_FST_IDLE; + ++ip->rec_generation; hammer_rel_mem_record(record); /* @@ -1278,8 +1287,9 @@ hammer_destroy_inode_callback(struct hammer_inode *ip, void *data __unused) KKASSERT(rec->lock.refs == 1); rec->flush_state = HAMMER_FST_IDLE; rec->flush_group = NULL; - rec->flags |= HAMMER_RECF_DELETED_FE; - rec->flags |= HAMMER_RECF_DELETED_BE; + rec->flags |= HAMMER_RECF_DELETED_FE; /* wave hands */ + rec->flags |= HAMMER_RECF_DELETED_BE; /* wave hands */ + ++ip->rec_generation; hammer_rel_mem_record(rec); } ip->flags &= ~HAMMER_INODE_MODMASK; @@ -1836,16 +1846,20 @@ hammer_setup_child_callback(hammer_record_t rec, void *data) int r; /* - * Deleted records are ignored. Note that the flush detects deleted - * front-end records at multiple points to deal with races. This is - * just the first line of defense. The only time DELETED_FE cannot - * be set is when HAMMER_RECF_INTERLOCK_BE is set. + * Records deleted or committed by the backend are ignored. + * Note that the flush detects deleted frontend records at + * multiple points to deal with races. This is just the first + * line of defense. The only time HAMMER_RECF_DELETED_FE cannot + * be set is when HAMMER_RECF_INTERLOCK_BE is set, because it + * messes up link-count calculations. * - * Don't get confused between record deletion and, say, directory - * entry deletion. The deletion of a directory entry that is on - * the media has nothing to do with the record deletion flags. + * NOTE: Don't get confused between record deletion and, say, + * directory entry deletion. The deletion of a directory entry + * which is on-media has nothing to do with the record deletion + * flags. */ - if (rec->flags & (HAMMER_RECF_DELETED_FE|HAMMER_RECF_DELETED_BE)) { + if (rec->flags & (HAMMER_RECF_DELETED_FE | HAMMER_RECF_DELETED_BE | + HAMMER_RECF_COMMITTED)) { if (rec->flush_state == HAMMER_FST_FLUSH) { KKASSERT(rec->flush_group == rec->ip->flush_group); r = 1; @@ -2230,9 +2244,9 @@ hammer_sync_record_callback(hammer_record_t record, void *data) record->flags |= HAMMER_RECF_INTERLOCK_BE; /* - * The backend may have already disposed of the record. + * The backend has already disposed of the record. */ - if (record->flags & HAMMER_RECF_DELETED_BE) { + if (record->flags & (HAMMER_RECF_DELETED_BE | HAMMER_RECF_COMMITTED)) { error = 0; goto done; } @@ -2256,8 +2270,13 @@ hammer_sync_record_callback(hammer_record_t record, void *data) */ /* fall through */ case HAMMER_MEM_RECORD_GENERAL: - record->flags |= HAMMER_RECF_DELETED_FE; + /* + * Set deleted-by-backend flag. Do not set the + * backend committed flag, because we are throwing + * the record away. + */ record->flags |= HAMMER_RECF_DELETED_BE; + ++record->ip->rec_generation; error = 0; goto done; case HAMMER_MEM_RECORD_ADD: @@ -2295,10 +2314,19 @@ hammer_sync_record_callback(hammer_record_t record, void *data) */ if (record->flags & HAMMER_RECF_DELETED_FE) { if (record->type == HAMMER_MEM_RECORD_ADD) { + /* + * Convert a front-end deleted directory-add to + * a directory-delete entry later. + */ record->flags |= HAMMER_RECF_CONVERT_DELETE; } else { + /* + * Dispose of the record (race case). Mark as + * deleted by backend (and not committed). + */ KKASSERT(record->type != HAMMER_MEM_RECORD_DEL); record->flags |= HAMMER_RECF_DELETED_BE; + ++record->ip->rec_generation; error = 0; goto done; } @@ -2308,9 +2336,10 @@ hammer_sync_record_callback(hammer_record_t record, void *data) * Assign the create_tid for new records. Deletions already * have the record's entire key properly set up. */ - if (record->type != HAMMER_MEM_RECORD_DEL) + if (record->type != HAMMER_MEM_RECORD_DEL) { record->leaf.base.create_tid = trans->tid; record->leaf.create_ts = trans->time32; + } for (;;) { error = hammer_ip_sync_record_cursor(cursor, record); if (error != EDEADLK) @@ -2402,7 +2431,10 @@ hammer_sync_inode(hammer_transaction_t trans, hammer_inode_t ip) } } else if ((depend->flags & HAMMER_RECF_DELETED_FE) == 0) { /* - * Not part of our flush group + * Not part of our flush group and not deleted by + * the front-end, adjust the link count synced to + * the media (undo what the frontend did when it + * queued the record). */ KKASSERT((depend->flags & HAMMER_RECF_DELETED_BE) == 0); switch(depend->type) { @@ -2606,8 +2638,8 @@ defer_buffer_flush: hammer_record_t record = RB_ROOT(&ip->rec_tree); hammer_ref(&record->lock); KKASSERT(record->lock.refs == 1); - record->flags |= HAMMER_RECF_DELETED_FE; record->flags |= HAMMER_RECF_DELETED_BE; + ++record->ip->rec_generation; hammer_rel_mem_record(record); } break; diff --git a/sys/vfs/hammer/hammer_io.c b/sys/vfs/hammer/hammer_io.c index 3fe2fea..bcd6223 100644 --- a/sys/vfs/hammer/hammer_io.c +++ b/sys/vfs/hammer/hammer_io.c @@ -1252,7 +1252,8 @@ hammer_io_direct_write(hammer_mount_t hmp, hammer_record_t record, KKASSERT(error == 0); } else { /* - * Major suckage occured. + * Major suckage occured. Also note: The record was never added + * to the tree so we do not have to worry about the backend. */ kprintf("hammer_direct_write: failed @ %016llx\n", leaf->data_offset); diff --git a/sys/vfs/hammer/hammer_object.c b/sys/vfs/hammer/hammer_object.c index aca7a29..4d89485 100644 --- a/sys/vfs/hammer/hammer_object.c +++ b/sys/vfs/hammer/hammer_object.c @@ -37,7 +37,7 @@ #include "hammer.h" static int hammer_mem_lookup(hammer_cursor_t cursor); -static int hammer_mem_first(hammer_cursor_t cursor); +static void hammer_mem_first(hammer_cursor_t cursor); static int hammer_frontend_trunc_callback(hammer_record_t record, void *data __unused); static int hammer_bulk_scan_callback(hammer_record_t record, void *data); @@ -72,7 +72,10 @@ hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2) return(1); /* - * Never match against an item deleted by the front-end. + * For search & insertion purposes records deleted by the + * frontend or deleted/committed by the backend are silently + * ignored. Otherwise pipelined insertions will get messed + * up. * * rec1 is greater then rec2 if rec1 is marked deleted. * rec1 is less then rec2 if rec2 is marked deleted. @@ -80,10 +83,14 @@ hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2) * Multiple deleted records may be present, do not return 0 * if both are marked deleted. */ - if (rec1->flags & HAMMER_RECF_DELETED_FE) + if (rec1->flags & (HAMMER_RECF_DELETED_FE | HAMMER_RECF_DELETED_BE | + HAMMER_RECF_COMMITTED)) { return(1); - if (rec2->flags & HAMMER_RECF_DELETED_FE) + } + if (rec2->flags & (HAMMER_RECF_DELETED_FE | HAMMER_RECF_DELETED_BE | + HAMMER_RECF_COMMITTED)) { return(-1); + } return(0); } @@ -105,11 +112,15 @@ hammer_rec_cmp(hammer_base_elm_t elm, hammer_record_t rec) return(2); /* - * Never match against an item deleted by the front-end. + * Never match against an item deleted by the frontend + * or backend, or committed by the backend. + * * elm is less then rec if rec is marked deleted. */ - if (rec->flags & HAMMER_RECF_DELETED_FE) + if (rec->flags & (HAMMER_RECF_DELETED_FE | HAMMER_RECF_DELETED_BE | + HAMMER_RECF_COMMITTED)) { return(-1); + } return(0); } @@ -295,11 +306,11 @@ hammer_flush_record_done(hammer_record_t record, int error) KKASSERT(record->flush_state == HAMMER_FST_FLUSH); KKASSERT(record->flags & HAMMER_RECF_INTERLOCK_BE); + /* + * If an error occured, the backend was unable to sync the + * record to its media. Leave the record intact. + */ if (error) { - /* - * An error occured, the backend was unable to sync the - * record to its media. Leave the record intact. - */ hammer_critical_error(record->ip->hmp, record->ip, error, "while flushing record"); } @@ -307,7 +318,11 @@ hammer_flush_record_done(hammer_record_t record, int error) --record->flush_group->refs; record->flush_group = NULL; - if (record->flags & HAMMER_RECF_DELETED_BE) { + /* + * Adjust the flush state and dependancy based on success or + * failure. + */ + if (record->flags & (HAMMER_RECF_DELETED_BE | HAMMER_RECF_COMMITTED)) { if ((target_ip = record->target_ip) != NULL) { TAILQ_REMOVE(&target_ip->target_list, record, target_entry); @@ -325,6 +340,10 @@ hammer_flush_record_done(hammer_record_t record, int error) } } record->flags &= ~HAMMER_RECF_INTERLOCK_BE; + + /* + * Cleanup + */ if (record->flags & HAMMER_RECF_WANTED) { record->flags &= ~HAMMER_RECF_WANTED; wakeup(record); @@ -361,9 +380,12 @@ hammer_rel_mem_record(struct hammer_record *record) /* * Upon release of the last reference a record marked deleted + * by the front or backend, or committed by the backend, * is destroyed. */ - if (record->flags & HAMMER_RECF_DELETED_FE) { + if (record->flags & (HAMMER_RECF_DELETED_FE | + HAMMER_RECF_DELETED_BE | + HAMMER_RECF_COMMITTED)) { KKASSERT(ip->lock.refs > 0); KKASSERT(record->flush_state != HAMMER_FST_FLUSH); @@ -422,19 +444,16 @@ hammer_rel_mem_record(struct hammer_record *record) } /* - * Release the reservation. If the record was not - * committed return the reservation before - * releasing it. + * Release the reservation. + * + * If the record was not committed we can theoretically + * undo the reservation. However, doing so might + * create weird edge cases with the ordering of + * direct writes because the related buffer cache + * elements are per-vnode. So we don't try. */ if ((resv = record->resv) != NULL) { -#if 0 - if ((record->flags & HAMMER_RECF_COMMITTED) == 0) { - hammer_blockmap_reserve_undo( - hmp, resv, - record->leaf.data_offset, - record->leaf.data_len); - } -#endif + /* XXX undo leaf.data_offset,leaf.data_len */ hammer_blockmap_reserve_complete(hmp, resv); record->resv = NULL; } @@ -447,10 +466,14 @@ hammer_rel_mem_record(struct hammer_record *record) /* * Record visibility depends on whether the record is being accessed by - * the backend or the frontend. + * the backend or the frontend. Backend tests ignore the frontend delete + * flag. Frontend tests do NOT ignore the backend delete/commit flags and + * must also check for commit races. * * Return non-zero if the record is visible, zero if it isn't or if it is - * deleted. + * deleted. Returns 0 if the record has been comitted (unless the special + * delete-visibility flag is set). A committed record must be located + * via the media B-Tree. Returns non-zero if the record is good. * * If HAMMER_CURSOR_DELETE_VISIBILITY is set we allow deleted memory * records to be returned. This is so pending deletions are detected @@ -464,11 +487,16 @@ hammer_ip_iterate_mem_good(hammer_cursor_t cursor, hammer_record_t record) if (cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) return(1); if (cursor->flags & HAMMER_CURSOR_BACKEND) { - if (record->flags & HAMMER_RECF_DELETED_BE) + if (record->flags & (HAMMER_RECF_DELETED_BE | + HAMMER_RECF_COMMITTED)) { return(0); + } } else { - if (record->flags & HAMMER_RECF_DELETED_FE) + if (record->flags & (HAMMER_RECF_DELETED_FE | + HAMMER_RECF_DELETED_BE | + HAMMER_RECF_COMMITTED)) { return(0); + } } return(1); } @@ -494,7 +522,7 @@ hammer_rec_scan_callback(hammer_record_t rec, void *data) KKASSERT(cursor->iprec == NULL); /* - * Skip if the record was marked deleted. + * Skip if the record was marked deleted or committed. */ if (hammer_ip_iterate_mem_good(cursor, rec) == 0) return(0); @@ -518,7 +546,8 @@ hammer_rec_scan_callback(hammer_record_t rec, void *data) hammer_ref(&rec->lock); /* - * The record may have been deleted while we were blocked. + * The record may have been deleted or committed while we + * were blocked. XXX remove? */ if (hammer_ip_iterate_mem_good(cursor, rec) == 0) { hammer_rel_mem_record(rec); @@ -539,13 +568,13 @@ hammer_rec_scan_callback(hammer_record_t rec, void *data) * record list. * * The lookup must fail if the record is marked for deferred deletion. + * + * The API for mem/btree_lookup() does not mess with the ATE/EOF bits. */ static int hammer_mem_lookup(hammer_cursor_t cursor) { - int error; - KKASSERT(cursor->ip); if (cursor->iprec) { hammer_rel_mem_record(cursor->iprec); @@ -554,19 +583,18 @@ hammer_mem_lookup(hammer_cursor_t cursor) hammer_rec_rb_tree_RB_SCAN(&cursor->ip->rec_tree, hammer_rec_find_cmp, hammer_rec_scan_callback, cursor); - if (cursor->iprec == NULL) - error = ENOENT; - else - error = 0; - return(error); + return (cursor->iprec ? 0 : ENOENT); } /* * hammer_mem_first() - locate the first in-memory record matching the * cursor within the bounds of the key range. + * + * WARNING! API is slightly different from btree_first(). hammer_mem_first() + * will set ATEMEM the same as MEMEOF, and does not return any error. */ static -int +void hammer_mem_first(hammer_cursor_t cursor) { hammer_inode_t ip; @@ -578,17 +606,13 @@ hammer_mem_first(hammer_cursor_t cursor) hammer_rel_mem_record(cursor->iprec); cursor->iprec = NULL; } - hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_scan_cmp, hammer_rec_scan_callback, cursor); - /* - * Adjust scan.node and keep it linked into the RB-tree so we can - * hold the cursor through third party modifications of the RB-tree. - */ if (cursor->iprec) - return(0); - return(ENOENT); + cursor->flags &= ~(HAMMER_CURSOR_MEMEOF | HAMMER_CURSOR_ATEMEM); + else + cursor->flags |= HAMMER_CURSOR_MEMEOF | HAMMER_CURSOR_ATEMEM; } /************************************************************************ @@ -719,15 +743,18 @@ hammer_ip_del_directory(struct hammer_transaction *trans, if (hammer_cursor_inmem(cursor)) { /* * In-memory (unsynchronized) records can simply be freed. + * * Even though the HAMMER_RECF_DELETED_FE flag is ignored * by the backend, we must still avoid races against the - * backend potentially syncing the record to the media. + * backend potentially syncing the record to the media. * * We cannot call hammer_ip_delete_record(), that routine may * only be called from the backend. */ record = cursor->iprec; - if (record->flags & HAMMER_RECF_INTERLOCK_BE) { + if (record->flags & (HAMMER_RECF_INTERLOCK_BE | + HAMMER_RECF_DELETED_BE | + HAMMER_RECF_COMMITTED)) { KKASSERT(cursor->deadlk_rec == NULL); hammer_ref(&record->lock); cursor->deadlk_rec = record; @@ -867,8 +894,10 @@ hammer_bulk_scan_callback(hammer_record_t record, void *data) { struct hammer_bulk_info *info = data; - if (record->flags & HAMMER_RECF_DELETED_FE) + if (record->flags & (HAMMER_RECF_DELETED_FE | HAMMER_RECF_DELETED_BE | + HAMMER_RECF_COMMITTED)) { return(0); + } hammer_ref(&record->lock); info->record = record; return(-1); /* stop scan */ @@ -1112,9 +1141,9 @@ hammer_ip_sync_record_cursor(hammer_cursor_t cursor, hammer_record_t record) error = hammer_ip_delete_record(cursor, record->ip, trans->tid); if (error == 0) { - record->flags |= HAMMER_RECF_DELETED_FE; - record->flags |= HAMMER_RECF_DELETED_BE; - record->flags |= HAMMER_RECF_COMMITTED; + record->flags |= HAMMER_RECF_DELETED_BE | + HAMMER_RECF_COMMITTED; + ++record->ip->rec_generation; } } goto done; @@ -1194,11 +1223,13 @@ hammer_ip_sync_record_cursor(hammer_cursor_t cursor, hammer_record_t record) kprintf("BTREE INSERT error %d @ %016llx:%d key %016llx\n", error, cursor->node->node_offset, cursor->index, record->leaf.base.key); /* - * Our record is on-disk, normally mark the in-memory version as - * deleted. If the record represented a directory deletion but - * we had to sync a valid directory entry to disk we must convert - * the record to a covering delete so the frontend does not have - * visibility on the synced entry. + * Our record is on-disk and we normally mark the in-memory version + * as having been committed (and not BE-deleted). + * + * If the record represented a directory deletion but we had to + * sync a valid directory entry to disk due to dependancies, + * we must convert the record to a covering delete so the + * frontend does not have visibility on the synced entry. */ if (error == 0) { if (doprop) { @@ -1207,17 +1238,27 @@ hammer_ip_sync_record_cursor(hammer_cursor_t cursor, hammer_record_t record) &record->leaf); } if (record->flags & HAMMER_RECF_CONVERT_DELETE) { + /* + * Must convert deleted directory entry add + * to a directory entry delete. + */ KKASSERT(record->type == HAMMER_MEM_RECORD_ADD); record->flags &= ~HAMMER_RECF_DELETED_FE; record->type = HAMMER_MEM_RECORD_DEL; KKASSERT(record->flush_state == HAMMER_FST_FLUSH); record->flags &= ~HAMMER_RECF_CONVERT_DELETE; + KKASSERT((record->flags & (HAMMER_RECF_COMMITTED | + HAMMER_RECF_DELETED_BE)) == 0); + /* converted record is not yet committed */ /* hammer_flush_record_done takes care of the rest */ } else { - record->flags |= HAMMER_RECF_DELETED_FE; - record->flags |= HAMMER_RECF_DELETED_BE; + /* + * Everything went fine and we are now done with + * this record. + */ + record->flags |= HAMMER_RECF_COMMITTED; + ++record->ip->rec_generation; } - record->flags |= HAMMER_RECF_COMMITTED; } else { if (record->leaf.data_offset) { hammer_blockmap_free(trans, record->leaf.data_offset, @@ -1316,6 +1357,101 @@ hammer_ip_lookup(hammer_cursor_t cursor) } /* + * Helper for hammer_ip_first()/hammer_ip_next() + * + * NOTE: Both ATEDISK and DISKEOF will be set the same. This sets up + * hammer_ip_first() for calling hammer_ip_next(), and sets up the re-seek + * state if hammer_ip_next() needs to re-seek. + */ +static __inline +int +_hammer_ip_seek_btree(hammer_cursor_t cursor) +{ + hammer_inode_t ip = cursor->ip; + int error; + + if (ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) { + error = hammer_btree_lookup(cursor); + if (error == ENOENT || error == EDEADLK) { + if (hammer_debug_general & 0x2000) + kprintf("error %d node %p %016llx index %d\n", error, cursor->node, cursor->node->node_offset, cursor->index); + cursor->flags &= ~HAMMER_CURSOR_ATEDISK; + error = hammer_btree_iterate(cursor); + } + if (error == 0) { + cursor->flags &= ~(HAMMER_CURSOR_DISKEOF | + HAMMER_CURSOR_ATEDISK); + } else { + cursor->flags |= HAMMER_CURSOR_DISKEOF | + HAMMER_CURSOR_ATEDISK; + if (error == ENOENT) + error = 0; + } + } else { + cursor->flags |= HAMMER_CURSOR_DISKEOF | HAMMER_CURSOR_ATEDISK; + error = 0; + } + return(error); +} + +/* + * Helper for hammer_ip_next() + * + * The caller has determined that the media cursor is further along than the + * memory cursor and must be reseeked after a generation number change. + */ +static +int +_hammer_ip_reseek(hammer_cursor_t cursor) +{ + struct hammer_base_elm save; + hammer_btree_elm_t elm; + int error; + int r; + int again = 0; + + /* + * Do the re-seek. + */ + kprintf("HAMMER: Debug: re-seeked during scan @ino=%016llx\n", + (long long)cursor->ip->obj_id); + save = cursor->key_beg; + cursor->key_beg = cursor->iprec->leaf.base; + error = _hammer_ip_seek_btree(cursor); + KKASSERT(error == 0); + cursor->key_beg = save; + + /* + * If the memory record was previous returned to + * the caller and the media record matches + * (-1/+1: only create_tid differs), then iterate + * the media record to avoid a double result. + */ + if ((cursor->flags & HAMMER_CURSOR_ATEDISK) == 0 && + (cursor->flags & HAMMER_CURSOR_LASTWASMEM)) { + elm = &cursor->node->ondisk->elms[cursor->index]; + r = hammer_btree_cmp(&elm->base, + &cursor->iprec->leaf.base); + if (cursor->flags & HAMMER_CURSOR_ASOF) { + if (r >= -1 && r <= 1) { + kprintf("HAMMER: Debug: iterated after " + "re-seek (asof r=%d)\n", r); + cursor->flags |= HAMMER_CURSOR_ATEDISK; + again = 1; + } + } else { + if (r == 0) { + kprintf("HAMMER: Debug: iterated after " + "re-seek\n"); + cursor->flags |= HAMMER_CURSOR_ATEDISK; + again = 1; + } + } + } + return(again); +} + +/* * Locate the first record within the cursor's key_beg/key_end range, * restricted to a particular inode. 0 is returned on success, ENOENT * if no records matched the requested range, or some other error. @@ -1326,6 +1462,7 @@ hammer_ip_lookup(hammer_cursor_t cursor) * This function can return EDEADLK, requiring the caller to terminate * the cursor and try again. */ + int hammer_ip_first(hammer_cursor_t cursor) { @@ -1338,62 +1475,30 @@ hammer_ip_first(hammer_cursor_t cursor) * Clean up fields and setup for merged scan */ cursor->flags &= ~HAMMER_CURSOR_RETEST; - cursor->flags |= HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM; - cursor->flags |= HAMMER_CURSOR_DISKEOF | HAMMER_CURSOR_MEMEOF; - if (cursor->iprec) { - hammer_rel_mem_record(cursor->iprec); - cursor->iprec = NULL; - } /* - * Search the on-disk B-Tree. hammer_btree_lookup() only does an - * exact lookup so if we get ENOENT we have to call the iterate - * function to validate the first record after the begin key. - * - * The ATEDISK flag is used by hammer_btree_iterate to determine - * whether it must index forwards or not. It is also used here - * to select the next record from in-memory or on-disk. + * Search the in-memory record list (Red-Black tree). Unlike the + * B-Tree search, mem_first checks for records in the range. * - * EDEADLK can only occur if the lookup hit an empty internal - * element and couldn't delete it. Since this could only occur - * in-range, we can just iterate from the failure point. + * This function will setup both ATEMEM and MEMEOF properly for + * the ip iteration. ATEMEM will be set if MEMEOF is set. */ - if (ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) { - error = hammer_btree_lookup(cursor); - if (error == ENOENT || error == EDEADLK) { - cursor->flags &= ~HAMMER_CURSOR_ATEDISK; - if (hammer_debug_general & 0x2000) - kprintf("error %d node %p %016llx index %d\n", error, cursor->node, cursor->node->node_offset, cursor->index); - error = hammer_btree_iterate(cursor); - } - if (error && error != ENOENT) - return(error); - if (error == 0) { - cursor->flags &= ~HAMMER_CURSOR_DISKEOF; - cursor->flags &= ~HAMMER_CURSOR_ATEDISK; - } else { - cursor->flags |= HAMMER_CURSOR_ATEDISK; - } - } + hammer_mem_first(cursor); /* - * Search the in-memory record list (Red-Black tree). Unlike the - * B-Tree search, mem_first checks for records in the range. + * Detect generation changes during blockages, including + * blockages which occur on the initial btree search. */ - error = hammer_mem_first(cursor); - if (error && error != ENOENT) - return(error); - if (error == 0) { - cursor->flags &= ~HAMMER_CURSOR_MEMEOF; - cursor->flags &= ~HAMMER_CURSOR_ATEMEM; - if (hammer_ip_iterate_mem_good(cursor, cursor->iprec) == 0) - cursor->flags |= HAMMER_CURSOR_ATEMEM; - } + cursor->rec_generation = cursor->ip->rec_generation; /* - * This will return the first matching record. + * Initial search and result */ - return(hammer_ip_next(cursor)); + error = _hammer_ip_seek_btree(cursor); + if (error == 0) + error = hammer_ip_next(cursor); + + return (error); } /* @@ -1401,6 +1506,9 @@ hammer_ip_first(hammer_cursor_t cursor) * cursor. This call may be made multiple times after the cursor has been * initially searched with hammer_ip_first(). * + * There are numerous special cases in this code to deal with races between + * in-memory records and on-media records. + * * 0 is returned on success, ENOENT if no further records match the * requested range, or some other error code is returned. */ @@ -1408,51 +1516,97 @@ int hammer_ip_next(hammer_cursor_t cursor) { hammer_btree_elm_t elm; - hammer_record_t rec, save; + hammer_record_t rec; + hammer_record_t tmprec; int error; int r; -next_btree: +again: /* - * Load the current on-disk and in-memory record. If we ate any - * records we have to get the next one. - * - * If we deleted the last on-disk record we had scanned ATEDISK will - * be clear and RETEST will be set, forcing a call to iterate. The - * fact that ATEDISK is clear causes iterate to re-test the 'current' - * element. If ATEDISK is set, iterate will skip the 'current' - * element. - * * Get the next on-disk record + * + * NOTE: If we deleted the last on-disk record we had scanned + * ATEDISK will be clear and RETEST will be set, forcing + * a call to iterate. The fact that ATEDISK is clear causes + * iterate to re-test the 'current' element. If ATEDISK is + * set, iterate will skip the 'current' element. */ - if (cursor->flags & (HAMMER_CURSOR_ATEDISK|HAMMER_CURSOR_RETEST)) { - if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) { + error = 0; + if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) { + if (cursor->flags & (HAMMER_CURSOR_ATEDISK | + HAMMER_CURSOR_RETEST)) { error = hammer_btree_iterate(cursor); cursor->flags &= ~HAMMER_CURSOR_RETEST; if (error == 0) { cursor->flags &= ~HAMMER_CURSOR_ATEDISK; hammer_cache_node(&cursor->ip->cache[1], cursor->node); - } else { + } else if (error == ENOENT) { cursor->flags |= HAMMER_CURSOR_DISKEOF | HAMMER_CURSOR_ATEDISK; + error = 0; } } } -next_memory: /* - * Get the next in-memory record. + * If the generation changed the backend has deleted or committed + * one or more memory records since our last check. + * + * When this case occurs if the disk cursor is > current memory record + * or the disk cursor is at EOF, we must re-seek the disk-cursor. + * Since the cursor is ahead it must have not yet been eaten (if + * not at eof anyway). (XXX data offset case?) + * + * NOTE: we are not doing a full check here. That will be handled + * later on. + * + * If we have exhausted all memory records we do not have to do any + * further seeks. + */ + while (cursor->rec_generation != cursor->ip->rec_generation && + error == 0 + ) { + kprintf("HAMMER: Debug: generation changed during scan @ino=%016llx\n", (long long)cursor->ip->obj_id); + cursor->rec_generation = cursor->ip->rec_generation; + if (cursor->flags & HAMMER_CURSOR_MEMEOF) + break; + if (cursor->flags & HAMMER_CURSOR_DISKEOF) { + r = 1; + } else { + KKASSERT((cursor->flags & HAMMER_CURSOR_ATEDISK) == 0); + elm = &cursor->node->ondisk->elms[cursor->index]; + r = hammer_btree_cmp(&elm->base, + &cursor->iprec->leaf.base); + } + + /* + * Do we re-seek the media cursor? + */ + if (r > 0) { + if (_hammer_ip_reseek(cursor)) + goto again; + } + } + + /* + * We can now safely get the next in-memory record. We cannot + * block here. * * hammer_rec_scan_cmp: Is the record still in our general range, * (non-inclusive of snapshot exclusions)? * hammer_rec_scan_callback: Is the record in our snapshot? */ - if (cursor->flags & HAMMER_CURSOR_ATEMEM) { - if ((cursor->flags & HAMMER_CURSOR_MEMEOF) == 0) { - save = cursor->iprec; + tmprec = NULL; + if ((cursor->flags & HAMMER_CURSOR_MEMEOF) == 0) { + /* + * If the current memory record was eaten then get the next + * one. Stale records are skipped. + */ + if (cursor->flags & HAMMER_CURSOR_ATEMEM) { + tmprec = cursor->iprec; cursor->iprec = NULL; - rec = save ? hammer_rec_rb_tree_RB_NEXT(save) : NULL; + rec = hammer_rec_rb_tree_RB_NEXT(tmprec); while (rec) { if (hammer_rec_scan_cmp(rec, cursor) != 0) break; @@ -1460,28 +1614,39 @@ next_memory: break; rec = hammer_rec_rb_tree_RB_NEXT(rec); } - if (save) - hammer_rel_mem_record(save); if (cursor->iprec) { KKASSERT(cursor->iprec == rec); cursor->flags &= ~HAMMER_CURSOR_ATEMEM; } else { cursor->flags |= HAMMER_CURSOR_MEMEOF; } + cursor->flags &= ~HAMMER_CURSOR_LASTWASMEM; } } /* - * The memory record may have become stale while being held in - * cursor->iprec. We are interlocked against the backend on - * with regards to B-Tree entries. + * MEMORY RECORD VALIDITY TEST + * + * (We still can't block, which is why tmprec is being held so + * long). + * + * If the memory record is no longer valid we skip it. It may + * have been deleted by the frontend. If it was deleted or + * committed by the backend the generation change re-seeked the + * disk cursor and the record will be present there. */ - if ((cursor->flags & HAMMER_CURSOR_ATEMEM) == 0) { - if (hammer_ip_iterate_mem_good(cursor, cursor->iprec) == 0) { + if (error == 0 && (cursor->flags & HAMMER_CURSOR_MEMEOF) == 0) { + KKASSERT(cursor->iprec); + KKASSERT((cursor->flags & HAMMER_CURSOR_ATEMEM) == 0); + if (!hammer_ip_iterate_mem_good(cursor, cursor->iprec)) { cursor->flags |= HAMMER_CURSOR_ATEMEM; - goto next_memory; + if (tmprec) + hammer_rel_mem_record(tmprec); + goto again; } } + if (tmprec) + hammer_rel_mem_record(tmprec); /* * Extract either the disk or memory record depending on their @@ -1518,6 +1683,7 @@ next_memory: error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_LEAF); cursor->flags |= HAMMER_CURSOR_ATEDISK; + cursor->flags &= ~HAMMER_CURSOR_LASTWASMEM; break; } @@ -1544,7 +1710,7 @@ next_memory: if ((cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0) { cursor->flags |= HAMMER_CURSOR_ATEDISK; cursor->flags |= HAMMER_CURSOR_ATEMEM; - goto next_btree; + goto again; } } else if (cursor->iprec->type == HAMMER_MEM_RECORD_DATA) { if ((cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0) { @@ -1554,7 +1720,7 @@ next_memory: } else { panic("hammer_ip_next: duplicate mem/b-tree entry %p %d %08x", cursor->iprec, cursor->iprec->type, cursor->iprec->flags); cursor->flags |= HAMMER_CURSOR_ATEMEM; - goto next_memory; + goto again; } } /* fall through to the memory entry */ @@ -1564,6 +1730,7 @@ next_memory: */ cursor->leaf = &cursor->iprec->leaf; cursor->flags |= HAMMER_CURSOR_ATEMEM; + cursor->flags |= HAMMER_CURSOR_LASTWASMEM; /* * If the memory entry is an on-disk deletion we should have @@ -1582,6 +1749,7 @@ next_memory: */ error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_LEAF); cursor->flags |= HAMMER_CURSOR_ATEDISK; + cursor->flags &= ~HAMMER_CURSOR_LASTWASMEM; break; default: /* @@ -1589,6 +1757,7 @@ next_memory: * * XXX error not set properly */ + cursor->flags &= ~HAMMER_CURSOR_LASTWASMEM; cursor->leaf = NULL; error = ENOENT; break; @@ -1964,6 +2133,8 @@ hammer_ip_delete_record(hammer_cursor_t cursor, hammer_inode_t ip, KKASSERT((iprec->flags & HAMMER_RECF_INTERLOCK_BE) ==0); iprec->flags |= HAMMER_RECF_DELETED_FE; iprec->flags |= HAMMER_RECF_DELETED_BE; + KKASSERT(iprec->ip == ip); + ++ip->rec_generation; return(0); } diff --git a/sys/vfs/hammer/hammer_vnops.c b/sys/vfs/hammer/hammer_vnops.c index c2abe5f..2aa6790 100644 --- a/sys/vfs/hammer/hammer_vnops.c +++ b/sys/vfs/hammer/hammer_vnops.c @@ -1298,6 +1298,10 @@ hammer_vop_readdir(struct vop_readdir_args *ap) /* * Handle artificial entries + * + * It should be noted that the minimum value for a directory + * hash key on-media is 0x0000000100000000, so we can use anything + * less then that to represent our 'special' key space. */ error = 0; if (saveoff == 0) {