diff options
Diffstat (limited to 'fs/btrfs/transaction.c')
-rw-r--r-- | fs/btrfs/transaction.c | 152 |
1 files changed, 92 insertions, 60 deletions
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 6af7f2bf92de..acff6bb49a97 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -21,6 +21,7 @@ #include "qgroup.h" #include "block-group.h" #include "space-info.h" +#include "zoned.h" #define BTRFS_ROOT_TRANS_TAG 0 @@ -107,6 +108,11 @@ static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = { __TRANS_JOIN | __TRANS_JOIN_NOLOCK | __TRANS_JOIN_NOSTART), + [TRANS_STATE_SUPER_COMMITTED] = (__TRANS_START | + __TRANS_ATTACH | + __TRANS_JOIN | + __TRANS_JOIN_NOLOCK | + __TRANS_JOIN_NOSTART), [TRANS_STATE_COMPLETED] = (__TRANS_START | __TRANS_ATTACH | __TRANS_JOIN | @@ -375,6 +381,8 @@ loop: spin_lock_init(&cur_trans->dirty_bgs_lock); INIT_LIST_HEAD(&cur_trans->deleted_bgs); spin_lock_init(&cur_trans->dropped_roots_lock); + INIT_LIST_HEAD(&cur_trans->releasing_ebs); + spin_lock_init(&cur_trans->releasing_ebs_lock); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(fs_info, &cur_trans->dirty_pages, IO_TREE_TRANS_DIRTY_PAGES, fs_info->btree_inode); @@ -826,10 +834,11 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root) return trans; } -/* wait for a transaction commit to be fully complete */ -static noinline void wait_for_commit(struct btrfs_transaction *commit) +/* Wait for a transaction commit to reach at least the given state. */ +static noinline void wait_for_commit(struct btrfs_transaction *commit, + const enum btrfs_trans_state min_state) { - wait_event(commit->commit_wait, commit->state == TRANS_STATE_COMPLETED); + wait_event(commit->commit_wait, commit->state >= min_state); } int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) @@ -884,7 +893,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) goto out; /* nothing committing|committed */ } - wait_for_commit(cur_trans); + wait_for_commit(cur_trans, TRANS_STATE_COMPLETED); btrfs_put_transaction(cur_trans); out: return ret; @@ -909,9 +918,8 @@ bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans) { struct btrfs_transaction *cur_trans = trans->transaction; - smp_mb(); if (cur_trans->state >= TRANS_STATE_COMMIT_START || - cur_trans->delayed_refs.flushing) + test_bit(BTRFS_DELAYED_REFS_FLUSHING, &cur_trans->delayed_refs.flags)) return true; return should_end_transaction(trans); @@ -1230,10 +1238,6 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans) if (ret) return ret; - ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); - if (ret) - return ret; - ret = btrfs_run_dev_stats(trans); if (ret) return ret; @@ -1248,10 +1252,6 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans) if (ret) return ret; - /* run_qgroups might have added some more refs */ - ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); - if (ret) - return ret; again: while (!list_empty(&fs_info->dirty_cowonly_roots)) { struct btrfs_root *root; @@ -1266,15 +1266,24 @@ again: ret = update_cowonly_root(trans, root); if (ret) return ret; - ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); - if (ret) - return ret; } + /* Now flush any delayed refs generated by updating all of the roots */ + ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); + if (ret) + return ret; + while (!list_empty(dirty_bgs) || !list_empty(io_bgs)) { ret = btrfs_write_dirty_block_groups(trans); if (ret) return ret; + + /* + * We're writing the dirty block groups, which could generate + * delayed refs, which could generate more dirty block groups, + * so we want to keep this flushing in this loop to make sure + * everything gets run. + */ ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); if (ret) return ret; @@ -1319,7 +1328,6 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) struct btrfs_root *gang[8]; int i; int ret; - int err = 0; spin_lock(&fs_info->fs_roots_radix_lock); while (1) { @@ -1331,6 +1339,8 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) break; for (i = 0; i < ret; i++) { struct btrfs_root *root = gang[i]; + int ret2; + radix_tree_tag_clear(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid, BTRFS_ROOT_TRANS_TAG); @@ -1350,17 +1360,17 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) root->node); } - err = btrfs_update_root(trans, fs_info->tree_root, + ret2 = btrfs_update_root(trans, fs_info->tree_root, &root->root_key, &root->root_item); + if (ret2) + return ret2; spin_lock(&fs_info->fs_roots_radix_lock); - if (err) - break; btrfs_qgroup_free_meta_all_pertrans(root); } } spin_unlock(&fs_info->fs_roots_radix_lock); - return err; + return 0; } /* @@ -1433,6 +1443,23 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, record_root_in_trans(trans, src, 1); /* + * btrfs_qgroup_inherit relies on a consistent view of the usage for the + * src root, so we must run the delayed refs here. + * + * However this isn't particularly fool proof, because there's no + * synchronization keeping us from changing the tree after this point + * before we do the qgroup_inherit, or even from making changes while + * we're doing the qgroup_inherit. But that's a problem for the future, + * for now flush the delayed refs to narrow the race window where the + * qgroup counters could end up wrong. + */ + ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } + + /* * We are going to commit transaction, see btrfs_commit_transaction() * comment for reason locking tree_log_mutex */ @@ -1525,7 +1552,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ASSERT(pending->root_item); new_root_item = pending->root_item; - pending->error = btrfs_find_free_objectid(tree_root, &objectid); + pending->error = btrfs_get_free_objectid(tree_root, &objectid); if (pending->error) goto no_free_objectid; @@ -1685,12 +1712,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto fail; } - ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto fail; - } - /* * Do special qgroup accounting for snapshot, as we do some qgroup * snapshot hack to do fast snapshot. @@ -1738,12 +1759,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } } - ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto fail; - } - fail: pending->error = ret; dir_item_existed: @@ -2042,32 +2057,25 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) btrfs_trans_release_metadata(trans); trans->block_rsv = NULL; - /* make a pass through all the delayed refs we have so far - * any runnings procs may add more while we are here - */ - ret = btrfs_run_delayed_refs(trans, 0); - if (ret) { - btrfs_end_transaction(trans); - return ret; - } - - cur_trans = trans->transaction; - /* - * set the flushing flag so procs in this transaction have to - * start sending their work down. + * We only want one transaction commit doing the flushing so we do not + * waste a bunch of time on lock contention on the extent root node. */ - cur_trans->delayed_refs.flushing = 1; - smp_wmb(); + if (!test_and_set_bit(BTRFS_DELAYED_REFS_FLUSHING, + &cur_trans->delayed_refs.flags)) { + /* + * Make a pass through all the delayed refs we have so far. + * Any running threads may add more while we are here. + */ + ret = btrfs_run_delayed_refs(trans, 0); + if (ret) { + btrfs_end_transaction(trans); + return ret; + } + } btrfs_create_pending_block_groups(trans); - ret = btrfs_run_delayed_refs(trans, 0); - if (ret) { - btrfs_end_transaction(trans); - return ret; - } - if (!test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &cur_trans->flags)) { int run_it = 0; @@ -2101,11 +2109,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) spin_lock(&fs_info->trans_lock); if (cur_trans->state >= TRANS_STATE_COMMIT_START) { + enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED; + spin_unlock(&fs_info->trans_lock); refcount_inc(&cur_trans->use_count); - ret = btrfs_end_transaction(trans); - wait_for_commit(cur_trans); + if (trans->in_fsync) + want_state = TRANS_STATE_SUPER_COMMITTED; + ret = btrfs_end_transaction(trans); + wait_for_commit(cur_trans, want_state); if (TRANS_ABORTED(cur_trans)) ret = cur_trans->aborted; @@ -2119,13 +2131,19 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) wake_up(&fs_info->transaction_blocked_wait); if (cur_trans->list.prev != &fs_info->trans_list) { + enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED; + + if (trans->in_fsync) + want_state = TRANS_STATE_SUPER_COMMITTED; + prev_trans = list_entry(cur_trans->list.prev, struct btrfs_transaction, list); - if (prev_trans->state != TRANS_STATE_COMPLETED) { + if (prev_trans->state < want_state) { refcount_inc(&prev_trans->use_count); spin_unlock(&fs_info->trans_lock); - wait_for_commit(prev_trans); + wait_for_commit(prev_trans, want_state); + ret = READ_ONCE(prev_trans->aborted); btrfs_put_transaction(prev_trans); @@ -2335,6 +2353,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) goto scrub_continue; } + /* + * At this point, we should have written all the tree blocks allocated + * in this transaction. So it's now safe to free the redirtyied extent + * buffers. + */ + btrfs_free_redirty_list(cur_trans); + ret = write_all_supers(fs_info, 0); /* * the super is written, we can safely allow the tree-loggers @@ -2344,6 +2369,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (ret) goto scrub_continue; + /* + * We needn't acquire the lock here because there is no other task + * which can change it. + */ + cur_trans->state = TRANS_STATE_SUPER_COMMITTED; + wake_up(&cur_trans->commit_wait); + btrfs_finish_extent_commit(trans); if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags)) |