diff options
Diffstat (limited to 'fs/btrfs/block-group.c')
-rw-r--r-- | fs/btrfs/block-group.c | 221 |
1 files changed, 161 insertions, 60 deletions
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 0886e81e5540..744b99ddc28c 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -15,6 +15,7 @@ #include "delalloc-space.h" #include "discard.h" #include "raid56.h" +#include "zoned.h" /* * Return target flags in extended format or 0 if restripe for this chunk_type @@ -673,7 +674,15 @@ static noinline void caching_thread(struct btrfs_work *work) wake_up(&caching_ctl->wait); } - if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) + /* + * If we are in the transaction that populated the free space tree we + * can't actually cache from the free space tree as our commit root and + * real root are the same, so we could change the contents of the blocks + * while caching. Instead do the slow caching in this case, and after + * the transaction has committed we will be safe. + */ + if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && + !(test_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags))) ret = load_free_space_tree(caching_ctl); else ret = load_extent_tree_free(caching_ctl); @@ -716,6 +725,10 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only struct btrfs_caching_control *caching_ctl = NULL; int ret = 0; + /* Allocator for zoned filesystems does not use the cache at all */ + if (btrfs_is_zoned(fs_info)) + return 0; + caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); if (!caching_ctl) return -ENOMEM; @@ -888,6 +901,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, btrfs_return_cluster_to_free_space(block_group, cluster); spin_unlock(&cluster->refill_lock); + btrfs_clear_treelog_bg(block_group); + path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; @@ -1000,12 +1015,17 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, WARN_ON(block_group->space_info->total_bytes < block_group->length); WARN_ON(block_group->space_info->bytes_readonly - < block_group->length); + < block_group->length - block_group->zone_unusable); + WARN_ON(block_group->space_info->bytes_zone_unusable + < block_group->zone_unusable); WARN_ON(block_group->space_info->disk_total < block_group->length * factor); } block_group->space_info->total_bytes -= block_group->length; - block_group->space_info->bytes_readonly -= block_group->length; + block_group->space_info->bytes_readonly -= + (block_group->length - block_group->zone_unusable); + block_group->space_info->bytes_zone_unusable -= + block_group->zone_unusable; block_group->space_info->disk_total -= block_group->length * factor; spin_unlock(&block_group->space_info->lock); @@ -1142,6 +1162,11 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force) spin_lock(&sinfo->lock); spin_lock(&cache->lock); + if (cache->swap_extents) { + ret = -ETXTBSY; + goto out; + } + if (cache->ro) { cache->ro++; ret = 0; @@ -1149,7 +1174,7 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force) } num_bytes = cache->length - cache->reserved - cache->pinned - - cache->bytes_super - cache->used; + cache->bytes_super - cache->zone_unusable - cache->used; /* * Data never overcommits, even in mixed mode, so do just the straight @@ -1180,6 +1205,12 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force) if (!ret) { sinfo->bytes_readonly += num_bytes; + if (btrfs_is_zoned(cache->fs_info)) { + /* Migrate zone_unusable bytes to readonly */ + sinfo->bytes_readonly += cache->zone_unusable; + sinfo->bytes_zone_unusable -= cache->zone_unusable; + cache->zone_unusable = 0; + } cache->ro++; list_add_tail(&cache->ro_list, &sinfo->ro_bgs); } @@ -1254,6 +1285,13 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) return; + /* + * Long running balances can keep us blocked here for eternity, so + * simply skip deletion if we're unable to get the mutex. + */ + if (!mutex_trylock(&fs_info->delete_unused_bgs_mutex)) + return; + spin_lock(&fs_info->unused_bgs_lock); while (!list_empty(&fs_info->unused_bgs)) { int trimming; @@ -1273,8 +1311,6 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group); - mutex_lock(&fs_info->delete_unused_bgs_mutex); - /* Don't want to race with allocators so take the groups_sem */ down_write(&space_info->groups_sem); @@ -1363,9 +1399,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) btrfs_space_info_update_bytes_pinned(fs_info, space_info, -block_group->pinned); space_info->bytes_readonly += block_group->pinned; - percpu_counter_add_batch(&space_info->total_bytes_pinned, - -block_group->pinned, - BTRFS_TOTAL_BYTES_PINNED_BATCH); + __btrfs_mod_total_bytes_pinned(space_info, -block_group->pinned); block_group->pinned = 0; spin_unlock(&block_group->lock); @@ -1381,8 +1415,12 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC)) goto flip_async; - /* DISCARD can flip during remount */ - trimming = btrfs_test_opt(fs_info, DISCARD_SYNC); + /* + * DISCARD can flip during remount. On zoned filesystems, we + * need to reset sequential-required zones. + */ + trimming = btrfs_test_opt(fs_info, DISCARD_SYNC) || + btrfs_is_zoned(fs_info); /* Implicit trim during transaction commit. */ if (trimming) @@ -1420,11 +1458,11 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) end_trans: btrfs_end_transaction(trans); next: - mutex_unlock(&fs_info->delete_unused_bgs_mutex); btrfs_put_block_group(block_group); spin_lock(&fs_info->unused_bgs_lock); } spin_unlock(&fs_info->unused_bgs_lock); + mutex_unlock(&fs_info->delete_unused_bgs_mutex); return; flip_async: @@ -1553,8 +1591,11 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) } /** - * btrfs_rmap_block - Map a physical disk address to a list of logical addresses + * Map a physical disk address to a list of logical addresses + * + * @fs_info: the filesystem * @chunk_start: logical address of block group + * @bdev: physical device to resolve, can be NULL to indicate any device * @physical: physical address to map to logical addresses * @logical: return array of logical addresses which map to @physical * @naddrs: length of @logical @@ -1564,9 +1605,9 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) * Used primarily to exclude those portions of a block group that contain super * block copies. */ -EXPORT_FOR_TESTS int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, - u64 physical, u64 **logical, int *naddrs, int *stripe_len) + struct block_device *bdev, u64 physical, u64 **logical, + int *naddrs, int *stripe_len) { struct extent_map *em; struct map_lookup *map; @@ -1584,6 +1625,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, map = em->map_lookup; data_stripe_length = em->orig_block_len; io_stripe_size = map->stripe_len; + chunk_start = em->start; /* For RAID5/6 adjust to a full IO stripe length */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) @@ -1598,14 +1640,18 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, for (i = 0; i < map->num_stripes; i++) { bool already_inserted = false; u64 stripe_nr; + u64 offset; int j; if (!in_range(physical, map->stripes[i].physical, data_stripe_length)) continue; + if (bdev && map->stripes[i].dev->bdev != bdev) + continue; + stripe_nr = physical - map->stripes[i].physical; - stripe_nr = div64_u64(stripe_nr, map->stripe_len); + stripe_nr = div64_u64_rem(stripe_nr, map->stripe_len, &offset); if (map->type & BTRFS_BLOCK_GROUP_RAID10) { stripe_nr = stripe_nr * map->num_stripes + i; @@ -1619,7 +1665,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, * instead of map->stripe_len */ - bytenr = chunk_start + stripe_nr * io_stripe_size; + bytenr = chunk_start + stripe_nr * io_stripe_size + offset; /* Ensure we don't add duplicate addresses */ for (j = 0; j < nr; j++) { @@ -1661,7 +1707,7 @@ static int exclude_super_stripes(struct btrfs_block_group *cache) for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { bytenr = btrfs_sb_offset(i); - ret = btrfs_rmap_block(fs_info, cache->start, + ret = btrfs_rmap_block(fs_info, cache->start, NULL, bytenr, &logical, &nr, &stripe_len); if (ret) return ret; @@ -1797,24 +1843,8 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) return ret; } -static void read_block_group_item(struct btrfs_block_group *cache, - struct btrfs_path *path, - const struct btrfs_key *key) -{ - struct extent_buffer *leaf = path->nodes[0]; - struct btrfs_block_group_item bgi; - int slot = path->slots[0]; - - cache->length = key->offset; - - read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot), - sizeof(bgi)); - cache->used = btrfs_stack_block_group_used(&bgi); - cache->flags = btrfs_stack_block_group_flags(&bgi); -} - static int read_one_block_group(struct btrfs_fs_info *info, - struct btrfs_path *path, + struct btrfs_block_group_item *bgi, const struct btrfs_key *key, int need_clear) { @@ -1829,7 +1859,9 @@ static int read_one_block_group(struct btrfs_fs_info *info, if (!cache) return -ENOMEM; - read_block_group_item(cache, path, key); + cache->length = key->offset; + cache->used = btrfs_stack_block_group_used(bgi); + cache->flags = btrfs_stack_block_group_flags(bgi); set_free_space_tree_thresholds(cache); @@ -1856,6 +1888,13 @@ static int read_one_block_group(struct btrfs_fs_info *info, goto error; } + ret = btrfs_load_block_group_zone_info(cache, false); + if (ret) { + btrfs_err(info, "zoned: failed to load zone info of bg %llu", + cache->start); + goto error; + } + /* * We need to exclude the super stripes now so that the space info has * super bytes accounted for, otherwise we'll think we have more space @@ -1869,12 +1908,20 @@ static int read_one_block_group(struct btrfs_fs_info *info, } /* - * Check for two cases, either we are full, and therefore don't need - * to bother with the caching work since we won't find any space, or we - * are empty, and we can just add all the space in and be done with it. - * This saves us _a_lot_ of time, particularly in the full case. + * For zoned filesystem, space after the allocation offset is the only + * free space for a block group. So, we don't need any caching work. + * btrfs_calc_zone_unusable() will set the amount of free space and + * zone_unusable space. + * + * For regular filesystem, check for two cases, either we are full, and + * therefore don't need to bother with the caching work since we won't + * find any space, or we are empty, and we can just add all the space + * in and be done with it. This saves us _a_lot_ of time, particularly + * in the full case. */ - if (cache->length == cache->used) { + if (btrfs_is_zoned(info)) { + btrfs_calc_zone_unusable(cache); + } else if (cache->length == cache->used) { cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; btrfs_free_excluded_extents(cache); @@ -1893,7 +1940,8 @@ static int read_one_block_group(struct btrfs_fs_info *info, } trace_btrfs_add_block_group(info, cache, 0); btrfs_update_space_info(info, cache->flags, cache->length, - cache->used, cache->bytes_super, &space_info); + cache->used, cache->bytes_super, + cache->zone_unusable, &space_info); cache->space_info = space_info; @@ -1949,7 +1997,7 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info) break; } btrfs_update_space_info(fs_info, bg->flags, em->len, em->len, - 0, &space_info); + 0, 0, &space_info); bg->space_info = space_info; link_block_group(bg); @@ -1988,19 +2036,29 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) need_clear = 1; while (1) { + struct btrfs_block_group_item bgi; + struct extent_buffer *leaf; + int slot; + ret = find_first_block_group(info, path, &key); if (ret > 0) break; if (ret != 0) goto error; - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - ret = read_one_block_group(info, path, &key, need_clear); + leaf = path->nodes[0]; + slot = path->slots[0]; + + read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot), + sizeof(bgi)); + + btrfs_item_key_to_cpu(leaf, &key, slot); + btrfs_release_path(path); + ret = read_one_block_group(info, &bgi, &key, need_clear); if (ret < 0) goto error; key.objectid += key.offset; key.offset = 0; - btrfs_release_path(path); } btrfs_release_path(path); @@ -2132,6 +2190,13 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, cache->cached = BTRFS_CACHE_FINISHED; if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) cache->needs_free_space = 1; + + ret = btrfs_load_block_group_zone_info(cache, true); + if (ret) { + btrfs_put_block_group(cache); + return ret; + } + ret = exclude_super_stripes(cache); if (ret) { /* We may have excluded something, so call this just in case */ @@ -2173,7 +2238,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, */ trace_btrfs_add_block_group(fs_info, cache, 1); btrfs_update_space_info(fs_info, cache->flags, size, bytes_used, - cache->bytes_super, &cache->space_info); + cache->bytes_super, 0, &cache->space_info); btrfs_update_global_block_rsv(fs_info); link_block_group(cache); @@ -2247,7 +2312,7 @@ again: } ret = inc_block_group_ro(cache, 0); - if (!do_chunk_alloc) + if (!do_chunk_alloc || ret == -ETXTBSY) goto unlock_out; if (!ret) goto out; @@ -2256,6 +2321,8 @@ again: if (ret < 0) goto out; ret = inc_block_group_ro(cache, 0); + if (ret == -ETXTBSY) + goto unlock_out; out: if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags); @@ -2281,8 +2348,15 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group *cache) spin_lock(&cache->lock); if (!--cache->ro) { num_bytes = cache->length - cache->reserved - - cache->pinned - cache->bytes_super - cache->used; + cache->pinned - cache->bytes_super - + cache->zone_unusable - cache->used; sinfo->bytes_readonly -= num_bytes; + if (btrfs_is_zoned(cache->fs_info)) { + /* Migrate zone_unusable bytes back */ + cache->zone_unusable = cache->alloc_offset - cache->used; + sinfo->bytes_zone_unusable += cache->zone_unusable; + sinfo->bytes_readonly -= cache->zone_unusable; + } list_del_init(&cache->ro_list); } spin_unlock(&cache->lock); @@ -2556,8 +2630,10 @@ again: if (!path) { path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; + if (!path) { + ret = -ENOMEM; + goto out; + } } /* @@ -2651,16 +2727,14 @@ again: btrfs_put_block_group(cache); if (drop_reserve) btrfs_delayed_refs_rsv_release(fs_info, 1); - - if (ret) - break; - /* * Avoid blocking other tasks for too long. It might even save * us from writing caches for block groups that are going to be * removed. */ mutex_unlock(&trans->transaction->cache_write_mutex); + if (ret) + goto out; mutex_lock(&trans->transaction->cache_write_mutex); } mutex_unlock(&trans->transaction->cache_write_mutex); @@ -2684,7 +2758,12 @@ again: goto again; } spin_unlock(&cur_trans->dirty_bgs_lock); - } else if (ret < 0) { + } +out: + if (ret < 0) { + spin_lock(&cur_trans->dirty_bgs_lock); + list_splice_init(&dirty, &cur_trans->dirty_bgs); + spin_unlock(&cur_trans->dirty_bgs_lock); btrfs_cleanup_dirty_bgs(cur_trans, fs_info); } @@ -2888,10 +2967,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - percpu_counter_add_batch( - &cache->space_info->total_bytes_pinned, - num_bytes, - BTRFS_TOTAL_BYTES_PINNED_BATCH); + __btrfs_mod_total_bytes_pinned(cache->space_info, + num_bytes); set_extent_dirty(&trans->transaction->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); @@ -3336,6 +3413,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) ASSERT(list_empty(&block_group->io_list)); ASSERT(list_empty(&block_group->bg_list)); ASSERT(refcount_read(&block_group->refs) == 1); + ASSERT(block_group->swap_extents == 0); btrfs_put_block_group(block_group); spin_lock(&info->block_group_cache_lock); @@ -3402,3 +3480,26 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group) __btrfs_remove_free_space_cache(block_group->free_space_ctl); } } + +bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg) +{ + bool ret = true; + + spin_lock(&bg->lock); + if (bg->ro) + ret = false; + else + bg->swap_extents++; + spin_unlock(&bg->lock); + + return ret; +} + +void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount) +{ + spin_lock(&bg->lock); + ASSERT(!bg->ro); + ASSERT(bg->swap_extents >= amount); + bg->swap_extents -= amount; + spin_unlock(&bg->lock); +} |