summaryrefslogtreecommitdiff
path: root/fs/btrfs/block-group.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/block-group.c')
-rw-r--r--fs/btrfs/block-group.c221
1 files changed, 161 insertions, 60 deletions
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 0886e81e5540..744b99ddc28c 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -15,6 +15,7 @@
#include "delalloc-space.h"
#include "discard.h"
#include "raid56.h"
+#include "zoned.h"
/*
* Return target flags in extended format or 0 if restripe for this chunk_type
@@ -673,7 +674,15 @@ static noinline void caching_thread(struct btrfs_work *work)
wake_up(&caching_ctl->wait);
}
- if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+ /*
+ * If we are in the transaction that populated the free space tree we
+ * can't actually cache from the free space tree as our commit root and
+ * real root are the same, so we could change the contents of the blocks
+ * while caching. Instead do the slow caching in this case, and after
+ * the transaction has committed we will be safe.
+ */
+ if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
+ !(test_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags)))
ret = load_free_space_tree(caching_ctl);
else
ret = load_extent_tree_free(caching_ctl);
@@ -716,6 +725,10 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only
struct btrfs_caching_control *caching_ctl = NULL;
int ret = 0;
+ /* Allocator for zoned filesystems does not use the cache at all */
+ if (btrfs_is_zoned(fs_info))
+ return 0;
+
caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
if (!caching_ctl)
return -ENOMEM;
@@ -888,6 +901,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
btrfs_return_cluster_to_free_space(block_group, cluster);
spin_unlock(&cluster->refill_lock);
+ btrfs_clear_treelog_bg(block_group);
+
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
@@ -1000,12 +1015,17 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
WARN_ON(block_group->space_info->total_bytes
< block_group->length);
WARN_ON(block_group->space_info->bytes_readonly
- < block_group->length);
+ < block_group->length - block_group->zone_unusable);
+ WARN_ON(block_group->space_info->bytes_zone_unusable
+ < block_group->zone_unusable);
WARN_ON(block_group->space_info->disk_total
< block_group->length * factor);
}
block_group->space_info->total_bytes -= block_group->length;
- block_group->space_info->bytes_readonly -= block_group->length;
+ block_group->space_info->bytes_readonly -=
+ (block_group->length - block_group->zone_unusable);
+ block_group->space_info->bytes_zone_unusable -=
+ block_group->zone_unusable;
block_group->space_info->disk_total -= block_group->length * factor;
spin_unlock(&block_group->space_info->lock);
@@ -1142,6 +1162,11 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
spin_lock(&sinfo->lock);
spin_lock(&cache->lock);
+ if (cache->swap_extents) {
+ ret = -ETXTBSY;
+ goto out;
+ }
+
if (cache->ro) {
cache->ro++;
ret = 0;
@@ -1149,7 +1174,7 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
}
num_bytes = cache->length - cache->reserved - cache->pinned -
- cache->bytes_super - cache->used;
+ cache->bytes_super - cache->zone_unusable - cache->used;
/*
* Data never overcommits, even in mixed mode, so do just the straight
@@ -1180,6 +1205,12 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
if (!ret) {
sinfo->bytes_readonly += num_bytes;
+ if (btrfs_is_zoned(cache->fs_info)) {
+ /* Migrate zone_unusable bytes to readonly */
+ sinfo->bytes_readonly += cache->zone_unusable;
+ sinfo->bytes_zone_unusable -= cache->zone_unusable;
+ cache->zone_unusable = 0;
+ }
cache->ro++;
list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
}
@@ -1254,6 +1285,13 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
return;
+ /*
+ * Long running balances can keep us blocked here for eternity, so
+ * simply skip deletion if we're unable to get the mutex.
+ */
+ if (!mutex_trylock(&fs_info->delete_unused_bgs_mutex))
+ return;
+
spin_lock(&fs_info->unused_bgs_lock);
while (!list_empty(&fs_info->unused_bgs)) {
int trimming;
@@ -1273,8 +1311,6 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
- mutex_lock(&fs_info->delete_unused_bgs_mutex);
-
/* Don't want to race with allocators so take the groups_sem */
down_write(&space_info->groups_sem);
@@ -1363,9 +1399,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
btrfs_space_info_update_bytes_pinned(fs_info, space_info,
-block_group->pinned);
space_info->bytes_readonly += block_group->pinned;
- percpu_counter_add_batch(&space_info->total_bytes_pinned,
- -block_group->pinned,
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
+ __btrfs_mod_total_bytes_pinned(space_info, -block_group->pinned);
block_group->pinned = 0;
spin_unlock(&block_group->lock);
@@ -1381,8 +1415,12 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC))
goto flip_async;
- /* DISCARD can flip during remount */
- trimming = btrfs_test_opt(fs_info, DISCARD_SYNC);
+ /*
+ * DISCARD can flip during remount. On zoned filesystems, we
+ * need to reset sequential-required zones.
+ */
+ trimming = btrfs_test_opt(fs_info, DISCARD_SYNC) ||
+ btrfs_is_zoned(fs_info);
/* Implicit trim during transaction commit. */
if (trimming)
@@ -1420,11 +1458,11 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
end_trans:
btrfs_end_transaction(trans);
next:
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
btrfs_put_block_group(block_group);
spin_lock(&fs_info->unused_bgs_lock);
}
spin_unlock(&fs_info->unused_bgs_lock);
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
return;
flip_async:
@@ -1553,8 +1591,11 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
}
/**
- * btrfs_rmap_block - Map a physical disk address to a list of logical addresses
+ * Map a physical disk address to a list of logical addresses
+ *
+ * @fs_info: the filesystem
* @chunk_start: logical address of block group
+ * @bdev: physical device to resolve, can be NULL to indicate any device
* @physical: physical address to map to logical addresses
* @logical: return array of logical addresses which map to @physical
* @naddrs: length of @logical
@@ -1564,9 +1605,9 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
* Used primarily to exclude those portions of a block group that contain super
* block copies.
*/
-EXPORT_FOR_TESTS
int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
- u64 physical, u64 **logical, int *naddrs, int *stripe_len)
+ struct block_device *bdev, u64 physical, u64 **logical,
+ int *naddrs, int *stripe_len)
{
struct extent_map *em;
struct map_lookup *map;
@@ -1584,6 +1625,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
map = em->map_lookup;
data_stripe_length = em->orig_block_len;
io_stripe_size = map->stripe_len;
+ chunk_start = em->start;
/* For RAID5/6 adjust to a full IO stripe length */
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
@@ -1598,14 +1640,18 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
for (i = 0; i < map->num_stripes; i++) {
bool already_inserted = false;
u64 stripe_nr;
+ u64 offset;
int j;
if (!in_range(physical, map->stripes[i].physical,
data_stripe_length))
continue;
+ if (bdev && map->stripes[i].dev->bdev != bdev)
+ continue;
+
stripe_nr = physical - map->stripes[i].physical;
- stripe_nr = div64_u64(stripe_nr, map->stripe_len);
+ stripe_nr = div64_u64_rem(stripe_nr, map->stripe_len, &offset);
if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
stripe_nr = stripe_nr * map->num_stripes + i;
@@ -1619,7 +1665,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
* instead of map->stripe_len
*/
- bytenr = chunk_start + stripe_nr * io_stripe_size;
+ bytenr = chunk_start + stripe_nr * io_stripe_size + offset;
/* Ensure we don't add duplicate addresses */
for (j = 0; j < nr; j++) {
@@ -1661,7 +1707,7 @@ static int exclude_super_stripes(struct btrfs_block_group *cache)
for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
bytenr = btrfs_sb_offset(i);
- ret = btrfs_rmap_block(fs_info, cache->start,
+ ret = btrfs_rmap_block(fs_info, cache->start, NULL,
bytenr, &logical, &nr, &stripe_len);
if (ret)
return ret;
@@ -1797,24 +1843,8 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
return ret;
}
-static void read_block_group_item(struct btrfs_block_group *cache,
- struct btrfs_path *path,
- const struct btrfs_key *key)
-{
- struct extent_buffer *leaf = path->nodes[0];
- struct btrfs_block_group_item bgi;
- int slot = path->slots[0];
-
- cache->length = key->offset;
-
- read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
- sizeof(bgi));
- cache->used = btrfs_stack_block_group_used(&bgi);
- cache->flags = btrfs_stack_block_group_flags(&bgi);
-}
-
static int read_one_block_group(struct btrfs_fs_info *info,
- struct btrfs_path *path,
+ struct btrfs_block_group_item *bgi,
const struct btrfs_key *key,
int need_clear)
{
@@ -1829,7 +1859,9 @@ static int read_one_block_group(struct btrfs_fs_info *info,
if (!cache)
return -ENOMEM;
- read_block_group_item(cache, path, key);
+ cache->length = key->offset;
+ cache->used = btrfs_stack_block_group_used(bgi);
+ cache->flags = btrfs_stack_block_group_flags(bgi);
set_free_space_tree_thresholds(cache);
@@ -1856,6 +1888,13 @@ static int read_one_block_group(struct btrfs_fs_info *info,
goto error;
}
+ ret = btrfs_load_block_group_zone_info(cache, false);
+ if (ret) {
+ btrfs_err(info, "zoned: failed to load zone info of bg %llu",
+ cache->start);
+ goto error;
+ }
+
/*
* We need to exclude the super stripes now so that the space info has
* super bytes accounted for, otherwise we'll think we have more space
@@ -1869,12 +1908,20 @@ static int read_one_block_group(struct btrfs_fs_info *info,
}
/*
- * Check for two cases, either we are full, and therefore don't need
- * to bother with the caching work since we won't find any space, or we
- * are empty, and we can just add all the space in and be done with it.
- * This saves us _a_lot_ of time, particularly in the full case.
+ * For zoned filesystem, space after the allocation offset is the only
+ * free space for a block group. So, we don't need any caching work.
+ * btrfs_calc_zone_unusable() will set the amount of free space and
+ * zone_unusable space.
+ *
+ * For regular filesystem, check for two cases, either we are full, and
+ * therefore don't need to bother with the caching work since we won't
+ * find any space, or we are empty, and we can just add all the space
+ * in and be done with it. This saves us _a_lot_ of time, particularly
+ * in the full case.
*/
- if (cache->length == cache->used) {
+ if (btrfs_is_zoned(info)) {
+ btrfs_calc_zone_unusable(cache);
+ } else if (cache->length == cache->used) {
cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
btrfs_free_excluded_extents(cache);
@@ -1893,7 +1940,8 @@ static int read_one_block_group(struct btrfs_fs_info *info,
}
trace_btrfs_add_block_group(info, cache, 0);
btrfs_update_space_info(info, cache->flags, cache->length,
- cache->used, cache->bytes_super, &space_info);
+ cache->used, cache->bytes_super,
+ cache->zone_unusable, &space_info);
cache->space_info = space_info;
@@ -1949,7 +1997,7 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
break;
}
btrfs_update_space_info(fs_info, bg->flags, em->len, em->len,
- 0, &space_info);
+ 0, 0, &space_info);
bg->space_info = space_info;
link_block_group(bg);
@@ -1988,19 +2036,29 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
need_clear = 1;
while (1) {
+ struct btrfs_block_group_item bgi;
+ struct extent_buffer *leaf;
+ int slot;
+
ret = find_first_block_group(info, path, &key);
if (ret > 0)
break;
if (ret != 0)
goto error;
- btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- ret = read_one_block_group(info, path, &key, need_clear);
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+
+ read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
+ sizeof(bgi));
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ btrfs_release_path(path);
+ ret = read_one_block_group(info, &bgi, &key, need_clear);
if (ret < 0)
goto error;
key.objectid += key.offset;
key.offset = 0;
- btrfs_release_path(path);
}
btrfs_release_path(path);
@@ -2132,6 +2190,13 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
cache->cached = BTRFS_CACHE_FINISHED;
if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
cache->needs_free_space = 1;
+
+ ret = btrfs_load_block_group_zone_info(cache, true);
+ if (ret) {
+ btrfs_put_block_group(cache);
+ return ret;
+ }
+
ret = exclude_super_stripes(cache);
if (ret) {
/* We may have excluded something, so call this just in case */
@@ -2173,7 +2238,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
*/
trace_btrfs_add_block_group(fs_info, cache, 1);
btrfs_update_space_info(fs_info, cache->flags, size, bytes_used,
- cache->bytes_super, &cache->space_info);
+ cache->bytes_super, 0, &cache->space_info);
btrfs_update_global_block_rsv(fs_info);
link_block_group(cache);
@@ -2247,7 +2312,7 @@ again:
}
ret = inc_block_group_ro(cache, 0);
- if (!do_chunk_alloc)
+ if (!do_chunk_alloc || ret == -ETXTBSY)
goto unlock_out;
if (!ret)
goto out;
@@ -2256,6 +2321,8 @@ again:
if (ret < 0)
goto out;
ret = inc_block_group_ro(cache, 0);
+ if (ret == -ETXTBSY)
+ goto unlock_out;
out:
if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
@@ -2281,8 +2348,15 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
spin_lock(&cache->lock);
if (!--cache->ro) {
num_bytes = cache->length - cache->reserved -
- cache->pinned - cache->bytes_super - cache->used;
+ cache->pinned - cache->bytes_super -
+ cache->zone_unusable - cache->used;
sinfo->bytes_readonly -= num_bytes;
+ if (btrfs_is_zoned(cache->fs_info)) {
+ /* Migrate zone_unusable bytes back */
+ cache->zone_unusable = cache->alloc_offset - cache->used;
+ sinfo->bytes_zone_unusable += cache->zone_unusable;
+ sinfo->bytes_readonly -= cache->zone_unusable;
+ }
list_del_init(&cache->ro_list);
}
spin_unlock(&cache->lock);
@@ -2556,8 +2630,10 @@ again:
if (!path) {
path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
}
/*
@@ -2651,16 +2727,14 @@ again:
btrfs_put_block_group(cache);
if (drop_reserve)
btrfs_delayed_refs_rsv_release(fs_info, 1);
-
- if (ret)
- break;
-
/*
* Avoid blocking other tasks for too long. It might even save
* us from writing caches for block groups that are going to be
* removed.
*/
mutex_unlock(&trans->transaction->cache_write_mutex);
+ if (ret)
+ goto out;
mutex_lock(&trans->transaction->cache_write_mutex);
}
mutex_unlock(&trans->transaction->cache_write_mutex);
@@ -2684,7 +2758,12 @@ again:
goto again;
}
spin_unlock(&cur_trans->dirty_bgs_lock);
- } else if (ret < 0) {
+ }
+out:
+ if (ret < 0) {
+ spin_lock(&cur_trans->dirty_bgs_lock);
+ list_splice_init(&dirty, &cur_trans->dirty_bgs);
+ spin_unlock(&cur_trans->dirty_bgs_lock);
btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
}
@@ -2888,10 +2967,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
- percpu_counter_add_batch(
- &cache->space_info->total_bytes_pinned,
- num_bytes,
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
+ __btrfs_mod_total_bytes_pinned(cache->space_info,
+ num_bytes);
set_extent_dirty(&trans->transaction->pinned_extents,
bytenr, bytenr + num_bytes - 1,
GFP_NOFS | __GFP_NOFAIL);
@@ -3336,6 +3413,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
ASSERT(list_empty(&block_group->io_list));
ASSERT(list_empty(&block_group->bg_list));
ASSERT(refcount_read(&block_group->refs) == 1);
+ ASSERT(block_group->swap_extents == 0);
btrfs_put_block_group(block_group);
spin_lock(&info->block_group_cache_lock);
@@ -3402,3 +3480,26 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
__btrfs_remove_free_space_cache(block_group->free_space_ctl);
}
}
+
+bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg)
+{
+ bool ret = true;
+
+ spin_lock(&bg->lock);
+ if (bg->ro)
+ ret = false;
+ else
+ bg->swap_extents++;
+ spin_unlock(&bg->lock);
+
+ return ret;
+}
+
+void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount)
+{
+ spin_lock(&bg->lock);
+ ASSERT(!bg->ro);
+ ASSERT(bg->swap_extents >= amount);
+ bg->swap_extents -= amount;
+ spin_unlock(&bg->lock);
+}