From 453e4873869f5e967188d8b018efc34a57eed44f Mon Sep 17 00:00:00 2001
From: Nikolay Borisov
Date: Mon, 7 Dec 2020 17:32:32 +0200
Subject: btrfs: rename btrfs_find_highest_objectid to btrfs_init_root_free_objectid

This function is used to initialize the in-memory
btrfs_root::highest_objectid member, which is used to get an available
objectid. Rename it to better reflect its semantics.

Reviewed-by: Josef Bacik
Signed-off-by: Nikolay Borisov
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/disk-io.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6b35b7e88136..47646a79d3fc 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1367,8 +1367,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
 	}
 	mutex_lock(&root->objectid_mutex);
-	ret = btrfs_find_highest_objectid(root,
-					  &root->highest_objectid);
+	ret = btrfs_init_root_free_objectid(root);
 	if (ret) {
 		mutex_unlock(&root->objectid_mutex);
 		goto fail;
 	}
@@ -2646,8 +2645,7 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
 		 * No need to hold btrfs_root::objectid_mutex since the fs
 		 * hasn't been fully initialised and we are the only user
 		 */
-		ret = btrfs_find_highest_objectid(tree_root,
-						  &tree_root->highest_objectid);
+		ret = btrfs_init_root_free_objectid(tree_root);
 		if (ret < 0) {
 			handle_error = true;
 			continue;
 		}
@@ -4745,7 +4743,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
 	return 0;
 }
 
-int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid)
+int btrfs_init_root_free_objectid(struct btrfs_root *root)
 {
 	struct btrfs_path *path;
 	int ret;
@@ -4769,10 +4767,10 @@ int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid)
 		slot = path->slots[0] - 1;
 		l = path->nodes[0];
 		btrfs_item_key_to_cpu(l, &found_key, slot);
-		*objectid = max_t(u64, found_key.objectid,
+		root->highest_objectid = max_t(u64, found_key.objectid,
 				  BTRFS_FIRST_FREE_OBJECTID - 1);
 	} else {
-		*objectid = BTRFS_FIRST_FREE_OBJECTID - 1;
+		root->highest_objectid = BTRFS_FIRST_FREE_OBJECTID - 1;
 	}
 	ret = 0;
 error:
--
cgit v1.2.3

From 543068a217a877bb6fa831fc448c9cc131db4feb Mon Sep 17 00:00:00 2001
From: Nikolay Borisov
Date: Mon, 7 Dec 2020 17:32:33 +0200
Subject: btrfs: rename btrfs_find_free_objectid to btrfs_get_free_objectid

This better reflects the semantics of the function, i.e. no search is
performed whatsoever.

Reviewed-by: Josef Bacik
Signed-off-by: Nikolay Borisov
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/disk-io.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 47646a79d3fc..98384d3e41ac 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -4778,7 +4778,7 @@ error:
 	return ret;
 }
 
-int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid)
+int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid)
 {
 	int ret;
 	mutex_lock(&root->objectid_mutex);
--
cgit v1.2.3

From 6b8fad576a3c8f822a888873c5acdfb31de53c4c Mon Sep 17 00:00:00 2001
From: Nikolay Borisov
Date: Mon, 7 Dec 2020 17:32:35 +0200
Subject: btrfs: rename btrfs_root::highest_objectid to free_objectid

This reflects the true purpose of the member as it's being used solely
in contexts where a new objectid is being allocated. Future changes
will also change the way it's being used to closely follow these
semantics.
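As an illustration of the semantics behind these renames, here is a
minimal user-space model of the allocator as it behaves at this point
in the series: the member is seeded to "highest used objectid" and
advanced with a pre-increment (a later patch in this series switches it
to hold the next available id). This is a hypothetical sketch for
illustration only, not the kernel code:

  #include <stdint.h>
  #include <stdio.h>

  #define MODEL_FIRST_FREE_OBJECTID 256ULL
  #define MODEL_LAST_FREE_OBJECTID  (~0ULL - 256)

  /* Models btrfs_root, reduced to the member being renamed here. */
  struct model_root {
          uint64_t free_objectid;
  };

  /* Models btrfs_init_root_free_objectid(): seed from the highest key
   * found in the root's tree (passed in here), clamped so the first id
   * handed out is never below MODEL_FIRST_FREE_OBJECTID. */
  static void model_init_root_free_objectid(struct model_root *root,
                                            uint64_t highest_found_key)
  {
          root->free_objectid =
                  highest_found_key > MODEL_FIRST_FREE_OBJECTID - 1 ?
                  highest_found_key : MODEL_FIRST_FREE_OBJECTID - 1;
  }

  /* Models btrfs_get_free_objectid(): no search, just hand out the
   * next id, failing once the id space is exhausted. */
  static int model_get_free_objectid(struct model_root *root,
                                     uint64_t *objectid)
  {
          if (root->free_objectid >= MODEL_LAST_FREE_OBJECTID)
                  return -1;
          *objectid = ++root->free_objectid;
          return 0;
  }

  int main(void)
  {
          struct model_root root;
          uint64_t id;

          model_init_root_free_objectid(&root, 0); /* empty tree */
          if (!model_get_free_objectid(&root, &id))
                  printf("first objectid: %llu\n", (unsigned long long)id);
          return 0;
  }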
Reviewed-by: Josef Bacik
Signed-off-by: Nikolay Borisov
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/disk-io.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 98384d3e41ac..a5ae0bbd983e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1016,7 +1016,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
 	root->orphan_cleanup_state = 0;
 
 	root->last_trans = 0;
-	root->highest_objectid = 0;
+	root->free_objectid = 0;
 	root->nr_delalloc_inodes = 0;
 	root->nr_ordered_extents = 0;
 	root->inode_tree = RB_ROOT;
@@ -1373,7 +1373,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
 		goto fail;
 	}
 
-	ASSERT(root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID);
+	ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
 
 	mutex_unlock(&root->objectid_mutex);
 
@@ -2651,7 +2651,7 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
 			continue;
 		}
 
-		ASSERT(tree_root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID);
+		ASSERT(tree_root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
 
 		ret = btrfs_read_roots(fs_info);
 		if (ret < 0) {
@@ -4767,10 +4767,10 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root)
 		slot = path->slots[0] - 1;
 		l = path->nodes[0];
 		btrfs_item_key_to_cpu(l, &found_key, slot);
-		root->highest_objectid = max_t(u64, found_key.objectid,
+		root->free_objectid = max_t(u64, found_key.objectid,
 				  BTRFS_FIRST_FREE_OBJECTID - 1);
 	} else {
-		root->highest_objectid = BTRFS_FIRST_FREE_OBJECTID - 1;
+		root->free_objectid = BTRFS_FIRST_FREE_OBJECTID - 1;
 	}
 	ret = 0;
 error:
@@ -4783,7 +4783,7 @@ int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid)
 	int ret;
 	mutex_lock(&root->objectid_mutex);
 
-	if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
+	if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
 		btrfs_warn(root->fs_info,
 			   "the objectid of root %llu reaches its highest value",
 			   root->root_key.objectid);
@@ -4791,7 +4791,7 @@ int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid)
 		goto out;
 	}
 
-	*objectid = ++root->highest_objectid;
+	*objectid = ++root->free_objectid;
 	ret = 0;
 out:
 	mutex_unlock(&root->objectid_mutex);
--
cgit v1.2.3

From 23125104d8485505cd19581025a3d6fc14e9945a Mon Sep 17 00:00:00 2001
From: Nikolay Borisov
Date: Mon, 7 Dec 2020 17:32:36 +0200
Subject: btrfs: make btrfs_root::free_objectid hold the next available objectid

Adjust the way free_objectid is initialized: it now stores
BTRFS_FIRST_FREE_OBJECTID rather than the, somewhat arbitrary,
BTRFS_FIRST_FREE_OBJECTID - 1. This change also has the added benefit
that it now becomes unnecessary to explicitly initialize free_objectid
for a newly created fs root.
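As a rough sketch of what this change amounts to, the two helpers from
the model shown earlier would change as follows (same caveats: an
illustrative model reusing the types and constants from the previous
sketch, not the kernel code):

  /* free_objectid now holds the NEXT id to hand out, not the highest
   * one handed out so far. */
  static void model_init_root_free_objectid(struct model_root *root,
                                            uint64_t highest_found_key)
  {
          root->free_objectid =
                  highest_found_key + 1 > MODEL_FIRST_FREE_OBJECTID ?
                  highest_found_key + 1 : MODEL_FIRST_FREE_OBJECTID;
  }

  static int model_get_free_objectid(struct model_root *root,
                                     uint64_t *objectid)
  {
          if (root->free_objectid >= MODEL_LAST_FREE_OBJECTID)
                  return -1;
          *objectid = root->free_objectid++; /* post-increment now */
          return 0;
  }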
Reviewed-by: Josef Bacik
Signed-off-by: Nikolay Borisov
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/disk-io.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a5ae0bbd983e..5473bed6a7e8 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -4767,10 +4767,10 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root)
 		slot = path->slots[0] - 1;
 		l = path->nodes[0];
 		btrfs_item_key_to_cpu(l, &found_key, slot);
-		root->free_objectid = max_t(u64, found_key.objectid,
-					    BTRFS_FIRST_FREE_OBJECTID - 1);
+		root->free_objectid = max_t(u64, found_key.objectid + 1,
+					    BTRFS_FIRST_FREE_OBJECTID);
 	} else {
-		root->free_objectid = BTRFS_FIRST_FREE_OBJECTID - 1;
+		root->free_objectid = BTRFS_FIRST_FREE_OBJECTID;
 	}
 	ret = 0;
 error:
@@ -4791,7 +4791,7 @@ int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid)
 		goto out;
 	}
 
-	*objectid = ++root->free_objectid;
+	*objectid = root->free_objectid++;
 	ret = 0;
 out:
 	mutex_unlock(&root->objectid_mutex);
--
cgit v1.2.3

From 5deb17e18e27a3502f21581ba4d086e762b86b31 Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Fri, 9 Oct 2020 09:28:20 -0400
Subject: btrfs: track ordered bytes instead of just dio ordered bytes

We track dio_bytes because the shrink delalloc code needs to know if we
have more DIO in flight than we have normal buffered IO. The reason is
that we can't "flush" DIO; we have to just wait on the ordered extents
to finish.

However, this is true of all ordered extents. If we have more ordered
space outstanding than dirty pages, we should be waiting on ordered
extents. We are already ok on this front technically, because we always
do a FLUSH_DELALLOC_WAIT loop, but I want to use the ordered counter in
the preemptive flushing code as well, so change this to count all
ordered bytes instead of just DIO ordered bytes.
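The consumer of this counter is the flushing heuristic: once more bytes
sit in ordered extents than in dirty pages, issuing more delalloc
flushing cannot reclaim anything, and the code should wait for ordered
extents instead. A stand-alone sketch of that decision, with
hypothetical helper names rather than the kernel API:

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  /* Flushing delalloc only helps while plain dirty bytes dominate;
   * ordered bytes can only be reclaimed by waiting for IO to finish. */
  static bool model_should_wait_on_ordered(uint64_t delalloc_bytes,
                                           uint64_t ordered_bytes)
  {
          return ordered_bytes > delalloc_bytes;
  }

  int main(void)
  {
          /* 8 MiB ordered vs 1 MiB dirty: wait on ordered extents. */
          printf("%d\n", model_should_wait_on_ordered(1ULL << 20, 8ULL << 20));
          /* 8 MiB dirty vs 1 MiB ordered: keep flushing delalloc. */
          printf("%d\n", model_should_wait_on_ordered(8ULL << 20, 1ULL << 20));
          return 0;
  }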
Reviewed-by: Nikolay Borisov
Signed-off-by: Josef Bacik
Signed-off-by: David Sterba
---
 fs/btrfs/disk-io.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 5473bed6a7e8..e0d56b3d1223 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1469,7 +1469,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
 {
 	percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
 	percpu_counter_destroy(&fs_info->delalloc_bytes);
-	percpu_counter_destroy(&fs_info->dio_bytes);
+	percpu_counter_destroy(&fs_info->ordered_bytes);
 	percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
 	btrfs_free_csum_hash(fs_info);
 	btrfs_free_stripe_hash_table(fs_info);
@@ -2802,7 +2802,7 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block
 	sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE;
 	sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE);
 
-	ret = percpu_counter_init(&fs_info->dio_bytes, 0, GFP_KERNEL);
+	ret = percpu_counter_init(&fs_info->ordered_bytes, 0, GFP_KERNEL);
 	if (ret)
 		return ret;
 
@@ -4163,9 +4163,9 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
 			   percpu_counter_sum(&fs_info->delalloc_bytes));
 	}
 
-	if (percpu_counter_sum(&fs_info->dio_bytes))
+	if (percpu_counter_sum(&fs_info->ordered_bytes))
 		btrfs_info(fs_info, "at unmount dio bytes count %lld",
-			   percpu_counter_sum(&fs_info->dio_bytes));
+			   percpu_counter_sum(&fs_info->ordered_bytes));
 
 	btrfs_sysfs_remove_mounted(fs_info);
 	btrfs_sysfs_remove_fsid(fs_info->fs_devices);
--
cgit v1.2.3

From 576fa34830afac6a40cd19c777f1ab49c914e87c Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Fri, 9 Oct 2020 09:28:22 -0400
Subject: btrfs: improve preemptive background space flushing

Currently, if we ever have to flush space because we do not have
enough, we allocate a ticket and attach it to the space_info, and then
systematically flush things in the filesystem that hold space
reservations until our space is reclaimed. However, this has a latency
cost: we must go to sleep and wait for the flushing to make progress
before we are woken up and allowed to continue doing our work.

In order to address that, we used to kick off the async worker to flush
space preemptively, so that we could be reclaiming space hopefully
before any tasks needed to stop and wait for space to reclaim.

When I introduced the ticketed ENOSPC stuff this broke slightly because
we were using tickets to indicate if we were done flushing. No tickets,
no more flushing. However, this meant that we essentially never
preemptively flushed. This caused a write performance regression that
Nikolay noticed in an unrelated patch that removed the committing of
the transaction during btrfs_end_transaction.

The behavior before that patch was that btrfs_end_transaction() would
see that we were low on space, and it would commit the transaction.
This was bad because in this particular case you could end up with
thousands and thousands of transactions being committed during the
5 minute reproducer. With the patch to remove this behavior we got much
more sane transaction commits, but we ended up slower because we would
write for a while, flush, write for a while, flush again.

To address this we need to reinstate a preemptive flushing mechanism.
However, it is distinctly different from our ticketed flushing in that
it doesn't have tickets to base its decisions on.
Instead of bolting this logic into our existing flushing work, add
another worker to handle this preemptive flushing. Here we will attempt
to be slightly intelligent about the things that we are flushing,
attempting to balance between whichever pool is taking up the most
space.

Reviewed-by: Nikolay Borisov
Signed-off-by: Josef Bacik
Signed-off-by: David Sterba
---
 fs/btrfs/disk-io.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e0d56b3d1223..e0d1b328397e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -4111,6 +4111,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
 
 	cancel_work_sync(&fs_info->async_reclaim_work);
 	cancel_work_sync(&fs_info->async_data_reclaim_work);
+	cancel_work_sync(&fs_info->preempt_reclaim_work);
 
 	/* Cancel or finish ongoing discard work */
 	btrfs_discard_cleanup(fs_info);
--
cgit v1.2.3

From 371cdc0700c778b94ae8fa2c7d99401f13070d8f Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Tue, 26 Jan 2021 16:33:59 +0800
Subject: btrfs: introduce subpage metadata validation check

For the subpage metadata validation check, there are some differences:

- Read must finish in one bvec
  Since we're just reading one subpage range in one page, it should
  never be split into two bios nor two bvecs.

- How to grab the existing eb
  Instead of grabbing the eb using page->private, we have to search the
  radix tree, as we don't have any direct pointer at hand.

Reviewed-by: Josef Bacik
Signed-off-by: Qu Wenruo
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/disk-io.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e0d1b328397e..d34c6d61928f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -591,6 +591,59 @@ out:
 	return ret;
 }
 
+static int validate_subpage_buffer(struct page *page, u64 start, u64 end,
+				   int mirror)
+{
+	struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+	struct extent_buffer *eb;
+	bool reads_done;
+	int ret = 0;
+
+	/*
+	 * We don't allow bio merge for subpage metadata read, so we should
+	 * only get one eb for each endio hook.
+	 */
+	ASSERT(end == start + fs_info->nodesize - 1);
+	ASSERT(PagePrivate(page));
+
+	eb = find_extent_buffer(fs_info, start);
+	/*
+	 * When we are reading one tree block, eb must have been inserted into
+	 * the radix tree. If not, something is wrong.
+	 */
+	ASSERT(eb);
+
+	reads_done = atomic_dec_and_test(&eb->io_pages);
+	/* Subpage read must finish in page read */
+	ASSERT(reads_done);
+
+	eb->read_mirror = mirror;
+	if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
+		ret = -EIO;
+		goto err;
+	}
+	ret = validate_extent_buffer(eb);
+	if (ret < 0)
+		goto err;
+
+	if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
+		btree_readahead_hook(eb, ret);
+
+	set_extent_buffer_uptodate(eb);
+
+	free_extent_buffer(eb);
+	return ret;
+err:
+	/*
+	 * end_bio_extent_readpage decrements io_pages in case of error,
+	 * make sure it has something to decrement.
+	 */
+	atomic_inc(&eb->io_pages);
+	clear_extent_buffer_uptodate(eb);
+	free_extent_buffer(eb);
+	return ret;
+}
+
 int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio,
 				   struct page *page, u64 start, u64 end,
 				   int mirror)
@@ -600,6 +653,10 @@ int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio,
 	int reads_done;
 
 	ASSERT(page->private);
+
+	if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
+		return validate_subpage_buffer(page, start, end, mirror);
+
 	eb = (struct extent_buffer *)page->private;
 
 	/*
--
cgit v1.2.3

From 0bb3eb3ee8674d5d20ad3c0c0767e18787bbd761 Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Tue, 26 Jan 2021 16:34:02 +0800
Subject: btrfs: allow read-only mount of 4K sector size fs on 64K page system

This adds the basic RO mount ability for 4K sector size on a 64K page
system. Currently we only plan to support 4K and 64K page sizes.

Reviewed-by: Josef Bacik
Signed-off-by: Qu Wenruo
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/disk-io.c | 25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d34c6d61928f..71fab77873a5 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2483,13 +2483,21 @@ static int validate_super(struct btrfs_fs_info *fs_info,
 		btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize);
 		ret = -EINVAL;
 	}
-	/* Only PAGE SIZE is supported yet */
-	if (sectorsize != PAGE_SIZE) {
+
+	/*
+	 * For 4K page size, we only support 4K sector size.
+	 * For 64K page size, we support read-write for 64K sector size, and
+	 * read-only for 4K sector size.
+	 */
+	if ((PAGE_SIZE == SZ_4K && sectorsize != PAGE_SIZE) ||
+	    (PAGE_SIZE == SZ_64K && (sectorsize != SZ_4K &&
+				     sectorsize != SZ_64K))) {
 		btrfs_err(fs_info,
-			"sectorsize %llu not supported yet, only support %lu",
+			"sectorsize %llu not yet supported for page size %lu",
 			sectorsize, PAGE_SIZE);
 		ret = -EINVAL;
 	}
+
 	if (!is_power_of_2(nodesize) || nodesize < sectorsize ||
 	    nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) {
 		btrfs_err(fs_info, "invalid nodesize %llu", nodesize);
@@ -3248,6 +3256,17 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 		goto fail_alloc;
 	}
 
+	/* For 4K sector size support, it's only read-only */
+	if (PAGE_SIZE == SZ_64K && sectorsize == SZ_4K) {
+		if (!sb_rdonly(sb) || btrfs_super_log_root(disk_super)) {
+			btrfs_err(fs_info,
+	"subpage sectorsize %u only supported read-only for page size %lu",
+				  sectorsize, PAGE_SIZE);
+			err = -EINVAL;
+			goto fail_alloc;
+		}
+	}
+
 	ret = btrfs_init_workqueues(fs_info, fs_devices);
 	if (ret) {
 		err = ret;
--
cgit v1.2.3

From 7365104236ade0bf22edd7724c8fd438b0342ee4 Mon Sep 17 00:00:00 2001
From: Naohiro Aota
Date: Thu, 4 Feb 2021 19:21:42 +0900
Subject: btrfs: zoned: defer loading zone info after opening trees

This is a preparation patch to implement zone emulation on a regular
device.

To emulate a zoned filesystem on a regular (non-zoned) device, we need
to decide an emulated zone size. Instead of making it a compile-time
static value, we'll make it configurable at mkfs time. Since we have
the one zone == one device extent restriction, we can determine the
emulated zone size from the size of a device extent. We can extend
btrfs_get_dev_zone_info() to show a regular device filled with
conventional zones once the zone size is decided.
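As a rough illustration of the emulation itself: slicing a regular
device into fixed-size conventional zones needs only the device size
and the chosen zone size. A hypothetical user-space sketch, not the
kernel implementation:

  #include <stdint.h>
  #include <stdio.h>

  /* Models carving a regular (non-zoned) device into emulated
   * conventional zones of a fixed zone size. */
  static uint64_t model_nr_zones(uint64_t dev_size, uint64_t zone_size)
  {
          return dev_size / zone_size; /* a trailing partial zone is unusable */
  }

  static uint64_t model_zone_start(uint64_t zone_index, uint64_t zone_size)
  {
          return zone_index * zone_size;
  }

  int main(void)
  {
          uint64_t dev_size = 10ULL << 30;   /* 10 GiB device */
          uint64_t zone_size = 256ULL << 20; /* taken from a device extent */

          printf("zones: %llu, zone 3 starts at %llu\n",
                 (unsigned long long)model_nr_zones(dev_size, zone_size),
                 (unsigned long long)model_zone_start(3, zone_size));
          return 0;
  }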
The current call site of btrfs_get_dev_zone_info() during the mount
process is earlier than loading the filesystem trees, so we don't know
the size of a device extent at this point. Thus we can't slice a
regular device into conventional zones.

This patch introduces btrfs_get_dev_zone_info_all_devices to load the
zone info for all the devices, and places this function in open_ctree()
after loading the trees.

Reviewed-by: Anand Jain
Reviewed-by: Josef Bacik
Signed-off-by: Naohiro Aota
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/disk-io.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 71fab77873a5..2b6a3df765cd 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3333,6 +3333,19 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	if (ret)
 		goto fail_tree_roots;
 
+	/*
+	 * Get zone type information of zoned block devices. This will also
+	 * handle emulation of a zoned filesystem if a regular device has the
+	 * zoned incompat feature flag set.
+	 */
+	ret = btrfs_get_dev_zone_info_all_devices(fs_info);
+	if (ret) {
+		btrfs_err(fs_info,
+			  "zoned: failed to read device zone info: %d",
+			  ret);
+		goto fail_block_groups;
+	}
+
 	/*
 	 * If we have a uuid root and we're not being told to rescan we need to
 	 * check the generation here so we can set the
--
cgit v1.2.3

From b53429bad3a3555fdbda190192c6e9dfef8e7787 Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn
Date: Thu, 4 Feb 2021 19:21:45 +0900
Subject: btrfs: zoned: do not load fs_info::zoned from incompat flag

Don't set the zoned flag in fs_info as soon as we encounter the
incompat filesystem flag for a zoned filesystem on mount. The zoned
flag in fs_info is in a union together with the zone_size, so setting
it too early will result in setting an incorrect zone_size as well.

Once the correct zone_size is read from the device, we can rely on the
zoned flag in fs_info as well to determine if the filesystem is zoned.

Reviewed-by: Anand Jain
Reviewed-by: Josef Bacik
Signed-off-by: Johannes Thumshirn
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/disk-io.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2b6a3df765cd..8551b0fc1b22 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3201,8 +3201,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
 		btrfs_info(fs_info, "has skinny extents");
 
-	fs_info->zoned = (features & BTRFS_FEATURE_INCOMPAT_ZONED);
-
 	/*
 	 * flag our filesystem as having big metadata blocks if
 	 * they are bigger than the page size
--
cgit v1.2.3

From d3575156f6623eecf086a20bcf99a63f1598109c Mon Sep 17 00:00:00 2001
From: Naohiro Aota
Date: Thu, 4 Feb 2021 19:21:54 +0900
Subject: btrfs: zoned: redirty released extent buffers

Tree manipulating operations like merging nodes often release
once-allocated tree nodes. Such nodes are cleaned so that pages in the
node are not uselessly written out. On zoned volumes, however, such an
optimization blocks the following IOs as the cancellation of the write
out of the freed blocks breaks the sequential write sequence expected
by the device.

Introduce a list of clean and unwritten extent buffers that have been
released in a transaction. Redirty the buffers so that
btree_write_cache_pages() can send proper bios to the devices.
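The shape of that mechanism can be sketched in isolation:
released-but-unwritten buffers go onto a per-transaction list, and
before writeback they are simply marked dirty again. A model with
hypothetical names, not the kernel structures:

  #include <stdbool.h>
  #include <stddef.h>

  /* Minimal model of an extent buffer with a dirty flag and a link
   * for the per-transaction redirty list. */
  struct model_eb {
          bool dirty;
          struct model_eb *next;
  };

  struct model_transaction {
          struct model_eb *redirty_list; /* clean, released, unwritten */
  };

  /* On release of a clean but never-written buffer, remember it rather
   * than cancelling the write: the zoned device still expects a write
   * at that position to keep the zone strictly sequential. */
  static void model_release_clean_eb(struct model_transaction *trans,
                                     struct model_eb *eb)
  {
          eb->next = trans->redirty_list;
          trans->redirty_list = eb;
  }

  /* Before writeback, redirty everything on the list so the writeback
   * path emits bios for these blocks in sequence. */
  static void model_redirty_released_ebs(struct model_transaction *trans)
  {
          struct model_eb *eb;

          for (eb = trans->redirty_list; eb; eb = eb->next)
                  eb->dirty = true;
          trans->redirty_list = NULL;
  }

  int main(void)
  {
          struct model_transaction trans = { 0 };
          struct model_eb eb = { 0 };

          model_release_clean_eb(&trans, &eb);
          model_redirty_released_ebs(&trans);
          return eb.dirty ? 0 : 1;
  }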
Besides that, it clears the entire content of the extent buffer so as
not to confuse raw block scanners, e.g. 'btrfs check'. Because the
content is cleared, csum_dirty_buffer() would complain about a bytenr
mismatch, so skip the check and checksum there using the newly
introduced buffer flag EXTENT_BUFFER_NO_CHECK.

Reviewed-by: Josef Bacik
Signed-off-by: Naohiro Aota
Signed-off-by: David Sterba
---
 fs/btrfs/disk-io.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8551b0fc1b22..eb1afd7d89f7 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -459,6 +459,12 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec
 		return 0;
 
 	found_start = btrfs_header_bytenr(eb);
+
+	if (test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)) {
+		WARN_ON(found_start != 0);
+		return 0;
+	}
+
 	/*
 	 * Please do not consolidate these warnings into a single if.
 	 * It is useful to know what went wrong.
@@ -4774,6 +4780,8 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
 				     EXTENT_DIRTY);
 	btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents);
 
+	btrfs_free_redirty_list(cur_trans);
+
 	cur_trans->state =TRANS_STATE_COMPLETED;
 	wake_up(&cur_trans->commit_wait);
 }
--
cgit v1.2.3

From cfe94440d17404478771179150e6e4554f092dd5 Mon Sep 17 00:00:00 2001
From: Naohiro Aota
Date: Thu, 4 Feb 2021 19:21:59 +0900
Subject: btrfs: zoned: handle REQ_OP_ZONE_APPEND as writing

Zoned filesystems use REQ_OP_ZONE_APPEND bios for writing to actual
devices. Let btrfs_end_bio() and btrfs_op() be aware of it, by mapping
REQ_OP_ZONE_APPEND to BTRFS_MAP_WRITE and using btrfs_op() instead of
bio_op().

Reviewed-by: Josef Bacik
Signed-off-by: Naohiro Aota
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/disk-io.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index eb1afd7d89f7..70621184a731 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -709,7 +709,7 @@ static void end_workqueue_bio(struct bio *bio)
 	fs_info = end_io_wq->info;
 	end_io_wq->status = bio->bi_status;
 
-	if (bio_op(bio) == REQ_OP_WRITE) {
+	if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
 		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
 			wq = fs_info->endio_meta_write_workers;
 		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
@@ -885,7 +885,7 @@ blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
 	int async = check_async_write(fs_info, BTRFS_I(inode));
 	blk_status_t ret;
 
-	if (bio_op(bio) != REQ_OP_WRITE) {
+	if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
 		/*
 		 * called for a read, do the setup so that checksum validation
 		 * can happen in the async kernel threads
--
cgit v1.2.3

From 0bc09ca12980db3ef1e55bfad25b1803d57628c9 Mon Sep 17 00:00:00 2001
From: Naohiro Aota
Date: Thu, 4 Feb 2021 19:22:08 +0900
Subject: btrfs: zoned: serialize metadata IO

We cannot use zone append for writing metadata, because the B-tree
nodes have references to each other using logical addresses. Without
knowing the address in advance, we cannot construct the tree in the
first place. So we need to serialize write IOs for metadata.

We cannot add a mutex around allocation and submission because metadata
blocks are allocated in an earlier stage to build up B-trees.

Add a zoned_meta_io_lock and hold it during metadata IO submission in
btree_write_cache_pages() to serialize IOs.
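In the abstract, the locking pattern is just one filesystem-wide mutex
held around the whole submission phase. A user-space sketch with
pthreads, not the kernel code:

  #include <pthread.h>

  /* One lock per filesystem, modelling fs_info->zoned_meta_io_lock. */
  static pthread_mutex_t model_zoned_meta_io_lock = PTHREAD_MUTEX_INITIALIZER;

  static void model_submit_metadata_bios(void)
  {
          /* ...collect dirty tree blocks and submit them in order... */
  }

  /* Models btree_write_cache_pages() on a zoned filesystem: the whole
   * submission runs under the lock, so bios from concurrent writeback
   * threads cannot interleave within a zone. */
  static void model_btree_write_cache_pages(void)
  {
          pthread_mutex_lock(&model_zoned_meta_io_lock);
          model_submit_metadata_bios();
          pthread_mutex_unlock(&model_zoned_meta_io_lock);
  }

  int main(void)
  {
          model_btree_write_cache_pages();
          return 0;
  }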
Furthermore, this adds a per-block group metadata IO submission pointer
"meta_write_pointer" to ensure sequential writing, which can break when
attempting to write back blocks in an unfinished transaction. If the
write out failed because of a hole and it is for data integrity
(WB_SYNC_ALL), it returns EAGAIN. A caller like the fsync() code should
handle this properly, e.g. by falling back to a full transaction
commit.

Reviewed-by: Josef Bacik
Signed-off-by: Naohiro Aota
Signed-off-by: David Sterba
---
 fs/btrfs/disk-io.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 70621184a731..458bb27e0327 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2769,6 +2769,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
 	mutex_init(&fs_info->delete_unused_bgs_mutex);
 	mutex_init(&fs_info->reloc_mutex);
 	mutex_init(&fs_info->delalloc_root_mutex);
+	mutex_init(&fs_info->zoned_meta_io_lock);
 	seqlock_init(&fs_info->profiles_lock);
 
 	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
--
cgit v1.2.3

From 4eef29ef6360d9c3e4be111392e20b70e19171cc Mon Sep 17 00:00:00 2001
From: Naohiro Aota
Date: Thu, 4 Feb 2021 19:22:10 +0900
Subject: btrfs: zoned: do not use async metadata checksum on zoned filesystems

On zoned filesystems, btrfs uses the per-fs zoned_meta_io_lock to
serialize the metadata write IOs. Even with this serialization, write
bios sent from btree_write_cache_pages can be reordered by async
checksum workers as these workers are per CPU and not per zone.

To preserve write bio ordering, we disable async metadata checksum on a
zoned filesystem. This does not result in lower performance with HDDs,
as a single CPU core is fast enough to do checksumming for a single
zone write stream with the maximum possible bandwidth of the device. If
multiple zones are being written simultaneously, HDD seek overhead
lowers the achievable maximum bandwidth, resulting again in a per zone
checksum serialization not affecting the performance.

Reviewed-by: Josef Bacik
Signed-off-by: Naohiro Aota
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/disk-io.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 458bb27e0327..6e16f556ed75 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -871,6 +871,8 @@ static blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio,
 static int check_async_write(struct btrfs_fs_info *fs_info,
 			     struct btrfs_inode *bi)
 {
+	if (btrfs_is_zoned(fs_info))
+		return 0;
 	if (atomic_read(&bi->sync_writers))
 		return 0;
 	if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags))
--
cgit v1.2.3

From 6ab6ebb76042d3d94a7c6c447f770a28a412c68c Mon Sep 17 00:00:00 2001
From: Naohiro Aota
Date: Thu, 4 Feb 2021 19:22:17 +0900
Subject: btrfs: split alloc_log_tree()

This is a preparation patch for the next patch. Split alloc_log_tree()
into two parts. The first part, allocating the tree structure, remains
in alloc_log_tree(), and the second part, allocating the tree node, is
moved into btrfs_alloc_log_tree_node(). Also export the latter part, as
it is to be used in the next patch.
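The resulting calling convention can be pictured as two-phase
initialization, which lets a caller decide when (or whether) to pay for
the node allocation. A hypothetical model, not the kernel code:

  #include <stdlib.h>

  struct model_node { int dummy; };

  struct model_root {
          struct model_node *node; /* NULL until phase 2 runs */
  };

  /* Phase 1, models alloc_log_tree(): in-memory structure only. */
  static struct model_root *model_alloc_log_tree(void)
  {
          return calloc(1, sizeof(struct model_root));
  }

  /* Phase 2, models btrfs_alloc_log_tree_node(): attach the node. */
  static int model_alloc_log_tree_node(struct model_root *root)
  {
          root->node = calloc(1, sizeof(struct model_node));
          return root->node ? 0 : -1;
  }

  int main(void)
  {
          struct model_root *root = model_alloc_log_tree();

          if (!root)
                  return 1;
          /* A caller may defer phase 2, as the zoned code will do. */
          if (model_alloc_log_tree_node(root)) {
                  free(root);
                  return 1;
          }
          free(root->node);
          free(root);
          return 0;
  }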
Reviewed-by: Josef Bacik
Signed-off-by: Johannes Thumshirn
Signed-off-by: Naohiro Aota
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/disk-io.c | 33 +++++++++++++++++++++++++++------
 1 file changed, 27 insertions(+), 6 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6e16f556ed75..d2fa92526b3b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1254,7 +1254,6 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
 					 struct btrfs_fs_info *fs_info)
 {
 	struct btrfs_root *root;
-	struct extent_buffer *leaf;
 
 	root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, GFP_NOFS);
 	if (!root)
@@ -1264,6 +1263,14 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
 	root->root_key.type = BTRFS_ROOT_ITEM_KEY;
 	root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
 
+	return root;
+}
+
+int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root)
+{
+	struct extent_buffer *leaf;
+
 	/*
 	 * DON'T set SHAREABLE bit for log trees.
 	 *
@@ -1276,26 +1283,33 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
 	leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
 			NULL, 0, 0, 0, BTRFS_NESTING_NORMAL);
-	if (IS_ERR(leaf)) {
-		btrfs_put_root(root);
-		return ERR_CAST(leaf);
-	}
+	if (IS_ERR(leaf))
+		return PTR_ERR(leaf);
 
 	root->node = leaf;
 
 	btrfs_mark_buffer_dirty(root->node);
 	btrfs_tree_unlock(root->node);
-	return root;
+
+	return 0;
 }
 
 int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
 			     struct btrfs_fs_info *fs_info)
 {
 	struct btrfs_root *log_root;
+	int ret;
 
 	log_root = alloc_log_tree(trans, fs_info);
 	if (IS_ERR(log_root))
 		return PTR_ERR(log_root);
+
+	ret = btrfs_alloc_log_tree_node(trans, log_root);
+	if (ret) {
+		btrfs_put_root(log_root);
+		return ret;
+	}
+
 	WARN_ON(fs_info->log_root_tree);
 	fs_info->log_root_tree = log_root;
 	return 0;
@@ -1307,11 +1321,18 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_root *log_root;
 	struct btrfs_inode_item *inode_item;
+	int ret;
 
 	log_root = alloc_log_tree(trans, fs_info);
 	if (IS_ERR(log_root))
 		return PTR_ERR(log_root);
 
+	ret = btrfs_alloc_log_tree_node(trans, log_root);
+	if (ret) {
+		btrfs_put_root(log_root);
+		return ret;
+	}
+
 	log_root->last_trans = trans->transid;
 	log_root->root_key.offset = root->root_key.objectid;
--
cgit v1.2.3

From 40ab3be102f0a61dbb93093f330b432324a793f1 Mon Sep 17 00:00:00 2001
From: Naohiro Aota
Date: Thu, 4 Feb 2021 19:22:18 +0900
Subject: btrfs: zoned: extend zoned allocator to use dedicated tree-log block group

This is the 1/3 patch to enable tree log on zoned filesystems.

The tree-log feature does not work on a zoned filesystem as is. Blocks
for a tree-log tree are allocated mixed with other metadata blocks and
btrfs writes and syncs the tree-log blocks to devices at the time of
fsync(), which has a different timing from a global transaction commit.
As a result, both writing tree-log blocks and writing other metadata
blocks become non-sequential writes that zoned filesystems must avoid.

Introduce a dedicated block group for tree-log blocks, so that tree-log
blocks and other metadata blocks can be separate write streams. As a
result, each write stream can now be written to devices separately.
"fs_info->treelog_bg" tracks the dedicated block group, and
"treelog_bg" is assigned on demand at tree-log block allocation time.
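The gist of the allocator-side rule can be sketched as: claim one block
group for the tree-log on first use, then steer tree-log allocations
into it and all other allocations away from it. A hypothetical model,
not the kernel allocator:

  #include <stdbool.h>
  #include <stdint.h>

  /* Models fs_info->treelog_bg: start offset of the block group
   * dedicated to tree-log blocks, 0 while unassigned. */
  static uint64_t model_treelog_bg;

  static bool model_can_allocate_from(uint64_t bg_start, bool for_treelog)
  {
          if (for_treelog) {
                  if (!model_treelog_bg)
                          model_treelog_bg = bg_start; /* claim on demand */
                  return bg_start == model_treelog_bg;
          }
          /* Keep other metadata out of the tree-log write stream. */
          return bg_start != model_treelog_bg;
  }

  int main(void)
  {
          bool a = model_can_allocate_from(1024, true);  /* claims group */
          bool b = model_can_allocate_from(1024, false); /* refused */
          bool c = model_can_allocate_from(2048, true);  /* refused */

          return (a && !b && !c) ? 0 : 1;
  }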
This commit extends the zoned block allocator to use the block group.

Reviewed-by: Josef Bacik
Signed-off-by: Johannes Thumshirn
Signed-off-by: Naohiro Aota
Signed-off-by: David Sterba
---
 fs/btrfs/disk-io.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d2fa92526b3b..84c6650d5ef7 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2787,6 +2787,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
 	spin_lock_init(&fs_info->super_lock);
 	spin_lock_init(&fs_info->buffer_lock);
 	spin_lock_init(&fs_info->unused_bgs_lock);
+	spin_lock_init(&fs_info->treelog_bg_lock);
 	rwlock_init(&fs_info->tree_mod_log_lock);
 	mutex_init(&fs_info->unused_bg_unpin_mutex);
 	mutex_init(&fs_info->delete_unused_bgs_mutex);
--
cgit v1.2.3

From 3ddebf27fcd3a910989c85a3bfc9085225038c5b Mon Sep 17 00:00:00 2001
From: Naohiro Aota
Date: Thu, 4 Feb 2021 19:22:20 +0900
Subject: btrfs: zoned: reorder log node allocation on zoned filesystem

This is the 3/3 patch to enable tree-log on zoned filesystems.

The allocation order of nodes of "fs_info->log_root_tree" and nodes of
"root->log_root" is not the same as their writing order, so the writing
causes unaligned write errors. Reorder their allocation by delaying the
allocation of the root node of "fs_info->log_root_tree", so that the
node buffers can go out sequentially to devices.

Cc: Filipe Manana
Reviewed-by: Josef Bacik
Signed-off-by: Johannes Thumshirn
Signed-off-by: Naohiro Aota
Signed-off-by: David Sterba
---
 fs/btrfs/disk-io.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 84c6650d5ef7..c2576c5fe62e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1298,16 +1298,18 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
 			     struct btrfs_fs_info *fs_info)
 {
 	struct btrfs_root *log_root;
-	int ret;
 
 	log_root = alloc_log_tree(trans, fs_info);
 	if (IS_ERR(log_root))
 		return PTR_ERR(log_root);
 
-	ret = btrfs_alloc_log_tree_node(trans, log_root);
-	if (ret) {
-		btrfs_put_root(log_root);
-		return ret;
+	if (!btrfs_is_zoned(fs_info)) {
+		int ret = btrfs_alloc_log_tree_node(trans, log_root);
+
+		if (ret) {
+			btrfs_put_root(log_root);
+			return ret;
+		}
 	}
 
 	WARN_ON(fs_info->log_root_tree);
 	fs_info->log_root_tree = log_root;
 	return 0;
--
cgit v1.2.3

From 83c68bbcb6ac2dbbcaf12e2281a29a9f73b97d0f Mon Sep 17 00:00:00 2001
From: Su Yue
Date: Thu, 11 Feb 2021 16:38:28 +0800
Subject: btrfs: initialize fs_info::csum_size earlier in open_ctree

A user reported that btrfs-progs misc-tests/028-superblock-recover
fails:

  [TEST/misc] 028-superblock-recover
  unexpected success: mounted fs with corrupted superblock
  test failed for case 028-superblock-recover

The test case expects that a broken image with a bad superblock will be
rejected at mount time. However, the test image just passed the
superblock csum check and was successfully mounted.

Commit 55fc29bed8dd ("btrfs: use cached value of fs_info::csum_size
everywhere") replaces all calls to btrfs_super_csum_size by
fs_info::csum_size. The calls include the place where
fs_info->csum_size is not yet initialized. So btrfs_check_super_csum()
passes because memcmp() with len 0 always returns 0.

Fix it by caching the csum size in btrfs_fs_info::csum_size once we
know the csum type in the superblock is valid in open_ctree().
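The failure mode is easy to reproduce in isolation: with csum_size
still zero, the checksum comparison degenerates into a zero-length
memcmp(), which matches anything. A stand-alone demonstration, not the
kernel code:

  #include <stdio.h>
  #include <string.h>

  int main(void)
  {
          unsigned char on_disk[4]  = { 0xde, 0xad, 0xbe, 0xef };
          unsigned char computed[4] = { 0x00, 0x00, 0x00, 0x00 };
          size_t csum_size = 0; /* fs_info->csum_size before initialization */

          /* Prints "match: 1": a corrupted superblock "passes". */
          printf("match: %d\n", memcmp(on_disk, computed, csum_size) == 0);
          return 0;
  }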
Link: https://github.com/kdave/btrfs-progs/issues/250
Fixes: 55fc29bed8dd ("btrfs: use cached value of fs_info::csum_size everywhere")
Signed-off-by: Su Yue
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/disk-io.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6b35b7e88136..07a2b4f69b10 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3044,6 +3044,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 		goto fail_alloc;
 	}
 
+	fs_info->csum_size = btrfs_super_csum_size(disk_super);
+
 	ret = btrfs_init_csum_hash(fs_info, csum_type);
 	if (ret) {
 		err = ret;
@@ -3161,7 +3163,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	fs_info->nodesize = nodesize;
 	fs_info->sectorsize = sectorsize;
 	fs_info->sectorsize_bits = ilog2(sectorsize);
-	fs_info->csum_size = btrfs_super_csum_size(disk_super);
 	fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
 	fs_info->stripesize = stripesize;
 
--
cgit v1.2.3