From 92764e8822d4e7f8efb5ad959fac195a7f8ea0c6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 14 Aug 2024 21:38:21 +0100 Subject: netfs, ceph: Partially revert "netfs: Replace PG_fscache by setting folio->private and marking dirty" This partially reverts commit 2ff1e97587f4d398686f52c07afde3faf3da4e5c. In addition to reverting the removal of PG_private_2 wrangling from the buffered read code[1][2], the removal of the waits for PG_private_2 from netfs_release_folio() and netfs_invalidate_folio() need reverting too. It also adds a wait into ceph_evict_inode() to wait for netfs read and copy-to-cache ops to complete. Fixes: 2ff1e97587f4 ("netfs: Replace PG_fscache by setting folio->private and marking dirty") Signed-off-by: David Howells Link: https://lore.kernel.org/r/3575457.1722355300@warthog.procyon.org.uk [1] Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=8e5ced7804cb9184c4a23f8054551240562a8eda [2] Link: https://lore.kernel.org/r/20240814203850.2240469-2-dhowells@redhat.com cc: Max Kellermann cc: Ilya Dryomov cc: Xiubo Li cc: Jeff Layton cc: Matthew Wilcox cc: ceph-devel@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org Signed-off-by: Christian Brauner --- fs/ceph/inode.c | 1 + fs/netfs/misc.c | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 71cd70514efa..4a8eec46254b 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -695,6 +695,7 @@ void ceph_evict_inode(struct inode *inode) percpu_counter_dec(&mdsc->metric.total_inodes); + netfs_wait_for_outstanding_io(inode); truncate_inode_pages_final(&inode->i_data); if (inode->i_state & I_PINNING_NETFS_WB) ceph_fscache_unuse_cookie(inode, true); diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c index 83e644bd518f..554a1a4615ad 100644 --- a/fs/netfs/misc.c +++ b/fs/netfs/misc.c @@ -101,6 +101,8 @@ void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length) _enter("{%lx},%zx,%zx", folio->index, offset, length); + folio_wait_private_2(folio); /* [DEPRECATED] */ + if (!folio_test_private(folio)) return; @@ -165,6 +167,11 @@ bool netfs_release_folio(struct folio *folio, gfp_t gfp) if (folio_test_private(folio)) return false; + if (unlikely(folio_test_private_2(folio))) { /* [DEPRECATED] */ + if (current_is_kswapd() || !(gfp & __GFP_FS)) + return false; + folio_wait_private_2(folio); + } fscache_note_page_release(netfs_i_cookie(ctx)); return true; } -- cgit v1.2.3 From 524b2c6dc80d735be9ebcd2decffe2889baab65d Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 15 Aug 2024 14:39:33 +0200 Subject: romfs: fix romfs_read_folio() Add the correct offset to folio_zero_tail(). 
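The fix below moves the zeroing start from buf to buf + fillsize. As a self-contained illustration of why that offset matters (plain userspace C using memset rather than the kernel's folio_zero_tail(); buffer name and sizes are assumptions for the demo, not romfs code), consider:

    #include <assert.h>
    #include <stdio.h>
    #include <string.h>

    #define PAGE_SIZE 4096

    int main(void)
    {
        static char buf[PAGE_SIZE];
        size_t fillsize = 100;              /* bytes of real data in the page */

        memset(buf, 'A', fillsize);         /* stand-in for the romfs payload */

        /* Zero the tail starting at the first byte after the data.  Passing
         * plain buf here (the old bug) would wipe the payload instead.
         */
        memset(buf + fillsize, 0, PAGE_SIZE - fillsize);

        assert(buf[0] == 'A' && buf[fillsize - 1] == 'A' && buf[fillsize] == 0);
        printf("payload intact, tail zeroed\n");
        return 0;
    }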
Fixes: d86f2de026c5 ("romfs: Convert romfs_read_folio() to use a folio") Reported-by: Greg Ungerer Link: https://lore.kernel.org/r/Zr0GTnPHfeA0P8nb@casper.infradead.org Signed-off-by: Christian Brauner --- fs/romfs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 68758b6fed94..0addcc849ff2 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -126,7 +126,7 @@ static int romfs_read_folio(struct file *file, struct folio *folio) } } - buf = folio_zero_tail(folio, fillsize, buf); + buf = folio_zero_tail(folio, fillsize, buf + fillsize); kunmap_local(buf); folio_end_read(folio, ret == 0); return ret; -- cgit v1.2.3 From 232590ea7fc125986a526e03081b98e5783f70d2 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 19 Aug 2024 10:38:23 +0200 Subject: Revert "pidfd: prevent creation of pidfds for kthreads" This reverts commit 3b5bbe798b2451820e74243b738268f51901e7d0. Eric reported that systemd-shutdown gets broken by blocking the creating of pidfds for kthreads as older versions seems to rely on being able to create a pidfd for any process in /proc. Reported-by: Eric Biggers Link: https://lore.kernel.org/r/20240818035818.GA1929@sol.localdomain Signed-off-by: Christian Brauner --- kernel/fork.c | 25 +++---------------------- 1 file changed, 3 insertions(+), 22 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 18bdc87209d0..cc760491f201 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2053,23 +2053,10 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re */ int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) { - if (!pid) - return -EINVAL; - - scoped_guard(rcu) { - struct task_struct *tsk; - - if (flags & PIDFD_THREAD) - tsk = pid_task(pid, PIDTYPE_PID); - else - tsk = pid_task(pid, PIDTYPE_TGID); - if (!tsk) - return -EINVAL; + bool thread = flags & PIDFD_THREAD; - /* Don't create pidfds for kernel threads for now. */ - if (tsk->flags & PF_KTHREAD) - return -EINVAL; - } + if (!pid || !pid_has_task(pid, thread ? PIDTYPE_PID : PIDTYPE_TGID)) + return -EINVAL; return __pidfd_prepare(pid, flags, ret); } @@ -2416,12 +2403,6 @@ __latent_entropy struct task_struct *copy_process( if (clone_flags & CLONE_PIDFD) { int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0; - /* Don't create pidfds for kernel threads for now. */ - if (args->kthread) { - retval = -EINVAL; - goto bad_fork_free_pid; - } - /* Note that no task has been attached to @pid yet. */ retval = __pidfd_prepare(pid, flags, &pidfile); if (retval < 0) -- cgit v1.2.3 From 996b37da1e0f51314d4186b326742c2a95a9f0dd Mon Sep 17 00:00:00 2001 From: Ed Tsai Date: Mon, 8 Jul 2024 15:22:06 +0800 Subject: backing-file: convert to using fops->splice_write Filesystems may define their own splice write. Therefore, use the file fops instead of invoking iter_file_splice_write() directly. 
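For illustration, the sketch below models the dispatch change in plain userspace C. The struct and function names are stand-ins, not kernel APIs; the point is that going through the file's own op table lets a filesystem-provided splice_write (as in FUSE passthrough) take effect instead of always hard-calling the generic helper.

    #include <errno.h>
    #include <stdio.h>

    typedef long (*splice_write_fn)(const char *what);

    struct fops {
        splice_write_fn splice_write;       /* may be NULL */
    };

    static long generic_splice_write(const char *what)
    {
        printf("generic splice_write handles %s\n", what);
        return 0;
    }

    static long fs_specific_splice_write(const char *what)
    {
        printf("filesystem-specific splice_write handles %s\n", what);
        return 0;
    }

    static long backing_splice_write(const struct fops *f, const char *what)
    {
        if (!f->splice_write)
            return -EINVAL;                 /* mirrors the new guard in the patch */
        return f->splice_write(what);       /* dispatch, not a hard-coded call */
    }

    int main(void)
    {
        struct fops plain = { .splice_write = generic_splice_write };
        struct fops fuse  = { .splice_write = fs_specific_splice_write };

        backing_splice_write(&plain, "an ordinary backing file");
        backing_splice_write(&fuse,  "a passthrough backing file");
        return 0;
    }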
Signed-off-by: Ed Tsai Link: https://lore.kernel.org/r/20240708072208.25244-1-ed.tsai@mediatek.com Fixes: 5ca73468612d ("fuse: implement splice read/write passthrough") Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner --- fs/backing-file.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/backing-file.c b/fs/backing-file.c index afb557446c27..8860dac58c37 100644 --- a/fs/backing-file.c +++ b/fs/backing-file.c @@ -303,13 +303,16 @@ ssize_t backing_file_splice_write(struct pipe_inode_info *pipe, if (WARN_ON_ONCE(!(out->f_mode & FMODE_BACKING))) return -EIO; + if (!out->f_op->splice_write) + return -EINVAL; + ret = file_remove_privs(ctx->user_file); if (ret) return ret; old_cred = override_creds(ctx->cred); file_start_write(out); - ret = iter_file_splice_write(pipe, out, ppos, len, flags); + ret = out->f_op->splice_write(pipe, out, ppos, len, flags); file_end_write(out); revert_creds(old_cred); -- cgit v1.2.3 From 7eff3453cbd7e0bfc7524d59694119b5ca844778 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 5 Jul 2024 09:15:08 +0800 Subject: ovl: pass string to ovl_parse_layer() So it can be used for parsing the Opt_lowerdir. Signed-off-by: Zhihao Cheng Link: https://lore.kernel.org/r/20240705011510.794025-2-chengzhihao1@huawei.com Signed-off-by: Christian Brauner --- fs/overlayfs/params.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c index 4860fcc4611b..52e3860973b7 100644 --- a/fs/overlayfs/params.c +++ b/fs/overlayfs/params.c @@ -365,10 +365,9 @@ static void ovl_add_layer(struct fs_context *fc, enum ovl_opt layer, } } -static int ovl_parse_layer(struct fs_context *fc, struct fs_parameter *param, - enum ovl_opt layer) +static int ovl_parse_layer(struct fs_context *fc, const char *layer_name, enum ovl_opt layer) { - char *name = kstrdup(param->string, GFP_KERNEL); + char *name = kstrdup(layer_name, GFP_KERNEL); bool upper = (layer == Opt_upperdir || layer == Opt_workdir); struct path path; int err; @@ -582,7 +581,7 @@ static int ovl_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_datadir_add: case Opt_upperdir: case Opt_workdir: - err = ovl_parse_layer(fc, param, opt); + err = ovl_parse_layer(fc, param->string, opt); break; case Opt_default_permissions: config->default_permissions = true; -- cgit v1.2.3 From ca76ac36bb6068866feca185045e7edf2a8f392f Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Fri, 5 Jul 2024 09:15:09 +0800 Subject: ovl: fix wrong lowerdir number check for parameter Opt_lowerdir The max count of lowerdir is OVL_MAX_STACK[500], which is broken by commit 37f32f526438("ovl: fix memory leak in ovl_parse_param()") for parameter Opt_lowerdir. Since commit 819829f0319a("ovl: refactor layer parsing helpers") and commit 24e16e385f22("ovl: add support for appending lowerdirs one by one") added check ovl_mount_dir_check() in function ovl_parse_param_lowerdir(), the 'ctx->nr' should be smaller than OVL_MAX_STACK, after commit 37f32f526438("ovl: fix memory leak in ovl_parse_param()") is applied, the 'ctx->nr' is updated before the check ovl_mount_dir_check(), which leads the max count of lowerdir to become 499 for parameter Opt_lowerdir. Fix it by replacing lower layers parsing code with the existing helper function ovl_parse_layer(). 
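As a self-contained illustration of the off-by-one described above (plain userspace C; MAX stands in for OVL_MAX_STACK and the comparison is a model, not the exact kernel check), bumping the counter before validating it rejects the last valid slot:

    #include <stdio.h>

    #define MAX 500

    static int add_increment_then_check(int *nr)
    {
        (*nr)++;                            /* buggy order: count first ... */
        if (*nr >= MAX) {                   /* ... so the 500th entry trips the limit */
            (*nr)--;
            return -1;
        }
        return 0;
    }

    static int add_check_then_increment(int *nr)
    {
        if (*nr >= MAX)                     /* fixed order: validate before counting */
            return -1;
        (*nr)++;
        return 0;
    }

    int main(void)
    {
        int buggy = 0, fixed = 0, i;

        for (i = 0; i < 600; i++) {
            add_increment_then_check(&buggy);
            add_check_then_increment(&fixed);
        }
        printf("increment-then-check accepts %d, check-then-increment accepts %d\n",
               buggy, fixed);               /* prints 499 vs 500 */
        return 0;
    }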
Fixes: 37f32f526438 ("ovl: fix memory leak in ovl_parse_param()") Signed-off-by: Zhihao Cheng Link: https://lore.kernel.org/r/20240705011510.794025-3-chengzhihao1@huawei.com Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner --- fs/overlayfs/params.c | 40 +++++++--------------------------------- 1 file changed, 7 insertions(+), 33 deletions(-) diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c index 52e3860973b7..8dd834c7f291 100644 --- a/fs/overlayfs/params.c +++ b/fs/overlayfs/params.c @@ -353,6 +353,8 @@ static void ovl_add_layer(struct fs_context *fc, enum ovl_opt layer, case Opt_datadir_add: ctx->nr_data++; fallthrough; + case Opt_lowerdir: + fallthrough; case Opt_lowerdir_add: WARN_ON(ctx->nr >= ctx->capacity); l = &ctx->lower[ctx->nr++]; @@ -375,7 +377,7 @@ static int ovl_parse_layer(struct fs_context *fc, const char *layer_name, enum o if (!name) return -ENOMEM; - if (upper) + if (upper || layer == Opt_lowerdir) err = ovl_mount_dir(name, &path); else err = ovl_mount_dir_noesc(name, &path); @@ -431,7 +433,6 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) { int err; struct ovl_fs_context *ctx = fc->fs_private; - struct ovl_fs_context_layer *l; char *dup = NULL, *iter; ssize_t nr_lower, nr; bool data_layer = false; @@ -471,35 +472,11 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) goto out_err; } - if (nr_lower > ctx->capacity) { - err = -ENOMEM; - l = krealloc_array(ctx->lower, nr_lower, sizeof(*ctx->lower), - GFP_KERNEL_ACCOUNT); - if (!l) - goto out_err; - - ctx->lower = l; - ctx->capacity = nr_lower; - } - iter = dup; - l = ctx->lower; - for (nr = 0; nr < nr_lower; nr++, l++) { - ctx->nr++; - memset(l, 0, sizeof(*l)); - - err = ovl_mount_dir(iter, &l->path); + for (nr = 0; nr < nr_lower; nr++) { + err = ovl_parse_layer(fc, iter, Opt_lowerdir); if (err) - goto out_put; - - err = ovl_mount_dir_check(fc, &l->path, Opt_lowerdir, iter, false); - if (err) - goto out_put; - - err = -ENOMEM; - l->name = kstrdup(iter, GFP_KERNEL_ACCOUNT); - if (!l->name) - goto out_put; + goto out_err; if (data_layer) ctx->nr_data++; @@ -517,7 +494,7 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) */ if (ctx->nr_data > 0) { pr_err("regular lower layers cannot follow data lower layers"); - goto out_put; + goto out_err; } data_layer = false; @@ -531,9 +508,6 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) kfree(dup); return 0; -out_put: - ovl_reset_lowerdirs(ctx); - out_err: kfree(dup); -- cgit v1.2.3 From 441e36ef5b347d9ab4f54f7b54853266be687556 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Fri, 5 Jul 2024 09:15:10 +0800 Subject: ovl: ovl_parse_param_lowerdir: Add missed '\n' for pr_err Add '\n' for pr_err in function ovl_parse_param_lowerdir(), which ensures that error message is displayed at once. 
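printk holds a partial line until a terminating newline (or a following message) arrives, so a pr_err() without '\n' may not show up immediately. The behaviour is analogous to line-buffered stdio; the self-contained snippet below demonstrates the userspace analogue only and does not exercise printk itself:

    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        setvbuf(stdout, NULL, _IOLBF, 0);       /* force line buffering */
        printf("first part of the message");    /* no '\n': stays buffered */
        sleep(2);                               /* nothing visible yet */
        printf(" ... now complete\n");          /* the newline flushes the line */
        return 0;
    }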
Fixes: b36a5780cb44 ("ovl: modify layer parameter parsing") Signed-off-by: Zhihao Cheng Link: https://lore.kernel.org/r/20240705011510.794025-4-chengzhihao1@huawei.com Signed-off-by: Christian Brauner --- fs/overlayfs/params.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c index 8dd834c7f291..d0568c091341 100644 --- a/fs/overlayfs/params.c +++ b/fs/overlayfs/params.c @@ -449,7 +449,7 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) return 0; if (*name == ':') { - pr_err("cannot append lower layer"); + pr_err("cannot append lower layer\n"); return -EINVAL; } @@ -493,7 +493,7 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) * there are no data layers. */ if (ctx->nr_data > 0) { - pr_err("regular lower layers cannot follow data lower layers"); + pr_err("regular lower layers cannot follow data lower layers\n"); goto out_err; } -- cgit v1.2.3 From 0aa2e1b2fb7a75aa4b5b4347055ccfea6f091769 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Aug 2024 21:08:09 +0100 Subject: mm: Fix missing folio invalidation calls during truncation When AS_RELEASE_ALWAYS is set on a mapping, the ->release_folio() and ->invalidate_folio() calls should be invoked even if PG_private and PG_private_2 aren't set. This is used by netfslib to keep track of the point above which reads can be skipped in favour of just zeroing pagecache locally. There are a couple of places in truncation in which invalidation is only called when folio_has_private() is true. Fix these to check folio_needs_release() instead. Without this, the generic/075 and generic/112 xfstests (both fsx-based tests) fail with minimum folio size patches applied[1]. Fixes: b4fa966f03b7 ("mm, netfs, fscache: stop read optimisation when folio removed from pagecache") Signed-off-by: David Howells Link: https://lore.kernel.org/r/20240815090849.972355-1-kernel@pankajraghav.com/ [1] Link: https://lore.kernel.org/r/20240823200819.532106-2-dhowells@redhat.com Reviewed-by: Matthew Wilcox (Oracle) cc: Matthew Wilcox (Oracle) cc: Pankaj Raghav cc: Jeff Layton cc: Marc Dionne cc: linux-afs@lists.infradead.org cc: netfs@lists.linux.dev cc: linux-mm@kvack.org cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- mm/truncate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/truncate.c b/mm/truncate.c index 4d61fbdd4b2f..0668cd340a46 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -157,7 +157,7 @@ static void truncate_cleanup_folio(struct folio *folio) if (folio_mapped(folio)) unmap_mapping_folio(folio); - if (folio_has_private(folio)) + if (folio_needs_release(folio)) folio_invalidate(folio, 0, folio_size(folio)); /* @@ -219,7 +219,7 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end) if (!mapping_inaccessible(folio->mapping)) folio_zero_range(folio, offset, length); - if (folio_has_private(folio)) + if (folio_needs_release(folio)) folio_invalidate(folio, offset, length); if (!folio_test_large(folio)) return true; -- cgit v1.2.3 From a74ee0e878e262c0276966528d72d4e887174410 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Aug 2024 21:08:10 +0100 Subject: afs: Fix post-setattr file edit to do truncation correctly At the end of an kAFS RPC operation, there is an "edit" phase (originally intended for post-directory modification ops to edit the local image) that the setattr VFS op uses to fix up the pagecache if the RPC that requested truncation of a file 
was successful. afs_setattr_edit_file() calls truncate_setsize() which sets i_size, expands the pagecache if needed and truncates the pagecache. The first two of those, however, are redundant as they've already been done by afs_setattr_success() under the io_lock and the first is also done under the callback lock (cb_lock). Fix afs_setattr_edit_file() to call truncate_pagecache() instead (which is called by truncate_setsize(), thereby skipping the redundant parts. Fixes: 100ccd18bb41 ("netfs: Optimise away reads above the point at which there can be no data") Signed-off-by: David Howells Link: https://lore.kernel.org/r/20240823200819.532106-3-dhowells@redhat.com cc: Matthew Wilcox (Oracle) cc: Pankaj Raghav cc: Jeff Layton cc: Marc Dionne cc: linux-afs@lists.infradead.org cc: netfs@lists.linux.dev cc: linux-mm@kvack.org cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/afs/inode.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 3acf5e050072..a95e77670b49 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -695,13 +695,18 @@ static void afs_setattr_edit_file(struct afs_operation *op) { struct afs_vnode_param *vp = &op->file[0]; struct afs_vnode *vnode = vp->vnode; + struct inode *inode = &vnode->netfs.inode; if (op->setattr.attr->ia_valid & ATTR_SIZE) { loff_t size = op->setattr.attr->ia_size; - loff_t i_size = op->setattr.old_i_size; + loff_t old = op->setattr.old_i_size; + + /* Note: inode->i_size was updated by afs_apply_status() inside + * the I/O and callback locks. + */ - if (size != i_size) { - truncate_setsize(&vnode->netfs.inode, size); + if (size != old) { + truncate_pagecache(inode, size); netfs_resize_file(&vnode->netfs, size, true); fscache_resize_cookie(afs_vnode_cache(vnode), size); } -- cgit v1.2.3 From 7dfc8f0c6144c290dbeb01835a67e81b34dda8cd Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Aug 2024 21:08:11 +0100 Subject: netfs: Fix netfs_release_folio() to say no if folio dirty Fix netfs_release_folio() to say no (ie. return false) if the folio is dirty (analogous with iomap's behaviour). Without this, it will say yes to the release of a dirty page by split_huge_page_to_list_to_order(), which will result in the loss of untruncated data in the folio. Without this, the generic/075 and generic/112 xfstests (both fsx-based tests) fail with minimum folio size patches applied[1]. 
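For illustration, here is a small self-contained model of the ->release_folio contract the fix enforces (userspace C; the toy struct stands in for struct folio and folio_test_dirty(), it is not the netfs code): a folio that is still dirty must not be released, because its data has not yet been written back.

    #include <stdbool.h>
    #include <stdio.h>

    struct model_folio {
        bool dirty;                     /* data not yet written back */
        bool has_private;               /* filesystem bookkeeping attached */
    };

    static bool model_release_folio(struct model_folio *f)
    {
        if (f->dirty)
            return false;               /* say no: writeback has not run yet */
        if (f->has_private)
            return false;               /* private state still in use */
        return true;                    /* safe to release */
    }

    int main(void)
    {
        struct model_folio clean = { .dirty = false, .has_private = false };
        struct model_folio dirty = { .dirty = true,  .has_private = false };

        printf("clean folio releasable: %d\n", model_release_folio(&clean));
        printf("dirty folio releasable: %d\n", model_release_folio(&dirty));
        return 0;
    }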
Fixes: c1ec4d7c2e13 ("netfs: Provide invalidate_folio and release_folio calls") Signed-off-by: David Howells Link: https://lore.kernel.org/r/20240815090849.972355-1-kernel@pankajraghav.com/ [1] Link: https://lore.kernel.org/r/20240823200819.532106-4-dhowells@redhat.com cc: Matthew Wilcox (Oracle) cc: Pankaj Raghav cc: Jeff Layton cc: Marc Dionne cc: linux-afs@lists.infradead.org cc: netfs@lists.linux.dev cc: linux-mm@kvack.org cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/misc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c index 554a1a4615ad..69324761fcf7 100644 --- a/fs/netfs/misc.c +++ b/fs/netfs/misc.c @@ -161,6 +161,9 @@ bool netfs_release_folio(struct folio *folio, gfp_t gfp) struct netfs_inode *ctx = netfs_inode(folio_inode(folio)); unsigned long long end; + if (folio_test_dirty(folio)) + return false; + end = folio_pos(folio) + folio_size(folio); if (end > ctx->zero_point) ctx->zero_point = end; -- cgit v1.2.3 From cce6bfa6ca0e30af9927b0074c97fe6a92f28092 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Aug 2024 21:08:12 +0100 Subject: netfs: Fix trimming of streaming-write folios in netfs_inval_folio() When netfslib writes to a folio that it doesn't have data for, but that data exists on the server, it will make a 'streaming write' whereby it stores data in a folio that is marked dirty, but not uptodate. When it does this, it attaches a record to folio->private to track the dirty region. When truncate() or fallocate() wants to invalidate part of such a folio, it will call into ->invalidate_folio(), specifying the part of the folio that is to be invalidated. netfs_invalidate_folio(), on behalf of the filesystem, must then determine how to trim the streaming write record. In a couple of cases, however, it does this incorrectly (the reduce-length and move-start cases are switched over and don't, in any case, calculate the value correctly). Fix this by making the logic tree more obvious and fixing the cases. Fixes: 9ebff83e6481 ("netfs: Prep to use folio->private for write grouping and streaming write") Signed-off-by: David Howells Link: https://lore.kernel.org/r/20240823200819.532106-5-dhowells@redhat.com cc: Matthew Wilcox (Oracle) cc: Pankaj Raghav cc: Jeff Layton cc: Marc Dionne cc: linux-afs@lists.infradead.org cc: netfs@lists.linux.dev cc: linux-mm@kvack.org cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/misc.c | 50 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 15 deletions(-) diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c index 69324761fcf7..c1f321cf5999 100644 --- a/fs/netfs/misc.c +++ b/fs/netfs/misc.c @@ -97,10 +97,20 @@ EXPORT_SYMBOL(netfs_clear_inode_writeback); void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length) { struct netfs_folio *finfo; + struct netfs_inode *ctx = netfs_inode(folio_inode(folio)); size_t flen = folio_size(folio); _enter("{%lx},%zx,%zx", folio->index, offset, length); + if (offset == 0 && length == flen) { + unsigned long long i_size = i_size_read(&ctx->inode); + unsigned long long fpos = folio_pos(folio), end; + + end = umin(fpos + flen, i_size); + if (fpos < i_size && end > ctx->zero_point) + ctx->zero_point = end; + } + folio_wait_private_2(folio); /* [DEPRECATED] */ if (!folio_test_private(folio)) @@ -115,18 +125,34 @@ void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length) /* We have a partially uptodate page from a streaming write. 
*/ unsigned int fstart = finfo->dirty_offset; unsigned int fend = fstart + finfo->dirty_len; - unsigned int end = offset + length; + unsigned int iend = offset + length; if (offset >= fend) return; - if (end <= fstart) + if (iend <= fstart) + return; + + /* The invalidation region overlaps the data. If the region + * covers the start of the data, we either move along the start + * or just erase the data entirely. + */ + if (offset <= fstart) { + if (iend >= fend) + goto erase_completely; + /* Move the start of the data. */ + finfo->dirty_len = fend - iend; + finfo->dirty_offset = offset; + return; + } + + /* Reduce the length of the data if the invalidation region + * covers the tail part. + */ + if (iend >= fend) { + finfo->dirty_len = offset - fstart; return; - if (offset <= fstart && end >= fend) - goto erase_completely; - if (offset <= fstart && end > fstart) - goto reduce_len; - if (offset > fstart && end >= fend) - goto move_start; + } + /* A partial write was split. The caller has already zeroed * it, so just absorb the hole. */ @@ -139,12 +165,6 @@ erase_completely: folio_clear_uptodate(folio); kfree(finfo); return; -reduce_len: - finfo->dirty_len = offset + length - finfo->dirty_offset; - return; -move_start: - finfo->dirty_len -= offset - finfo->dirty_offset; - finfo->dirty_offset = offset; } EXPORT_SYMBOL(netfs_invalidate_folio); @@ -164,7 +184,7 @@ bool netfs_release_folio(struct folio *folio, gfp_t gfp) if (folio_test_dirty(folio)) return false; - end = folio_pos(folio) + folio_size(folio); + end = umin(folio_pos(folio) + folio_size(folio), i_size_read(&ctx->inode)); if (end > ctx->zero_point) ctx->zero_point = end; -- cgit v1.2.3 From 950b03d0f664a54389a555d79215348ed413161f Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Aug 2024 21:08:13 +0100 Subject: netfs: Fix missing iterator reset on retry of short read Fix netfs_rreq_perform_resubmissions() to reset before retrying a short read, otherwise the wrong part of the output buffer will be used. Fixes: 92b6cc5d1e7c ("netfs: Add iov_iters to (sub)requests to describe various buffers") Signed-off-by: David Howells Link: https://lore.kernel.org/r/20240823200819.532106-6-dhowells@redhat.com cc: Steve French cc: Paulo Alcantara cc: Jeff Layton cc: linux-cifs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/io.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/netfs/io.c b/fs/netfs/io.c index 5367caf3fa28..4da0a494e860 100644 --- a/fs/netfs/io.c +++ b/fs/netfs/io.c @@ -313,6 +313,7 @@ static bool netfs_rreq_perform_resubmissions(struct netfs_io_request *rreq) netfs_reset_subreq_iter(rreq, subreq); netfs_read_from_server(rreq, subreq); } else if (test_bit(NETFS_SREQ_SHORT_IO, &subreq->flags)) { + netfs_reset_subreq_iter(rreq, subreq); netfs_rreq_short_read(rreq, subreq); } } -- cgit v1.2.3 From e00e99ba6c6b8e5239e75cd6684a6827d93c39a2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 24 Aug 2024 12:56:53 +0100 Subject: netfs: Fix interaction of streaming writes with zero-point tracker When a folio that is marked for streaming write (dirty, but not uptodate, with partial content specified in the private data) is written back, the folio is effectively switched to the blank state upon completion of the write. This means that if we want to read it in future, we need to reread the whole folio. 
However, if the folio is above the zero_point position, when it is read back, it will just be cleared and the read skipped, leading to apparent local corruption. Fix this by increasing the zero_point to the end of the dirty data in the folio when clearing the folio state after writeback. This is analogous to the folio having ->release_folio() called upon it. This was causing the config.log generated by configuring a cpython tree on a cifs share to get corrupted because the scripts involved were appending text to the file in small pieces. Fixes: 288ace2f57c9 ("netfs: New writeback implementation") Signed-off-by: David Howells Link: https://lore.kernel.org/r/563286.1724500613@warthog.procyon.org.uk cc: Steve French cc: Paulo Alcantara cc: Jeff Layton cc: linux-cifs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/write_collect.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c index 426cf87aaf2e..ae7a2043f670 100644 --- a/fs/netfs/write_collect.c +++ b/fs/netfs/write_collect.c @@ -33,6 +33,7 @@ int netfs_folio_written_back(struct folio *folio) { enum netfs_folio_trace why = netfs_folio_trace_clear; + struct netfs_inode *ictx = netfs_inode(folio->mapping->host); struct netfs_folio *finfo; struct netfs_group *group = NULL; int gcount = 0; @@ -41,6 +42,12 @@ int netfs_folio_written_back(struct folio *folio) /* Streaming writes cannot be redirtied whilst under writeback, * so discard the streaming record. */ + unsigned long long fend; + + fend = folio_pos(folio) + finfo->dirty_offset + finfo->dirty_len; + if (fend > ictx->zero_point) + ictx->zero_point = fend; + folio_detach_private(folio); group = finfo->netfs_group; gcount++; -- cgit v1.2.3
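To make the failure mode concrete, the following self-contained sketch models the watermark logic in plain userspace C (the variable names, offsets and sizes are illustrative assumptions, not the netfs structures): a read at or beyond zero_point is zero-filled locally, so once the streaming write at [pos, pos + len) has been written back the watermark must be advanced past its end or the freshly written bytes would appear as zeroes on the next read.

    #include <stdio.h>

    int main(void)
    {
        unsigned long long buggy = 4096, fixed = 4096;  /* old server EOF */
        unsigned long long pos = 4096, len = 100;       /* appended, now written back */

        /* Only the fixed path advances the watermark past the new data. */
        if (pos + len > fixed)
            fixed = pos + len;

        printf("buggy: re-read at %llu zero-filled locally? %s\n",
               pos, pos >= buggy ? "yes (appears corrupted)" : "no");
        printf("fixed: re-read at %llu zero-filled locally? %s\n",
               pos, pos >= fixed ? "yes" : "no (fetched from the server)");
        return 0;
    }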