diff options
Diffstat (limited to 'fs')
69 files changed, 954 insertions, 806 deletions
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c index f16f73581634..01338d4c2d9e 100644 --- a/fs/9p/vfs_dentry.c +++ b/fs/9p/vfs_dentry.c @@ -48,12 +48,17 @@ static int v9fs_cached_dentry_delete(const struct dentry *dentry) static void v9fs_dentry_release(struct dentry *dentry) { struct hlist_node *p, *n; + struct hlist_head head; p9_debug(P9_DEBUG_VFS, " dentry: %pd (%p)\n", dentry, dentry); - hlist_for_each_safe(p, n, (struct hlist_head *)&dentry->d_fsdata) + + spin_lock(&dentry->d_lock); + hlist_move_list((struct hlist_head *)&dentry->d_fsdata, &head); + spin_unlock(&dentry->d_lock); + + hlist_for_each_safe(p, n, &head) p9_fid_put(hlist_entry(p, struct p9_fid, dlist)); - dentry->d_fsdata = NULL; } static int v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags) diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 7a3308d77606..fd72fc38c8f5 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -348,6 +348,7 @@ void v9fs_evict_inode(struct inode *inode) __le32 __maybe_unused version; if (!is_bad_inode(inode)) { + netfs_wait_for_outstanding_io(inode); truncate_inode_pages_final(&inode->i_data); version = cpu_to_le32(v9inode->qid.version); diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 94fc049aff58..15bb7989c387 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -648,6 +648,7 @@ void afs_evict_inode(struct inode *inode) ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode); + netfs_wait_for_outstanding_io(inode); truncate_inode_pages_final(&inode->i_data); afs_set_cache_aux(vnode, &aux); diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 97f50e9fd9eb..297487ee8323 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -140,6 +140,11 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt) put_page(page); if (ret < 0) return ret; + + /* Don't cross a backup volume mountpoint from a backup volume */ + if (src_as->volume && src_as->volume->type == AFSVL_BACKVOL && + ctx->type == AFSVL_BACKVOL) + return -ENODEV; } return 0; diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 692b1c7d5018..4321f9fb73bd 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -690,7 +690,7 @@ static int check_extent_to_backpointers(struct btree_trans *trans, ptrs = bch2_bkey_ptrs_c(k); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - struct bpos bucket_pos; + struct bpos bucket_pos = POS_MIN; struct bch_backpointer bp; if (p.ptr.cached) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index bc0ea2c4efef..2a538eb2af11 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -457,6 +457,7 @@ enum bch_time_stats { }; #include "alloc_types.h" +#include "btree_gc_types.h" #include "btree_types.h" #include "btree_node_scan_types.h" #include "btree_write_buffer_types.h" @@ -488,49 +489,6 @@ enum bch_time_stats { struct btree; -enum gc_phase { - GC_PHASE_NOT_RUNNING, - GC_PHASE_START, - GC_PHASE_SB, - - GC_PHASE_BTREE_stripes, - GC_PHASE_BTREE_extents, - GC_PHASE_BTREE_inodes, - GC_PHASE_BTREE_dirents, - GC_PHASE_BTREE_xattrs, - GC_PHASE_BTREE_alloc, - GC_PHASE_BTREE_quotas, - GC_PHASE_BTREE_reflink, - GC_PHASE_BTREE_subvolumes, - GC_PHASE_BTREE_snapshots, - GC_PHASE_BTREE_lru, - GC_PHASE_BTREE_freespace, - GC_PHASE_BTREE_need_discard, - GC_PHASE_BTREE_backpointers, - GC_PHASE_BTREE_bucket_gens, - GC_PHASE_BTREE_snapshot_trees, - GC_PHASE_BTREE_deleted_inodes, - GC_PHASE_BTREE_logged_ops, - GC_PHASE_BTREE_rebalance_work, - GC_PHASE_BTREE_subvolume_children, - - GC_PHASE_PENDING_DELETE, -}; - -struct gc_pos { - enum gc_phase phase; - u16 level; - struct bpos pos; -}; - -struct reflink_gc { - u64 offset; - u32 size; - u32 refcount; -}; - -typedef GENRADIX(struct reflink_gc) reflink_gc_table; - struct io_count { u64 sectors[2][BCH_DATA_NR]; }; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index d801e19cb489..90c12fe2a2cd 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -503,16 +503,22 @@ struct bch_sb_field { #include "alloc_background_format.h" #include "extents_format.h" -#include "reflink_format.h" #include "ec_format.h" -#include "inode_format.h" #include "dirent_format.h" -#include "xattr_format.h" -#include "quota_format.h" +#include "disk_groups_format.h" +#include "inode_format.h" +#include "journal_seq_blacklist_format.h" #include "logged_ops_format.h" +#include "quota_format.h" +#include "reflink_format.h" +#include "replicas_format.h" #include "snapshot_format.h" #include "subvolume_format.h" #include "sb-counters_format.h" +#include "sb-downgrade_format.h" +#include "sb-errors_format.h" +#include "sb-members_format.h" +#include "xattr_format.h" enum bch_sb_field_type { #define x(f, nr) BCH_SB_FIELD_##f = nr, @@ -545,107 +551,6 @@ struct bch_sb_field_journal_v2 { } d[]; }; -/* BCH_SB_FIELD_members_v1: */ - -#define BCH_MIN_NR_NBUCKETS (1 << 6) - -#define BCH_IOPS_MEASUREMENTS() \ - x(seqread, 0) \ - x(seqwrite, 1) \ - x(randread, 2) \ - x(randwrite, 3) - -enum bch_iops_measurement { -#define x(t, n) BCH_IOPS_##t = n, - BCH_IOPS_MEASUREMENTS() -#undef x - BCH_IOPS_NR -}; - -#define BCH_MEMBER_ERROR_TYPES() \ - x(read, 0) \ - x(write, 1) \ - x(checksum, 2) - -enum bch_member_error_type { -#define x(t, n) BCH_MEMBER_ERROR_##t = n, - BCH_MEMBER_ERROR_TYPES() -#undef x - BCH_MEMBER_ERROR_NR -}; - -struct bch_member { - __uuid_t uuid; - __le64 nbuckets; /* device size */ - __le16 first_bucket; /* index of first bucket used */ - __le16 bucket_size; /* sectors */ - __u8 btree_bitmap_shift; - __u8 pad[3]; - __le64 last_mount; /* time_t */ - - __le64 flags; - __le32 iops[4]; - __le64 errors[BCH_MEMBER_ERROR_NR]; - __le64 errors_at_reset[BCH_MEMBER_ERROR_NR]; - __le64 errors_reset_time; - __le64 seq; - __le64 btree_allocated_bitmap; - /* - * On recovery from a clean shutdown we don't normally read the journal, - * but we still want to resume writing from where we left off so we - * don't overwrite more than is necessary, for list journal debugging: - */ - __le32 last_journal_bucket; - __le32 last_journal_bucket_offset; -}; - -/* - * This limit comes from the bucket_gens array - it's a single allocation, and - * kernel allocation are limited to INT_MAX - */ -#define BCH_MEMBER_NBUCKETS_MAX (INT_MAX - 64) - -#define BCH_MEMBER_V1_BYTES 56 - -LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags, 0, 4) -/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */ -LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags, 14, 15) -LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags, 15, 20) -LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags, 20, 28) -LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags, 28, 30) -LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED, - struct bch_member, flags, 30, 31) - -#if 0 -LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); -LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40); -#endif - -#define BCH_MEMBER_STATES() \ - x(rw, 0) \ - x(ro, 1) \ - x(failed, 2) \ - x(spare, 3) - -enum bch_member_state { -#define x(t, n) BCH_MEMBER_STATE_##t = n, - BCH_MEMBER_STATES() -#undef x - BCH_MEMBER_STATE_NR -}; - -struct bch_sb_field_members_v1 { - struct bch_sb_field field; - struct bch_member _members[]; //Members are now variable size -}; - -struct bch_sb_field_members_v2 { - struct bch_sb_field field; - __le16 member_bytes; //size of single member entry - u8 pad[6]; - struct bch_member _members[]; -}; - /* BCH_SB_FIELD_crypt: */ struct nonce { @@ -694,8 +599,6 @@ LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags, 0, 16); LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32); LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48); -/* BCH_SB_FIELD_replicas: */ - #define BCH_DATA_TYPES() \ x(free, 0) \ x(sb, 1) \ @@ -738,50 +641,6 @@ static inline bool data_type_is_hidden(enum bch_data_type type) } } -struct bch_replicas_entry_v0 { - __u8 data_type; - __u8 nr_devs; - __u8 devs[]; -} __packed; - -struct bch_sb_field_replicas_v0 { - struct bch_sb_field field; - struct bch_replicas_entry_v0 entries[]; -} __packed __aligned(8); - -struct bch_replicas_entry_v1 { - __u8 data_type; - __u8 nr_devs; - __u8 nr_required; - __u8 devs[]; -} __packed; - -#define replicas_entry_bytes(_i) \ - (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs) - -struct bch_sb_field_replicas { - struct bch_sb_field field; - struct bch_replicas_entry_v1 entries[]; -} __packed __aligned(8); - -/* BCH_SB_FIELD_disk_groups: */ - -#define BCH_SB_LABEL_SIZE 32 - -struct bch_disk_group { - __u8 label[BCH_SB_LABEL_SIZE]; - __le64 flags[2]; -} __packed __aligned(8); - -LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1) -LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6) -LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24) - -struct bch_sb_field_disk_groups { - struct bch_sb_field field; - struct bch_disk_group entries[]; -} __packed __aligned(8); - /* * On clean shutdown, store btree roots and current journal sequence number in * the superblock: @@ -809,27 +668,6 @@ struct bch_sb_field_clean { __u64 _data[]; }; -struct journal_seq_blacklist_entry { - __le64 start; - __le64 end; -}; - -struct bch_sb_field_journal_seq_blacklist { - struct bch_sb_field field; - struct journal_seq_blacklist_entry start[]; -}; - -struct bch_sb_field_errors { - struct bch_sb_field field; - struct bch_sb_field_error_entry { - __le64 v; - __le64 last_error_time; - } entries[]; -}; - -LE64_BITMASK(BCH_SB_ERROR_ENTRY_ID, struct bch_sb_field_error_entry, v, 0, 16); -LE64_BITMASK(BCH_SB_ERROR_ENTRY_NR, struct bch_sb_field_error_entry, v, 16, 64); - struct bch_sb_field_ext { struct bch_sb_field field; __le64 recovery_passes_required[2]; @@ -837,18 +675,6 @@ struct bch_sb_field_ext { __le64 btrees_lost_data; }; -struct bch_sb_field_downgrade_entry { - __le16 version; - __le64 recovery_passes[2]; - __le16 nr_errors; - __le16 errors[] __counted_by(nr_errors); -} __packed __aligned(2); - -struct bch_sb_field_downgrade { - struct bch_sb_field field; - struct bch_sb_field_downgrade_entry entries[]; -}; - /* Superblock: */ /* @@ -909,7 +735,6 @@ unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_re #define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) #define BCH_SB_SECTOR 8 -#define BCH_SB_MEMBERS_MAX 64 /* XXX kill */ #define BCH_SB_LAYOUT_SIZE_BITS_MAX 16 /* 32 MB */ diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 8035c8b797ab..dc97991bcd6a 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -585,16 +585,17 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, if (fsck_err_on(k.k->version.lo > atomic64_read(&c->key_version), c, bkey_version_in_future, - "key version number higher than recorded: %llu > %llu", - k.k->version.lo, - atomic64_read(&c->key_version))) + "key version number higher than recorded %llu\n %s", + atomic64_read(&c->key_version), + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) atomic64_set(&c->key_version, k.k->version.lo); } if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, k), c, btree_bitmap_not_marked, "btree ptr not marked in member info btree allocated bitmap\n %s", - (bch2_bkey_val_to_text(&buf, c, k), + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { mutex_lock(&c->sb_lock); bch2_dev_btree_bitmap_mark(c, k); @@ -673,8 +674,7 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool in static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) { - return (int) btree_id_to_gc_phase(l) - - (int) btree_id_to_gc_phase(r); + return cmp_int(gc_btree_order(l), gc_btree_order(r)); } static int bch2_gc_btrees(struct bch_fs *c) @@ -711,7 +711,7 @@ fsck_err: static int bch2_mark_superblocks(struct bch_fs *c) { mutex_lock(&c->sb_lock); - gc_pos_set(c, gc_phase(GC_PHASE_SB)); + gc_pos_set(c, gc_phase(GC_PHASE_sb)); int ret = bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_gc); mutex_unlock(&c->sb_lock); @@ -1209,7 +1209,7 @@ int bch2_check_allocations(struct bch_fs *c) if (ret) goto out; - gc_pos_set(c, gc_phase(GC_PHASE_START)); + gc_pos_set(c, gc_phase(GC_PHASE_start)); ret = bch2_mark_superblocks(c); BUG_ON(ret); @@ -1231,7 +1231,7 @@ out: percpu_down_write(&c->mark_lock); /* Indicates that gc is no longer in progress: */ - __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); + __gc_pos_set(c, gc_phase(GC_PHASE_not_running)); bch2_gc_free(c); percpu_up_write(&c->mark_lock); diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h index 1b6489d8e0f4..876d81e2017d 100644 --- a/fs/bcachefs/btree_gc.h +++ b/fs/bcachefs/btree_gc.h @@ -3,6 +3,7 @@ #define _BCACHEFS_BTREE_GC_H #include "bkey.h" +#include "btree_gc_types.h" #include "btree_types.h" int bch2_check_topology(struct bch_fs *); @@ -32,36 +33,15 @@ int bch2_check_allocations(struct bch_fs *); /* Position of (the start of) a gc phase: */ static inline struct gc_pos gc_phase(enum gc_phase phase) { - return (struct gc_pos) { - .phase = phase, - .level = 0, - .pos = POS_MIN, - }; -} - -static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r) -{ - return cmp_int(l.phase, r.phase) ?: - -cmp_int(l.level, r.level) ?: - bpos_cmp(l.pos, r.pos); -} - -static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id) -{ - switch (id) { -#define x(name, v, ...) case BTREE_ID_##name: return GC_PHASE_BTREE_##name; - BCH_BTREE_IDS() -#undef x - default: - BUG(); - } + return (struct gc_pos) { .phase = phase, }; } static inline struct gc_pos gc_pos_btree(enum btree_id btree, unsigned level, struct bpos pos) { return (struct gc_pos) { - .phase = btree_id_to_gc_phase(btree), + .phase = GC_PHASE_btree, + .btree = btree, .level = level, .pos = pos, }; @@ -76,6 +56,22 @@ static inline struct gc_pos gc_pos_btree_node(struct btree *b) return gc_pos_btree(b->c.btree_id, b->c.level, b->key.k.p); } +static inline int gc_btree_order(enum btree_id btree) +{ + if (btree == BTREE_ID_stripes) + return -1; + return btree; +} + +static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r) +{ + return cmp_int(l.phase, r.phase) ?: + cmp_int(gc_btree_order(l.btree), + gc_btree_order(r.btree)) ?: + -cmp_int(l.level, r.level) ?: + bpos_cmp(l.pos, r.pos); +} + static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos) { unsigned seq; diff --git a/fs/bcachefs/btree_gc_types.h b/fs/bcachefs/btree_gc_types.h new file mode 100644 index 000000000000..b82c24bcc088 --- /dev/null +++ b/fs/bcachefs/btree_gc_types.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_GC_TYPES_H +#define _BCACHEFS_BTREE_GC_TYPES_H + +#include <linux/generic-radix-tree.h> + +enum gc_phase { + GC_PHASE_not_running, + GC_PHASE_start, + GC_PHASE_sb, + GC_PHASE_btree, +}; + +struct gc_pos { + enum gc_phase phase:8; + enum btree_id btree:8; + u16 level; + struct bpos pos; +}; + +struct reflink_gc { + u64 offset; + u32 size; + u32 refcount; +}; + +typedef GENRADIX(struct reflink_gc) reflink_gc_table; + +#endif /* _BCACHEFS_BTREE_GC_TYPES_H */ diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index cbf8f5d90602..829c1b91477d 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -519,7 +519,7 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b) static void btree_err_msg(struct printbuf *out, struct bch_fs *c, struct bch_dev *ca, - struct btree *b, struct bset *i, + struct btree *b, struct bset *i, struct bkey_packed *k, unsigned offset, int write) { prt_printf(out, bch2_log_msg(c, "%s"), @@ -537,15 +537,20 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c, b->written, btree_ptr_sectors_written(&b->key)); if (i) prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s)); + if (k) + prt_printf(out, " bset byte offset %lu", + (unsigned long)(void *)k - + ((unsigned long)(void *)i & ~511UL)); prt_str(out, ": "); } -__printf(9, 10) +__printf(10, 11) static int __btree_err(int ret, struct bch_fs *c, struct bch_dev *ca, struct btree *b, struct bset *i, + struct bkey_packed *k, int write, bool have_retry, enum bch_sb_error_id err_type, @@ -555,7 +560,7 @@ static int __btree_err(int ret, bool silent = c->curr_recovery_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes; va_list args; - btree_err_msg(&out, c, ca, b, i, b->written, write); + btree_err_msg(&out, c, ca, b, i, k, b->written, write); va_start(args, fmt); prt_vprintf(&out, fmt, args); @@ -611,9 +616,9 @@ fsck_err: return ret; } -#define btree_err(type, c, ca, b, i, _err_type, msg, ...) \ +#define btree_err(type, c, ca, b, i, k, _err_type, msg, ...) \ ({ \ - int _ret = __btree_err(type, c, ca, b, i, write, have_retry, \ + int _ret = __btree_err(type, c, ca, b, i, k, write, have_retry, \ BCH_FSCK_ERR_##_err_type, \ msg, ##__VA_ARGS__); \ \ @@ -690,7 +695,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, btree_err_on(!bch2_version_compatible(version), -BCH_ERR_btree_node_read_err_incompatible, - c, ca, b, i, + c, ca, b, i, NULL, btree_node_unsupported_version, "unsupported bset version %u.%u", BCH_VERSION_MAJOR(version), @@ -698,7 +703,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, if (btree_err_on(version < c->sb.version_min, -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, + c, NULL, b, i, NULL, btree_node_bset_older_than_sb_min, "bset version %u older than superblock version_min %u", version, c->sb.version_min)) { @@ -711,7 +716,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, if (btree_err_on(BCH_VERSION_MAJOR(version) > BCH_VERSION_MAJOR(c->sb.version), -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, + c, NULL, b, i, NULL, btree_node_bset_newer_than_sb, "bset version %u newer than superblock version %u", version, c->sb.version)) { @@ -723,13 +728,13 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, btree_err_on(BSET_SEPARATE_WHITEOUTS(i), -BCH_ERR_btree_node_read_err_incompatible, - c, ca, b, i, + c, ca, b, i, NULL, btree_node_unsupported_version, "BSET_SEPARATE_WHITEOUTS no longer supported"); if (btree_err_on(offset + sectors > btree_sectors(c), -BCH_ERR_btree_node_read_err_fixable, - c, ca, b, i, + c, ca, b, i, NULL, bset_past_end_of_btree_node, "bset past end of btree node")) { i->u64s = 0; @@ -739,13 +744,13 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, btree_err_on(offset && !i->u64s, -BCH_ERR_btree_node_read_err_fixable, - c, ca, b, i, + c, ca, b, i, NULL, bset_empty, "empty bset"); btree_err_on(BSET_OFFSET(i) && BSET_OFFSET(i) != offset, -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, + c, ca, b, i, NULL, bset_wrong_sector_offset, "bset at wrong sector offset"); @@ -761,20 +766,20 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, /* XXX endianness */ btree_err_on(bp->seq != bn->keys.seq, -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, + c, ca, b, NULL, NULL, bset_bad_seq, "incorrect sequence number (wrong btree node)"); } btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id, -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, i, + c, ca, b, i, NULL, btree_node_bad_btree, "incorrect btree id"); btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level, -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, i, + c, ca, b, i, NULL, btree_node_bad_level, "incorrect level"); @@ -793,7 +798,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, btree_err_on(!bpos_eq(b->data->min_key, bp->min_key), -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, + c, ca, b, NULL, NULL, btree_node_bad_min_key, "incorrect min_key: got %s should be %s", (printbuf_reset(&buf1), @@ -804,7 +809,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, btree_err_on(!bpos_eq(bn->max_key, b->key.k.p), -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, i, + c, ca, b, i, NULL, btree_node_bad_max_key, "incorrect max key %s", (printbuf_reset(&buf1), @@ -816,7 +821,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, btree_err_on(bch2_bkey_format_invalid(c, &bn->format, write, &buf1), -BCH_ERR_btree_node_read_err_bad_node, - c, ca, b, i, + c, ca, b, i, NULL, btree_node_bad_format, "invalid bkey format: %s\n %s", buf1.buf, (printbuf_reset(&buf2), @@ -883,7 +888,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, if (btree_err_on(bkey_p_next(k) > vstruct_last(i), -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, + c, NULL, b, i, k, btree_node_bkey_past_bset_end, "key extends past end of bset")) { i->u64s = cpu_to_le16((u64 *) k - i->_data); @@ -892,14 +897,14 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, if (btree_err_on(k->format > KEY_FORMAT_CURRENT, -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, + c, NULL, b, i, k, btree_node_bkey_bad_format, "invalid bkey format %u", k->format)) goto drop_this_key; if (btree_err_on(!bkeyp_u64s_valid(&b->format, k), -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, + c, NULL, b, i, k, btree_node_bkey_bad_u64s, "bad k->u64s %u (min %u max %zu)", k->u64s, bkeyp_key_u64s(&b->format, k), @@ -921,7 +926,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, bch2_bkey_val_to_text(&buf, c, u.s_c); btree_err(-BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, + c, NULL, b, i, k, btree_node_bad_bkey, "invalid bkey: %s", buf.buf); goto drop_this_key; @@ -942,7 +947,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, bch2_bkey_to_text(&buf, u.k); if (btree_err(-BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, + c, NULL, b, i, k, btree_node_bkey_out_of_order, "%s", buf.buf)) goto drop_this_key; @@ -1011,13 +1016,13 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, if (bch2_meta_read_fault("btree")) btree_err(-BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, + c, ca, b, NULL, NULL, btree_node_fault_injected, "dynamic fault"); btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c), -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, + c, ca, b, NULL, NULL, btree_node_bad_magic, "bad magic: want %llx, got %llx", bset_magic(c), le64_to_cpu(b->data->magic)); @@ -1032,7 +1037,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_err_on(b->data->keys.seq != bp->seq, -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, + c, ca, b, NULL, NULL, btree_node_bad_seq, "got wrong btree node: got\n%s", (printbuf_reset(&buf), @@ -1041,7 +1046,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, } else { btree_err_on(!b->data->keys.seq, -BCH_ERR_btree_node_read_err_must_retry, - c, ca, b, NULL, + c, ca, b, NULL, NULL, btree_node_bad_seq, "bad btree header: seq 0\n%s", (printbuf_reset(&buf), @@ -1060,7 +1065,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, + c, ca, b, i, NULL, bset_unknown_csum, "unknown checksum type %llu", BSET_CSUM_TYPE(i)); @@ -1073,7 +1078,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_err_on(csum_bad, -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, + c, ca, b, i, NULL, bset_bad_csum, "%s", (printbuf_reset(&buf), @@ -1088,7 +1093,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_err_on(btree_node_type_is_extents(btree_node_type(b)) && !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data), -BCH_ERR_btree_node_read_err_incompatible, - c, NULL, b, NULL, + c, NULL, b, NULL, NULL, btree_node_unsupported_version, "btree node does not have NEW_EXTENT_OVERWRITE set"); @@ -1102,7 +1107,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, + c, ca, b, i, NULL, bset_unknown_csum, "unknown checksum type %llu", BSET_CSUM_TYPE(i)); @@ -1114,7 +1119,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_err_on(csum_bad, -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, + c, ca, b, i, NULL, bset_bad_csum, "%s", (printbuf_reset(&buf), @@ -1152,14 +1157,14 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_err_on(blacklisted && first, -BCH_ERR_btree_node_read_err_fixable, - c, ca, b, i, + c, ca, b, i, NULL, bset_blacklisted_journal_seq, "first btree node bset has blacklisted journal seq (%llu)", le64_to_cpu(i->journal_seq)); btree_err_on(blacklisted && ptr_written, -BCH_ERR_btree_node_read_err_fixable, - c, ca, b, i, + c, ca, b, i, NULL, first_bset_blacklisted_journal_seq, "found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u", le64_to_cpu(i->journal_seq), @@ -1178,7 +1183,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, if (ptr_written) { btree_err_on(b->written < ptr_written, -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, NULL, + c, ca, b, NULL, NULL, btree_node_data_missing, "btree node data missing: expected %u sectors, found %u", ptr_written, b->written); @@ -1191,7 +1196,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, le64_to_cpu(bne->keys.journal_seq), true), -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, NULL, + c, ca, b, NULL, NULL, btree_node_bset_after_end, "found bset signature after last bset"); } @@ -1235,7 +1240,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, bch2_bkey_val_to_text(&buf, c, u.s_c); btree_err(-BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, + c, NULL, b, i, k, btree_node_bad_bkey, "%s", buf.buf); @@ -1471,18 +1476,18 @@ static CLOSURE_CALLBACK(btree_node_read_all_replicas_done) written2 = btree_node_sectors_written(c, ra->buf[i]); if (btree_err_on(written2 != written, -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, NULL, + c, NULL, b, NULL, NULL, btree_node_replicas_sectors_written_mismatch, "btree node sectors written mismatch: %u != %u", written, written2) || btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]), -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, NULL, + c, NULL, b, NULL, NULL, btree_node_bset_after_end, "found bset signature after last bset") || btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9), -BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, NULL, + c, NULL, b, NULL, NULL, btree_node_replicas_data_mismatch, "btree node replicas content mismatch")) dump_bset_maps = true; diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 75f5e6fe4634..34056aaece00 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -424,16 +424,16 @@ static int btree_key_cache_fill(struct btree_trans *trans, goto err; } - if (!bch2_btree_node_relock(trans, ck_path, 0)) { + ret = bch2_trans_relock(trans); + if (ret) { kfree(new_k); - trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill); goto err; } - ret = bch2_trans_relock(trans); - if (ret) { + if (!bch2_btree_node_relock(trans, ck_path, 0)) { kfree(new_k); + trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill); goto err; } } diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index c3e9b0cc7bbd..d66fff22109a 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -215,6 +215,7 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle) if (unlikely(!best)) { struct printbuf buf = PRINTBUF; + buf.atomic++; prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks")); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index b469586517a8..ed97712d0db1 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1134,7 +1134,7 @@ static int __trigger_extent(struct btree_trans *trans, r.e.nr_required = 1; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - s64 disk_sectors; + s64 disk_sectors = 0; ret = bch2_trigger_pointer(trans, btree_id, level, k, p, entry, &disk_sectors, flags); if (ret < 0) return ret; diff --git a/fs/bcachefs/disk_groups_format.h b/fs/bcachefs/disk_groups_format.h new file mode 100644 index 000000000000..698990bbf1d2 --- /dev/null +++ b/fs/bcachefs/disk_groups_format.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_DISK_GROUPS_FORMAT_H +#define _BCACHEFS_DISK_GROUPS_FORMAT_H + +#define BCH_SB_LABEL_SIZE 32 + +struct bch_disk_group { + __u8 label[BCH_SB_LABEL_SIZE]; + __le64 flags[2]; +} __packed __aligned(8); + +LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1) +LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6) +LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24) + +struct bch_sb_field_disk_groups { + struct bch_sb_field field; + struct bch_disk_group entries[]; +} __packed __aligned(8); + +#endif /* _BCACHEFS_DISK_GROUPS_FORMAT_H */ diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index b26dc7424662..d8b9beca3776 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -908,7 +908,7 @@ static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) if (!genradix_ptr_alloc(&c->stripes, idx, gfp)) return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; - if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING && + if (c->gc_pos.phase != GC_PHASE_not_running && !genradix_ptr_alloc(&c->gc_stripes, idx, gfp)) return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index 6b69e5cd68dd..54873ecc635c 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -437,8 +437,8 @@ static void bch2_writepage_io_done(struct bch_write_op *op) */ /* - * PageWriteback is effectively our ref on the inode - fixup i_blocks - * before calling end_page_writeback: + * The writeback flag is effectively our ref on the inode - + * fixup i_blocks before calling folio_end_writeback: */ bch2_i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); @@ -898,7 +898,7 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, darray_for_each(fs, fi) { f = *fi; f_len = min(end, folio_end_pos(f)) - f_pos; - f_copied = copy_page_from_iter_atomic(&f->page, f_offset, f_len, iter); + f_copied = copy_folio_from_iter_atomic(f, f_offset, f_len, iter); if (!f_copied) { folios_trunc(&fs, fi); break; diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index 09d21aef879a..049b61bc9a5b 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -609,8 +609,10 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) if (unlikely(ret)) goto err_put_write_ref; - if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) + if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) { + ret = -EINVAL; goto err_put_write_ref; + } inode_dio_begin(&inode->v); bch2_pagecache_block_get(inode); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 96040a95cf46..cd388f1702dc 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1939,8 +1939,7 @@ got_sb: if (IS_ERR(sb)) { ret = PTR_ERR(sb); - ret = bch2_err_class(ret); - return ERR_PTR(ret); + goto err; } c = sb->s_fs_info; @@ -2016,6 +2015,15 @@ out: err_put_super: __bch2_fs_stop(c); deactivate_locked_super(sb); +err: + /* + * On an inconsistency error in recovery we might see an -EROFS derived + * errorcode (from the journal), but we don't want to return that to + * userspace as that causes util-linux to retry the mount RO - which is + * confusing: + */ + if (bch2_err_matches(ret, EROFS) && ret != -EROFS) + ret = -EIO; return ERR_PTR(bch2_err_class(ret)); } diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index c8f57465131c..fd277bd58ed3 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -77,21 +77,17 @@ static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, struct bkey_s_c k; int ret; - bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, - POS(0, inode_nr), - BTREE_ITER_all_snapshots); - k = bch2_btree_iter_peek(&iter); - ret = bkey_err(k); - if (ret) - goto err; - - if (!k.k || !bkey_eq(k.k->p, POS(0, inode_nr))) { - ret = -BCH_ERR_ENOENT_inode; - goto err; + for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr), + BTREE_ITER_all_snapshots, k, ret) { + if (k.k->p.offset != inode_nr) + break; + if (!bkey_is_inode(k.k)) + continue; + ret = bch2_inode_unpack(k, inode); + goto found; } - - ret = bch2_inode_unpack(k, inode); -err: + ret = -BCH_ERR_ENOENT_inode; +found: bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr); bch2_trans_iter_exit(trans, &iter); return ret; @@ -770,25 +766,6 @@ static int get_visible_inodes(struct btree_trans *trans, return ret; } -static int check_key_has_snapshot(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - int ret = 0; - - if (mustfix_fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c, - bkey_in_missing_snapshot, - "key in missing snapshot: %s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - ret = bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_internal_snapshot_node) ?: 1; -fsck_err: - printbuf_exit(&buf); - return ret; -} - static int hash_redo_key(struct btree_trans *trans, const struct bch_hash_desc desc, struct bch_hash_info *hash_info, @@ -983,7 +960,7 @@ static int check_inode(struct btree_trans *trans, bool do_update = false; int ret; - ret = check_key_has_snapshot(trans, iter, k); + ret = bch2_check_key_has_snapshot(trans, iter, k); if (ret < 0) goto err; if (ret) @@ -1487,7 +1464,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, struct printbuf buf = PRINTBUF; int ret = 0; - ret = check_key_has_snapshot(trans, iter, k); + ret = bch2_check_key_has_snapshot(trans, iter, k); if (ret) { ret = ret < 0 ? ret : 0; goto out; @@ -2010,7 +1987,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, struct printbuf buf = PRINTBUF; int ret = 0; - ret = check_key_has_snapshot(trans, iter, k); + ret = bch2_check_key_has_snapshot(trans, iter, k); if (ret) { ret = ret < 0 ? ret : 0; goto out; @@ -2165,7 +2142,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, struct inode_walker_entry *i; int ret; - ret = check_key_has_snapshot(trans, iter, k); + ret = bch2_check_key_has_snapshot(trans, iter, k); if (ret < 0) return ret; if (ret) diff --git a/fs/bcachefs/journal_seq_blacklist_format.h b/fs/bcachefs/journal_seq_blacklist_format.h new file mode 100644 index 000000000000..2566b12dbc04 --- /dev/null +++ b/fs/bcachefs/journal_seq_blacklist_format.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H +#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H + +struct journal_seq_blacklist_entry { + __le64 start; + __le64 end; +}; + +struct bch_sb_field_journal_seq_blacklist { + struct bch_sb_field field; + struct journal_seq_blacklist_entry start[]; +}; + +#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H */ diff --git a/fs/bcachefs/mean_and_variance_test.c b/fs/bcachefs/mean_and_variance_test.c index 4c298e74723d..e9d9c0212e44 100644 --- a/fs/bcachefs/mean_and_variance_test.c +++ b/fs/bcachefs/mean_and_variance_test.c @@ -217,4 +217,5 @@ static struct kunit_suite mean_and_variance_test_suite = { kunit_test_suite(mean_and_variance_test_suite); MODULE_AUTHOR("Daniel B. Hill"); +MODULE_DESCRIPTION("bcachefs filesystem mean and variance unit tests"); MODULE_LICENSE("GPL"); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 8171f947fac8..6e477fadaa2a 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -547,6 +547,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, ctxt->stats->pos = BBPOS(btree_id, start); } + bch2_trans_begin(trans); bch2_trans_iter_init(trans, &iter, btree_id, start, BTREE_ITER_prefetch| BTREE_ITER_all_snapshots); @@ -920,7 +921,20 @@ static bool rereplicate_pred(struct bch_fs *c, void *arg, ? c->opts.metadata_replicas : io_opts->data_replicas; - if (!nr_good || nr_good >= replicas) + rcu_read_lock(); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + unsigned i = 0; + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); + if (!ptr->cached && + (!ca || !ca->mi.durability)) + data_opts->kill_ptrs |= BIT(i); + i++; + } + rcu_read_unlock(); + + if (!data_opts->kill_ptrs && + (!nr_good || nr_good >= replicas)) return false; data_opts->target = 0; diff --git a/fs/bcachefs/replicas_format.h b/fs/bcachefs/replicas_format.h new file mode 100644 index 000000000000..b97208195d06 --- /dev/null +++ b/fs/bcachefs/replicas_format.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_REPLICAS_FORMAT_H +#define _BCACHEFS_REPLICAS_FORMAT_H + +struct bch_replicas_entry_v0 { + __u8 data_type; + __u8 nr_devs; + __u8 devs[]; +} __packed; + +struct bch_sb_field_replicas_v0 { + struct bch_sb_field field; + struct bch_replicas_entry_v0 entries[]; +} __packed __aligned(8); + +struct bch_replicas_entry_v1 { + __u8 data_type; + __u8 nr_devs; + __u8 nr_required; + __u8 devs[]; +} __packed; + +struct bch_sb_field_replicas { + struct bch_sb_field field; + struct bch_replicas_entry_v1 entries[]; +} __packed __aligned(8); + +#define replicas_entry_bytes(_i) \ + (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs) + +#endif /* _BCACHEFS_REPLICAS_FORMAT_H */ diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 390a1bbd2567..3fb23e399ffb 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -146,10 +146,17 @@ static int bch2_sb_downgrade_validate(struct bch_sb *sb, struct bch_sb_field *f, for (const struct bch_sb_field_downgrade_entry *i = e->entries; (void *) i < vstruct_end(&e->field); i = downgrade_entry_next_c(i)) { + /* + * Careful: sb_field_downgrade_entry is only 2 byte aligned, but + * section sizes are 8 byte aligned - an empty entry spanning + * the end of the section is allowed (and ignored): + */ + if ((void *) &i->errors[0] > vstruct_end(&e->field)) + break; + if (flags & BCH_VALIDATE_write && - ((void *) &i->errors[0] > vstruct_end(&e->field) || - (void *) downgrade_entry_next_c(i) > vstruct_end(&e->field))) { - prt_printf(err, "downgrade entry overruns end of superblock section)"); + (void *) downgrade_entry_next_c(i) > vstruct_end(&e->field)) { + prt_printf(err, "downgrade entry overruns end of superblock section"); return -BCH_ERR_invalid_sb_downgrade; } diff --git a/fs/bcachefs/sb-downgrade_format.h b/fs/bcachefs/sb-downgrade_format.h new file mode 100644 index 000000000000..cffd932be3ec --- /dev/null +++ b/fs/bcachefs/sb-downgrade_format.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SB_DOWNGRADE_FORMAT_H +#define _BCACHEFS_SB_DOWNGRADE_FORMAT_H + +struct bch_sb_field_downgrade_entry { + __le16 version; + __le64 recovery_passes[2]; + __le16 nr_errors; + __le16 errors[] __counted_by(nr_errors); +} __packed __aligned(2); + +struct bch_sb_field_downgrade { + struct bch_sb_field field; + struct bch_sb_field_downgrade_entry entries[]; +}; + +#endif /* _BCACHEFS_SB_DOWNGRADE_FORMAT_H */ diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h new file mode 100644 index 000000000000..84d2763bd597 --- /dev/null +++ b/fs/bcachefs/sb-errors_format.h @@ -0,0 +1,296 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SB_ERRORS_FORMAT_H +#define _BCACHEFS_SB_ERRORS_FORMAT_H + +#define BCH_SB_ERRS() \ + x(clean_but_journal_not_empty, 0) \ + x(dirty_but_no_journal_entries, 1) \ + x(dirty_but_no_journal_entries_post_drop_nonflushes, 2) \ + x(sb_clean_journal_seq_mismatch, 3) \ + x(sb_clean_btree_root_mismatch, 4) \ + x(sb_clean_missing, 5) \ + x(jset_unsupported_version, 6) \ + x(jset_unknown_csum, 7) \ + x(jset_last_seq_newer_than_seq, 8) \ + x(jset_past_bucket_end, 9) \ + x(jset_seq_blacklisted, 10) \ + x(journal_entries_missing, 11) \ + x(journal_entry_replicas_not_marked, 12) \ + x(journal_entry_past_jset_end, 13) \ + x(journal_entry_replicas_data_mismatch, 14) \ + x(journal_entry_bkey_u64s_0, 15) \ + x(journal_entry_bkey_past_end, 16) \ + x(journal_entry_bkey_bad_format, 17) \ + x(journal_entry_bkey_invalid, 18) \ + x(journal_entry_btree_root_bad_size, 19) \ + x(journal_entry_blacklist_bad_size, 20) \ + x(journal_entry_blacklist_v2_bad_size, 21) \ + x(journal_entry_blacklist_v2_start_past_end, 22) \ + x(journal_entry_usage_bad_size, 23) \ + x(journal_entry_data_usage_bad_size, 24) \ + x(journal_entry_clock_bad_size, 25) \ + x(journal_entry_clock_bad_rw, 26) \ + x(journal_entry_dev_usage_bad_size, 27) \ + x(journal_entry_dev_usage_bad_dev, 28) \ + x(journal_entry_dev_usage_bad_pad, 29) \ + x(btree_node_unreadable, 30) \ + x(btree_node_fault_injected, 31) \ + x(btree_node_bad_magic, 32) \ + x(btree_node_bad_seq, 33) \ + x(btree_node_unsupported_version, 34) \ + x(btree_node_bset_older_than_sb_min, 35) \ + x(btree_node_bset_newer_than_sb, 36) \ + x(btree_node_data_missing, 37) \ + x(btree_node_bset_after_end, 38) \ + x(btree_node_replicas_sectors_written_mismatch, 39) \ + x(btree_node_replicas_data_mismatch, 40) \ + x(bset_unknown_csum, 41) \ + x(bset_bad_csum, 42) \ + x(bset_past_end_of_btree_node, 43) \ + x(bset_wrong_sector_offset, 44) \ + x(bset_empty, 45) \ + x(bset_bad_seq, 46) \ + x(bset_blacklisted_journal_seq, 47) \ + x(first_bset_blacklisted_journal_seq, 48) \ + x(btree_node_bad_btree, 49) \ + x(btree_node_bad_level, 50) \ + x(btree_node_bad_min_key, 51) \ + x(btree_node_bad_max_key, 52) \ + x(btree_node_bad_format, 53) \ + x(btree_node_bkey_past_bset_end, 54) \ + x(btree_node_bkey_bad_format, 55) \ + x(btree_node_bad_bkey, 56) \ + x(btree_node_bkey_out_of_order, 57) \ + x(btree_root_bkey_invalid, 58) \ + x(btree_root_read_error, 59) \ + x(btree_root_bad_min_key, 60) \ + x(btree_root_bad_max_key, 61) \ + x(btree_node_read_error, 62) \ + x(btree_node_topology_bad_min_key, 63) \ + x(btree_node_topology_bad_max_key, 64) \ + x(btree_node_topology_overwritten_by_prev_node, 65) \ + x(btree_node_topology_overwritten_by_next_node, 66) \ + x(btree_node_topology_interior_node_empty, 67) \ + x(fs_usage_hidden_wrong, 68) \ + x(fs_usage_btree_wrong, 69) \ + x(fs_usage_data_wrong, 70) \ + x(fs_usage_cached_wrong, 71) \ + x(fs_usage_reserved_wrong, 72) \ + x(fs_usage_persistent_reserved_wrong, 73) \ + x(fs_usage_nr_inodes_wrong, 74) \ + x(fs_usage_replicas_wrong, 75) \ + x(dev_usage_buckets_wrong, 76) \ + x(dev_usage_sectors_wrong, 77) \ + x(dev_usage_fragmented_wrong, 78) \ + x(dev_usage_buckets_ec_wrong, 79) \ + x(bkey_version_in_future, 80) \ + x(bkey_u64s_too_small, 81) \ + x(bkey_invalid_type_for_btree, 82) \ + x(bkey_extent_size_zero, 83) \ + x(bkey_extent_size_greater_than_offset, 84) \ + x(bkey_size_nonzero, 85) \ + x(bkey_snapshot_nonzero, 86) \ + x(bkey_snapshot_zero, 87) \ + x(bkey_at_pos_max, 88) \ + x(bkey_before_start_of_btree_node, 89) \ + x(bkey_after_end_of_btree_node, 90) \ + x(bkey_val_size_nonzero, 91) \ + x(bkey_val_size_too_small, 92) \ + x(alloc_v1_val_size_bad, 93) \ + x(alloc_v2_unpack_error, 94) \ + x(alloc_v3_unpack_error, 95) \ + x(alloc_v4_val_size_bad, 96) \ + x(alloc_v4_backpointers_start_bad, 97) \ + x(alloc_key_data_type_bad, 98) \ + x(alloc_key_empty_but_have_data, 99) \ + x(alloc_key_dirty_sectors_0, 100) \ + x(alloc_key_data_type_inconsistency, 101) \ + x(alloc_key_to_missing_dev_bucket, 102) \ + x(alloc_key_cached_inconsistency, 103) \ + x(alloc_key_cached_but_read_time_zero, 104) \ + x(alloc_key_to_missing_lru_entry, 105) \ + x(alloc_key_data_type_wrong, 106) \ + x(alloc_key_gen_wrong, 107) \ + x(alloc_key_dirty_sectors_wrong, 108) \ + x(alloc_key_cached_sectors_wrong, 109) \ + x(alloc_key_stripe_wrong, 110) \ + x(alloc_key_stripe_redundancy_wrong, 111) \ + x(bucket_sector_count_overflow, 112) \ + x(bucket_metadata_type_mismatch, 113) \ + x(need_discard_key_wrong, 114) \ + x(freespace_key_wrong, 115) \ + x(freespace_hole_missing, 116) \ + x(bucket_gens_val_size_bad, 117) \ + x(bucket_gens_key_wrong, 118) \ + x(bucket_gens_hole_wrong, 119) \ + x(bucket_gens_to_invalid_dev, 120) \ + x(bucket_gens_to_invalid_buckets, 121) \ + x(bucket_gens_nonzero_for_invalid_buckets, 122) \ + x(need_discard_freespace_key_to_invalid_dev_bucket, 123) \ + x(need_discard_freespace_key_bad, 124) \ + x(backpointer_bucket_offset_wrong, 125) \ + x(backpointer_to_missing_device, 126) \ + x(backpointer_to_missing_alloc, 127) \ + x(backpointer_to_missing_ptr, 128) \ + x(lru_entry_at_time_0, 129) \ + x(lru_entry_to_invalid_bucket, 130) \ + x(lru_entry_bad, 131) \ + x(btree_ptr_val_too_big, 132) \ + x(btree_ptr_v2_val_too_big, 133) \ + x(btree_ptr_has_non_ptr, 134) \ + x(extent_ptrs_invalid_entry, 135) \ + x(extent_ptrs_no_ptrs, 136) \ + x(extent_ptrs_too_many_ptrs, 137) \ + x(extent_ptrs_redundant_crc, 138) \ + x(extent_ptrs_redundant_stripe, 139) \ + x(extent_ptrs_unwritten, 140) \ + x(extent_ptrs_written_and_unwritten, 141) \ + x(ptr_to_invalid_device, 142) \ + x(ptr_to_duplicate_device, 143) \ + x(ptr_after_last_bucket, 144) \ + x(ptr_before_first_bucket, 145) \ + x(ptr_spans_multiple_buckets, 146) \ + x(ptr_to_missing_backpointer, 147) \ + x(ptr_to_missing_alloc_key, 148) \ + x(ptr_to_missing_replicas_entry, 149) \ + x(ptr_to_missing_stripe, 150) \ + x(ptr_to_incorrect_stripe, 151) \ + x(ptr_gen_newer_than_bucket_gen, 152) \ + x(ptr_too_stale, 153) \ + x(stale_dirty_ptr, 154) \ + x(ptr_bucket_data_type_mismatch, 155) \ + x(ptr_cached_and_erasure_coded, 156) \ + x(ptr_crc_uncompressed_size_too_small, 157) \ + x(ptr_crc_csum_type_unknown, 158) \ + x(ptr_crc_compression_type_unknown, 159) \ + x(ptr_crc_redundant, 160) \ + x(ptr_crc_uncompressed_size_too_big, 161) \ + x(ptr_crc_nonce_mismatch, 162) \ + x(ptr_stripe_redundant, 163) \ + x(reservation_key_nr_replicas_invalid, 164) \ + x(reflink_v_refcount_wrong, 165) \ + x(reflink_p_to_missing_reflink_v, 166) \ + x(stripe_pos_bad, 167) \ + x(stripe_val_size_bad, 168) \ + x(stripe_sector_count_wrong, 169) \ + x(snapshot_tree_pos_bad, 170) \ + x(snapshot_tree_to_missing_snapshot, 171) \ + x(snapshot_tree_to_missing_subvol, 172) \ + x(snapshot_tree_to_wrong_subvol, 173) \ + x(snapshot_tree_to_snapshot_subvol, 174) \ + x(snapshot_pos_bad, 175) \ + x(snapshot_parent_bad, 176) \ + x(snapshot_children_not_normalized, 177) \ + x(snapshot_child_duplicate, 178) \ + x(snapshot_child_bad, 179) \ + x(snapshot_skiplist_not_normalized, 180) \ + x(snapshot_skiplist_bad, 181) \ + x(snapshot_should_not_have_subvol, 182) \ + x(snapshot_to_bad_snapshot_tree, 183) \ + x(snapshot_bad_depth, 184) \ + x(snapshot_bad_skiplist, 185) \ + x(subvol_pos_bad, 186) \ + x(subvol_not_master_and_not_snapshot, 187) \ + x(subvol_to_missing_root, 188) \ + x(subvol_root_wrong_bi_subvol, 189) \ + x(bkey_in_missing_snapshot, 190) \ + x(inode_pos_inode_nonzero, 191) \ + x(inode_pos_blockdev_range, 192) \ + x(inode_unpack_error, 193) \ + x(inode_str_hash_invalid, 194) \ + x(inode_v3_fields_start_bad, 195) \ + x(inode_snapshot_mismatch, 196) \ + x(inode_unlinked_but_clean, 197) \ + x(inode_unlinked_but_nlink_nonzero, 198) \ + x(inode_checksum_type_invalid, 199) \ + x(inode_compression_type_invalid, 200) \ + x(inode_subvol_root_but_not_dir, 201) \ + x(inode_i_size_dirty_but_clean, 202) \ + x(inode_i_sectors_dirty_but_clean, 203) \ + x(inode_i_sectors_wrong, 204) \ + x(inode_dir_wrong_nlink, 205) \ + x(inode_dir_multiple_links, 206) \ + x(inode_multiple_links_but_nlink_0, 207) \ + x(inode_wrong_backpointer, 208) \ + x(inode_wrong_nlink, 209) \ + x(inode_unreachable, 210) \ + x(deleted_inode_but_clean, 211) \ + x(deleted_inode_missing, 212) \ + x(deleted_inode_is_dir, 213) \ + x(deleted_inode_not_unlinked, 214) \ + x(extent_overlapping, 215) \ + x(extent_in_missing_inode, 216) \ + x(extent_in_non_reg_inode, 217) \ + x(extent_past_end_of_inode, 218) \ + x(dirent_empty_name, 219) \ + x(dirent_val_too_big, 220) \ + x(dirent_name_too_long, 221) \ + x(dirent_name_embedded_nul, 222) \ + x(dirent_name_dot_or_dotdot, 223) \ + x(dirent_name_has_slash, 224) \ + x(dirent_d_type_wrong, 225) \ + x(inode_bi_parent_wrong, 226) \ + x(dirent_in_missing_dir_inode, 227) \ + x(dirent_in_non_dir_inode, 228) \ + x(dirent_to_missing_inode, 229) \ + x(dirent_to_missing_subvol, 230) \ + x(dirent_to_itself, 231) \ + x(quota_type_invalid, 232) \ + x(xattr_val_size_too_small, 233) \ + x(xattr_val_size_too_big, 234) \ + x(xattr_invalid_type, 235) \ + x(xattr_name_invalid_chars, 236) \ + x(xattr_in_missing_inode, 237) \ + x(root_subvol_missing, 238) \ + x(root_dir_missing, 239) \ + x(root_inode_not_dir, 240) \ + x(dir_loop, 241) \ + x(hash_table_key_duplicate, 242) \ + x(hash_table_key_wrong_offset, 243) \ + x(unlinked_inode_not_on_deleted_list, 244) \ + x(reflink_p_front_pad_bad, 245) \ + x(journal_entry_dup_same_device, 246) \ + x(inode_bi_subvol_missing, 247) \ + x(inode_bi_subvol_wrong, 248) \ + x(inode_points_to_missing_dirent, 249) \ + x(inode_points_to_wrong_dirent, 250) \ + x(inode_bi_parent_nonzero, 251) \ + x(dirent_to_missing_parent_subvol, 252) \ + x(dirent_not_visible_in_parent_subvol, 253) \ + x(subvol_fs_path_parent_wrong, 254) \ + x(subvol_root_fs_path_parent_nonzero, 255) \ + x(subvol_children_not_set, 256) \ + x(subvol_children_bad, 257) \ + x(subvol_loop, 258) \ + x(subvol_unreachable, 259) \ + x(btree_node_bkey_bad_u64s, 260) \ + x(btree_node_topology_empty_interior_node, 261) \ + x(btree_ptr_v2_min_key_bad, 262) \ + x(btree_root_unreadable_and_scan_found_nothing, 263) \ + x(snapshot_node_missing, 264) \ + x(dup_backpointer_to_bad_csum_extent, 265) \ + x(btree_bitmap_not_marked, 266) \ + x(sb_clean_entry_overrun, 267) \ + x(btree_ptr_v2_written_0, 268) \ + x(subvol_snapshot_bad, 269) \ + x(subvol_inode_bad, 270) + +enum bch_sb_error_id { +#define x(t, n) BCH_FSCK_ERR_##t = n, + BCH_SB_ERRS() +#undef x + BCH_SB_ERR_MAX +}; + +struct bch_sb_field_errors { + struct bch_sb_field field; + struct bch_sb_field_error_entry { + __le64 v; + __le64 last_error_time; + } entries[]; +}; + +LE64_BITMASK(BCH_SB_ERROR_ENTRY_ID, struct bch_sb_field_error_entry, v, 0, 16); +LE64_BITMASK(BCH_SB_ERROR_ENTRY_NR, struct bch_sb_field_error_entry, v, 16, 64); + +#endif /* _BCACHEFS_SB_ERRORS_FORMAT_H */ diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h index 666599d3fb9d..40325239c3b0 100644 --- a/fs/bcachefs/sb-errors_types.h +++ b/fs/bcachefs/sb-errors_types.h @@ -4,286 +4,6 @@ #include "darray.h" -#define BCH_SB_ERRS() \ - x(clean_but_journal_not_empty, 0) \ - x(dirty_but_no_journal_entries, 1) \ - x(dirty_but_no_journal_entries_post_drop_nonflushes, 2) \ - x(sb_clean_journal_seq_mismatch, 3) \ - x(sb_clean_btree_root_mismatch, 4) \ - x(sb_clean_missing, 5) \ - x(jset_unsupported_version, 6) \ - x(jset_unknown_csum, 7) \ - x(jset_last_seq_newer_than_seq, 8) \ - x(jset_past_bucket_end, 9) \ - x(jset_seq_blacklisted, 10) \ - x(journal_entries_missing, 11) \ - x(journal_entry_replicas_not_marked, 12) \ - x(journal_entry_past_jset_end, 13) \ - x(journal_entry_replicas_data_mismatch, 14) \ - x(journal_entry_bkey_u64s_0, 15) \ - x(journal_entry_bkey_past_end, 16) \ - x(journal_entry_bkey_bad_format, 17) \ - x(journal_entry_bkey_invalid, 18) \ - x(journal_entry_btree_root_bad_size, 19) \ - x(journal_entry_blacklist_bad_size, 20) \ - x(journal_entry_blacklist_v2_bad_size, 21) \ - x(journal_entry_blacklist_v2_start_past_end, 22) \ - x(journal_entry_usage_bad_size, 23) \ - x(journal_entry_data_usage_bad_size, 24) \ - x(journal_entry_clock_bad_size, 25) \ - x(journal_entry_clock_bad_rw, 26) \ - x(journal_entry_dev_usage_bad_size, 27) \ - x(journal_entry_dev_usage_bad_dev, 28) \ - x(journal_entry_dev_usage_bad_pad, 29) \ - x(btree_node_unreadable, 30) \ - x(btree_node_fault_injected, 31) \ - x(btree_node_bad_magic, 32) \ - x(btree_node_bad_seq, 33) \ - x(btree_node_unsupported_version, 34) \ - x(btree_node_bset_older_than_sb_min, 35) \ - x(btree_node_bset_newer_than_sb, 36) \ - x(btree_node_data_missing, 37) \ - x(btree_node_bset_after_end, 38) \ - x(btree_node_replicas_sectors_written_mismatch, 39) \ - x(btree_node_replicas_data_mismatch, 40) \ - x(bset_unknown_csum, 41) \ - x(bset_bad_csum, 42) \ - x(bset_past_end_of_btree_node, 43) \ - x(bset_wrong_sector_offset, 44) \ - x(bset_empty, 45) \ - x(bset_bad_seq, 46) \ - x(bset_blacklisted_journal_seq, 47) \ - x(first_bset_blacklisted_journal_seq, 48) \ - x(btree_node_bad_btree, 49) \ - x(btree_node_bad_level, 50) \ - x(btree_node_bad_min_key, 51) \ - x(btree_node_bad_max_key, 52) \ - x(btree_node_bad_format, 53) \ - x(btree_node_bkey_past_bset_end, 54) \ - x(btree_node_bkey_bad_format, 55) \ - x(btree_node_bad_bkey, 56) \ - x(btree_node_bkey_out_of_order, 57) \ - x(btree_root_bkey_invalid, 58) \ - x(btree_root_read_error, 59) \ - x(btree_root_bad_min_key, 60) \ - x(btree_root_bad_max_key, 61) \ - x(btree_node_read_error, 62) \ - x(btree_node_topology_bad_min_key, 63) \ - x(btree_node_topology_bad_max_key, 64) \ - x(btree_node_topology_overwritten_by_prev_node, 65) \ - x(btree_node_topology_overwritten_by_next_node, 66) \ - x(btree_node_topology_interior_node_empty, 67) \ - x(fs_usage_hidden_wrong, 68) \ - x(fs_usage_btree_wrong, 69) \ - x(fs_usage_data_wrong, 70) \ - x(fs_usage_cached_wrong, 71) \ - x(fs_usage_reserved_wrong, 72) \ - x(fs_usage_persistent_reserved_wrong, 73) \ - x(fs_usage_nr_inodes_wrong, 74) \ - x(fs_usage_replicas_wrong, 75) \ - x(dev_usage_buckets_wrong, 76) \ - x(dev_usage_sectors_wrong, 77) \ - x(dev_usage_fragmented_wrong, 78) \ - x(dev_usage_buckets_ec_wrong, 79) \ - x(bkey_version_in_future, 80) \ - x(bkey_u64s_too_small, 81) \ - x(bkey_invalid_type_for_btree, 82) \ - x(bkey_extent_size_zero, 83) \ - x(bkey_extent_size_greater_than_offset, 84) \ - x(bkey_size_nonzero, 85) \ - x(bkey_snapshot_nonzero, 86) \ - x(bkey_snapshot_zero, 87) \ - x(bkey_at_pos_max, 88) \ - x(bkey_before_start_of_btree_node, 89) \ - x(bkey_after_end_of_btree_node, 90) \ - x(bkey_val_size_nonzero, 91) \ - x(bkey_val_size_too_small, 92) \ - x(alloc_v1_val_size_bad, 93) \ - x(alloc_v2_unpack_error, 94) \ - x(alloc_v3_unpack_error, 95) \ - x(alloc_v4_val_size_bad, 96) \ - x(alloc_v4_backpointers_start_bad, 97) \ - x(alloc_key_data_type_bad, 98) \ - x(alloc_key_empty_but_have_data, 99) \ - x(alloc_key_dirty_sectors_0, 100) \ - x(alloc_key_data_type_inconsistency, 101) \ - x(alloc_key_to_missing_dev_bucket, 102) \ - x(alloc_key_cached_inconsistency, 103) \ - x(alloc_key_cached_but_read_time_zero, 104) \ - x(alloc_key_to_missing_lru_entry, 105) \ - x(alloc_key_data_type_wrong, 106) \ - x(alloc_key_gen_wrong, 107) \ - x(alloc_key_dirty_sectors_wrong, 108) \ - x(alloc_key_cached_sectors_wrong, 109) \ - x(alloc_key_stripe_wrong, 110) \ - x(alloc_key_stripe_redundancy_wrong, 111) \ - x(bucket_sector_count_overflow, 112) \ - x(bucket_metadata_type_mismatch, 113) \ - x(need_discard_key_wrong, 114) \ - x(freespace_key_wrong, 115) \ - x(freespace_hole_missing, 116) \ - x(bucket_gens_val_size_bad, 117) \ - x(bucket_gens_key_wrong, 118) \ - x(bucket_gens_hole_wrong, 119) \ - x(bucket_gens_to_invalid_dev, 120) \ - x(bucket_gens_to_invalid_buckets, 121) \ - x(bucket_gens_nonzero_for_invalid_buckets, 122) \ - x(need_discard_freespace_key_to_invalid_dev_bucket, 123) \ - x(need_discard_freespace_key_bad, 124) \ - x(backpointer_bucket_offset_wrong, 125) \ - x(backpointer_to_missing_device, 126) \ - x(backpointer_to_missing_alloc, 127) \ - x(backpointer_to_missing_ptr, 128) \ - x(lru_entry_at_time_0, 129) \ - x(lru_entry_to_invalid_bucket, 130) \ - x(lru_entry_bad, 131) \ - x(btree_ptr_val_too_big, 132) \ - x(btree_ptr_v2_val_too_big, 133) \ - x(btree_ptr_has_non_ptr, 134) \ - x(extent_ptrs_invalid_entry, 135) \ - x(extent_ptrs_no_ptrs, 136) \ - x(extent_ptrs_too_many_ptrs, 137) \ - x(extent_ptrs_redundant_crc, 138) \ - x(extent_ptrs_redundant_stripe, 139) \ - x(extent_ptrs_unwritten, 140) \ - x(extent_ptrs_written_and_unwritten, 141) \ - x(ptr_to_invalid_device, 142) \ - x(ptr_to_duplicate_device, 143) \ - x(ptr_after_last_bucket, 144) \ - x(ptr_before_first_bucket, 145) \ - x(ptr_spans_multiple_buckets, 146) \ - x(ptr_to_missing_backpointer, 147) \ - x(ptr_to_missing_alloc_key, 148) \ - x(ptr_to_missing_replicas_entry, 149) \ - x(ptr_to_missing_stripe, 150) \ - x(ptr_to_incorrect_stripe, 151) \ - x(ptr_gen_newer_than_bucket_gen, 152) \ - x(ptr_too_stale, 153) \ - x(stale_dirty_ptr, 154) \ - x(ptr_bucket_data_type_mismatch, 155) \ - x(ptr_cached_and_erasure_coded, 156) \ - x(ptr_crc_uncompressed_size_too_small, 157) \ - x(ptr_crc_csum_type_unknown, 158) \ - x(ptr_crc_compression_type_unknown, 159) \ - x(ptr_crc_redundant, 160) \ - x(ptr_crc_uncompressed_size_too_big, 161) \ - x(ptr_crc_nonce_mismatch, 162) \ - x(ptr_stripe_redundant, 163) \ - x(reservation_key_nr_replicas_invalid, 164) \ - x(reflink_v_refcount_wrong, 165) \ - x(reflink_p_to_missing_reflink_v, 166) \ - x(stripe_pos_bad, 167) \ - x(stripe_val_size_bad, 168) \ - x(stripe_sector_count_wrong, 169) \ - x(snapshot_tree_pos_bad, 170) \ - x(snapshot_tree_to_missing_snapshot, 171) \ - x(snapshot_tree_to_missing_subvol, 172) \ - x(snapshot_tree_to_wrong_subvol, 173) \ - x(snapshot_tree_to_snapshot_subvol, 174) \ - x(snapshot_pos_bad, 175) \ - x(snapshot_parent_bad, 176) \ - x(snapshot_children_not_normalized, 177) \ - x(snapshot_child_duplicate, 178) \ - x(snapshot_child_bad, 179) \ - x(snapshot_skiplist_not_normalized, 180) \ - x(snapshot_skiplist_bad, 181) \ - x(snapshot_should_not_have_subvol, 182) \ - x(snapshot_to_bad_snapshot_tree, 183) \ - x(snapshot_bad_depth, 184) \ - x(snapshot_bad_skiplist, 185) \ - x(subvol_pos_bad, 186) \ - x(subvol_not_master_and_not_snapshot, 187) \ - x(subvol_to_missing_root, 188) \ - x(subvol_root_wrong_bi_subvol, 189) \ - x(bkey_in_missing_snapshot, 190) \ - x(inode_pos_inode_nonzero, 191) \ - x(inode_pos_blockdev_range, 192) \ - x(inode_unpack_error, 193) \ - x(inode_str_hash_invalid, 194) \ - x(inode_v3_fields_start_bad, 195) \ - x(inode_snapshot_mismatch, 196) \ - x(inode_unlinked_but_clean, 197) \ - x(inode_unlinked_but_nlink_nonzero, 198) \ - x(inode_checksum_type_invalid, 199) \ - x(inode_compression_type_invalid, 200) \ - x(inode_subvol_root_but_not_dir, 201) \ - x(inode_i_size_dirty_but_clean, 202) \ - x(inode_i_sectors_dirty_but_clean, 203) \ - x(inode_i_sectors_wrong, 204) \ - x(inode_dir_wrong_nlink, 205) \ - x(inode_dir_multiple_links, 206) \ - x(inode_multiple_links_but_nlink_0, 207) \ - x(inode_wrong_backpointer, 208) \ - x(inode_wrong_nlink, 209) \ - x(inode_unreachable, 210) \ - x(deleted_inode_but_clean, 211) \ - x(deleted_inode_missing, 212) \ - x(deleted_inode_is_dir, 213) \ - x(deleted_inode_not_unlinked, 214) \ - x(extent_overlapping, 215) \ - x(extent_in_missing_inode, 216) \ - x(extent_in_non_reg_inode, 217) \ - x(extent_past_end_of_inode, 218) \ - x(dirent_empty_name, 219) \ - x(dirent_val_too_big, 220) \ - x(dirent_name_too_long, 221) \ - x(dirent_name_embedded_nul, 222) \ - x(dirent_name_dot_or_dotdot, 223) \ - x(dirent_name_has_slash, 224) \ - x(dirent_d_type_wrong, 225) \ - x(inode_bi_parent_wrong, 226) \ - x(dirent_in_missing_dir_inode, 227) \ - x(dirent_in_non_dir_inode, 228) \ - x(dirent_to_missing_inode, 229) \ - x(dirent_to_missing_subvol, 230) \ - x(dirent_to_itself, 231) \ - x(quota_type_invalid, 232) \ - x(xattr_val_size_too_small, 233) \ - x(xattr_val_size_too_big, 234) \ - x(xattr_invalid_type, 235) \ - x(xattr_name_invalid_chars, 236) \ - x(xattr_in_missing_inode, 237) \ - x(root_subvol_missing, 238) \ - x(root_dir_missing, 239) \ - x(root_inode_not_dir, 240) \ - x(dir_loop, 241) \ - x(hash_table_key_duplicate, 242) \ - x(hash_table_key_wrong_offset, 243) \ - x(unlinked_inode_not_on_deleted_list, 244) \ - x(reflink_p_front_pad_bad, 245) \ - x(journal_entry_dup_same_device, 246) \ - x(inode_bi_subvol_missing, 247) \ - x(inode_bi_subvol_wrong, 248) \ - x(inode_points_to_missing_dirent, 249) \ - x(inode_points_to_wrong_dirent, 250) \ - x(inode_bi_parent_nonzero, 251) \ - x(dirent_to_missing_parent_subvol, 252) \ - x(dirent_not_visible_in_parent_subvol, 253) \ - x(subvol_fs_path_parent_wrong, 254) \ - x(subvol_root_fs_path_parent_nonzero, 255) \ - x(subvol_children_not_set, 256) \ - x(subvol_children_bad, 257) \ - x(subvol_loop, 258) \ - x(subvol_unreachable, 259) \ - x(btree_node_bkey_bad_u64s, 260) \ - x(btree_node_topology_empty_interior_node, 261) \ - x(btree_ptr_v2_min_key_bad, 262) \ - x(btree_root_unreadable_and_scan_found_nothing, 263) \ - x(snapshot_node_missing, 264) \ - x(dup_backpointer_to_bad_csum_extent, 265) \ - x(btree_bitmap_not_marked, 266) \ - x(sb_clean_entry_overrun, 267) \ - x(btree_ptr_v2_written_0, 268) \ - x(subvol_snapshot_bad, 269) \ - x(subvol_inode_bad, 270) - -enum bch_sb_error_id { -#define x(t, n) BCH_FSCK_ERR_##t = n, - BCH_SB_ERRS() -#undef x - BCH_SB_ERR_MAX -}; - struct bch_sb_error_entry_cpu { u64 id:16, nr:48; @@ -293,4 +13,3 @@ struct bch_sb_error_entry_cpu { typedef DARRAY(struct bch_sb_error_entry_cpu) bch_sb_errors_cpu; #endif /* _BCACHEFS_SB_ERRORS_TYPES_H */ - diff --git a/fs/bcachefs/sb-members_format.h b/fs/bcachefs/sb-members_format.h new file mode 100644 index 000000000000..e2630548c0f6 --- /dev/null +++ b/fs/bcachefs/sb-members_format.h @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SB_MEMBERS_FORMAT_H +#define _BCACHEFS_SB_MEMBERS_FORMAT_H + +/* + * We refer to members with bitmasks in various places - but we need to get rid + * of this limit: + */ +#define BCH_SB_MEMBERS_MAX 64 + +#define BCH_MIN_NR_NBUCKETS (1 << 6) + +#define BCH_IOPS_MEASUREMENTS() \ + x(seqread, 0) \ + x(seqwrite, 1) \ + x(randread, 2) \ + x(randwrite, 3) + +enum bch_iops_measurement { +#define x(t, n) BCH_IOPS_##t = n, + BCH_IOPS_MEASUREMENTS() +#undef x + BCH_IOPS_NR +}; + +#define BCH_MEMBER_ERROR_TYPES() \ + x(read, 0) \ + x(write, 1) \ + x(checksum, 2) + +enum bch_member_error_type { +#define x(t, n) BCH_MEMBER_ERROR_##t = n, + BCH_MEMBER_ERROR_TYPES() +#undef x + BCH_MEMBER_ERROR_NR +}; + +struct bch_member { + __uuid_t uuid; + __le64 nbuckets; /* device size */ + __le16 first_bucket; /* index of first bucket used */ + __le16 bucket_size; /* sectors */ + __u8 btree_bitmap_shift; + __u8 pad[3]; + __le64 last_mount; /* time_t */ + + __le64 flags; + __le32 iops[4]; + __le64 errors[BCH_MEMBER_ERROR_NR]; + __le64 errors_at_reset[BCH_MEMBER_ERROR_NR]; + __le64 errors_reset_time; + __le64 seq; + __le64 btree_allocated_bitmap; + /* + * On recovery from a clean shutdown we don't normally read the journal, + * but we still want to resume writing from where we left off so we + * don't overwrite more than is necessary, for list journal debugging: + */ + __le32 last_journal_bucket; + __le32 last_journal_bucket_offset; +}; + +/* + * This limit comes from the bucket_gens array - it's a single allocation, and + * kernel allocation are limited to INT_MAX + */ +#define BCH_MEMBER_NBUCKETS_MAX (INT_MAX - 64) + +#define BCH_MEMBER_V1_BYTES 56 + +LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags, 0, 4) +/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */ +LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags, 14, 15) +LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags, 15, 20) +LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags, 20, 28) +LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags, 28, 30) +LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED, + struct bch_member, flags, 30, 31) + +#if 0 +LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); +LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40); +#endif + +#define BCH_MEMBER_STATES() \ + x(rw, 0) \ + x(ro, 1) \ + x(failed, 2) \ + x(spare, 3) + +enum bch_member_state { +#define x(t, n) BCH_MEMBER_STATE_##t = n, + BCH_MEMBER_STATES() +#undef x + BCH_MEMBER_STATE_NR +}; + +struct bch_sb_field_members_v1 { + struct bch_sb_field field; + struct bch_member _members[]; //Members are now variable size +}; + +struct bch_sb_field_members_v2 { + struct bch_sb_field field; + __le16 member_bytes; //size of single member entry + u8 pad[6]; + struct bch_member _members[]; +}; + +#endif /* _BCACHEFS_SB_MEMBERS_FORMAT_H */ diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 629900a5e641..51918acfd726 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -1042,6 +1042,25 @@ err: return ret; } +int bch2_check_key_has_snapshot(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + int ret = 0; + + if (fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c, + bkey_in_missing_snapshot, + "key in missing snapshot %s, delete?", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + ret = bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_internal_snapshot_node) ?: 1; +fsck_err: + printbuf_exit(&buf); + return ret; +} + /* * Mark a snapshot as deleted, for future cleanup: */ @@ -1351,35 +1370,39 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, * that key to snapshot leaf nodes, where we can mutate it */ -static int snapshot_delete_key(struct btree_trans *trans, +static int delete_dead_snapshots_process_key(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, snapshot_id_list *deleted, snapshot_id_list *equiv_seen, struct bpos *last_pos) { + int ret = bch2_check_key_has_snapshot(trans, iter, k); + if (ret) + return ret < 0 ? ret : 0; + struct bch_fs *c = trans->c; u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); + if (!equiv) /* key for invalid snapshot node, but we chose not to delete */ + return 0; if (!bkey_eq(k.k->p, *last_pos)) equiv_seen->nr = 0; - *last_pos = k.k->p; - if (snapshot_list_has_id(deleted, k.k->p.snapshot) || - snapshot_list_has_id(equiv_seen, equiv)) { + if (snapshot_list_has_id(deleted, k.k->p.snapshot)) return bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node); - } else { - return snapshot_list_add(c, equiv_seen, equiv); - } -} -static int move_key_to_correct_snapshot(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); + if (!bpos_eq(*last_pos, k.k->p) && + snapshot_list_has_id(equiv_seen, equiv)) + return bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_internal_snapshot_node); + + *last_pos = k.k->p; + + ret = snapshot_list_add_nodup(c, equiv_seen, equiv); + if (ret) + return ret; /* * When we have a linear chain of snapshot nodes, we consider @@ -1389,21 +1412,20 @@ static int move_key_to_correct_snapshot(struct btree_trans *trans, * * If there are multiple keys in different snapshots at the same * position, we're only going to keep the one in the newest - * snapshot - the rest have been overwritten and are redundant, - * and for the key we're going to keep we need to move it to the - * equivalance class ID if it's not there already. + * snapshot (we delete the others above) - the rest have been + * overwritten and are redundant, and for the key we're going to keep we + * need to move it to the equivalance class ID if it's not there + * already. */ if (equiv != k.k->p.snapshot) { struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); - struct btree_iter new_iter; - int ret; - - ret = PTR_ERR_OR_ZERO(new); + int ret = PTR_ERR_OR_ZERO(new); if (ret) return ret; new->k.p.snapshot = equiv; + struct btree_iter new_iter; bch2_trans_iter_init(trans, &new_iter, iter->btree_id, new->k.p, BTREE_ITER_all_snapshots| BTREE_ITER_cached| @@ -1538,7 +1560,6 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) struct btree_trans *trans; snapshot_id_list deleted = { 0 }; snapshot_id_list deleted_interior = { 0 }; - u32 id; int ret = 0; if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags)) @@ -1585,33 +1606,20 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) if (ret) goto err; - for (id = 0; id < BTREE_ID_NR; id++) { + for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) { struct bpos last_pos = POS_MIN; snapshot_id_list equiv_seen = { 0 }; struct disk_reservation res = { 0 }; - if (!btree_type_has_snapshots(id)) - continue; - - /* - * deleted inodes btree is maintained by a trigger on the inodes - * btree - no work for us to do here, and it's not safe to scan - * it because we'll see out of date keys due to the btree write - * buffer: - */ - if (id == BTREE_ID_deleted_inodes) + if (!btree_type_has_snapshots(btree)) continue; ret = for_each_btree_key_commit(trans, iter, - id, POS_MIN, - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - &res, NULL, BCH_TRANS_COMMIT_no_enospc, - snapshot_delete_key(trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?: - for_each_btree_key_commit(trans, iter, - id, POS_MIN, + btree, POS_MIN, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, &res, NULL, BCH_TRANS_COMMIT_no_enospc, - move_key_to_correct_snapshot(trans, &iter, k)); + delete_dead_snapshots_process_key(trans, &iter, k, &deleted, + &equiv_seen, &last_pos)); bch2_disk_reservation_put(c, &res); darray_exit(&equiv_seen); diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index ab13d8f4b41e..31b0ee03e962 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -242,6 +242,7 @@ int bch2_snapshot_node_create(struct btree_trans *, u32, int bch2_check_snapshot_trees(struct bch_fs *); int bch2_check_snapshots(struct bch_fs *); int bch2_reconstruct_snapshots(struct bch_fs *); +int bch2_check_key_has_snapshot(struct btree_trans *, struct btree_iter *, struct bkey_s_c); int bch2_snapshot_node_set_deleted(struct btree_trans *, u32); void bch2_delete_dead_snapshots_work(struct work_struct *); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index f1bee6c5222d..d73a0222f709 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -1132,18 +1132,12 @@ bool bch2_check_version_downgrade(struct bch_fs *c) * c->sb will be checked before we write the superblock, so update it as * well: */ - if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current) { + if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current) SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); - c->sb.version_upgrade_complete = bcachefs_metadata_version_current; - } - if (c->sb.version > bcachefs_metadata_version_current) { + if (c->sb.version > bcachefs_metadata_version_current) c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); - c->sb.version = bcachefs_metadata_version_current; - } - if (c->sb.version_min > bcachefs_metadata_version_current) { + if (c->sb.version_min > bcachefs_metadata_version_current) c->disk_sb.sb->version_min = cpu_to_le16(bcachefs_metadata_version_current); - c->sb.version_min = bcachefs_metadata_version_current; - } c->disk_sb.sb->compat[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR) - 1); return ret; } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 2206a8dee693..df2bea38e83f 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -564,7 +564,7 @@ static void __bch2_fs_free(struct bch_fs *c) BUG_ON(atomic_read(&c->journal_keys.ref)); bch2_fs_btree_write_buffer_exit(c); percpu_free_rwsem(&c->mark_lock); - EBUG_ON(percpu_u64_get(c->online_reserved)); + EBUG_ON(c->online_reserved && percpu_u64_get(c->online_reserved)); free_percpu(c->online_reserved); darray_exit(&c->btree_roots_extra); diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 91c994b569f3..6ed495ca7a31 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -89,6 +89,16 @@ enum { BTRFS_INODE_FREE_SPACE_INODE, /* Set when there are no capabilities in XATTs for the inode. */ BTRFS_INODE_NO_CAP_XATTR, + /* + * Set if an error happened when doing a COW write before submitting a + * bio or during writeback. Used for both buffered writes and direct IO + * writes. This is to signal a fast fsync that it has to wait for + * ordered extents to complete and therefore not log extent maps that + * point to unwritten extents (when an ordered extent completes and it + * has the BTRFS_ORDERED_IOERR flag set, it drops extent maps in its + * range). + */ + BTRFS_INODE_COW_WRITE_ERROR, }; /* in memory btrfs inode */ diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1b20b3e390df..38cdb8875e8e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4538,18 +4538,10 @@ static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info) { struct rb_node *node; - struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs; struct btrfs_delayed_ref_node *ref; - delayed_refs = &trans->delayed_refs; - spin_lock(&delayed_refs->lock); - if (atomic_read(&delayed_refs->num_entries) == 0) { - spin_unlock(&delayed_refs->lock); - btrfs_debug(fs_info, "delayed_refs has NO entry"); - return; - } - while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) { struct btrfs_delayed_ref_head *head; struct rb_node *n; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 597387e9f040..f688fab55251 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3689,6 +3689,8 @@ static struct extent_buffer *grab_extent_buffer( struct folio *folio = page_folio(page); struct extent_buffer *exists; + lockdep_assert_held(&page->mapping->i_private_lock); + /* * For subpage case, we completely rely on radix tree to ensure we * don't try to insert two ebs for the same bytenr. So here we always @@ -3756,13 +3758,14 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) * The caller needs to free the existing folios and retry using the same order. */ static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i, + struct btrfs_subpage *prealloc, struct extent_buffer **found_eb_ret) { struct btrfs_fs_info *fs_info = eb->fs_info; struct address_space *mapping = fs_info->btree_inode->i_mapping; const unsigned long index = eb->start >> PAGE_SHIFT; - struct folio *existing_folio; + struct folio *existing_folio = NULL; int ret; ASSERT(found_eb_ret); @@ -3774,12 +3777,14 @@ retry: ret = filemap_add_folio(mapping, eb->folios[i], index + i, GFP_NOFS | __GFP_NOFAIL); if (!ret) - return 0; + goto finish; existing_folio = filemap_lock_folio(mapping, index + i); /* The page cache only exists for a very short time, just retry. */ - if (IS_ERR(existing_folio)) + if (IS_ERR(existing_folio)) { + existing_folio = NULL; goto retry; + } /* For now, we should only have single-page folios for btree inode. */ ASSERT(folio_nr_pages(existing_folio) == 1); @@ -3790,14 +3795,13 @@ retry: return -EAGAIN; } - if (fs_info->nodesize < PAGE_SIZE) { - /* - * We're going to reuse the existing page, can drop our page - * and subpage structure now. - */ +finish: + spin_lock(&mapping->i_private_lock); + if (existing_folio && fs_info->nodesize < PAGE_SIZE) { + /* We're going to reuse the existing page, can drop our folio now. */ __free_page(folio_page(eb->folios[i], 0)); eb->folios[i] = existing_folio; - } else { + } else if (existing_folio) { struct extent_buffer *existing_eb; existing_eb = grab_extent_buffer(fs_info, @@ -3805,6 +3809,7 @@ retry: if (existing_eb) { /* The extent buffer still exists, we can use it directly. */ *found_eb_ret = existing_eb; + spin_unlock(&mapping->i_private_lock); folio_unlock(existing_folio); folio_put(existing_folio); return 1; @@ -3813,6 +3818,22 @@ retry: __free_page(folio_page(eb->folios[i], 0)); eb->folios[i] = existing_folio; } + eb->folio_size = folio_size(eb->folios[i]); + eb->folio_shift = folio_shift(eb->folios[i]); + /* Should not fail, as we have preallocated the memory. */ + ret = attach_extent_buffer_folio(eb, eb->folios[i], prealloc); + ASSERT(!ret); + /* + * To inform we have an extra eb under allocation, so that + * detach_extent_buffer_page() won't release the folio private when the + * eb hasn't been inserted into radix tree yet. + * + * The ref will be decreased when the eb releases the page, in + * detach_extent_buffer_page(). Thus needs no special handling in the + * error path. + */ + btrfs_folio_inc_eb_refs(fs_info, eb->folios[i]); + spin_unlock(&mapping->i_private_lock); return 0; } @@ -3824,7 +3845,6 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, int attached = 0; struct extent_buffer *eb; struct extent_buffer *existing_eb = NULL; - struct address_space *mapping = fs_info->btree_inode->i_mapping; struct btrfs_subpage *prealloc = NULL; u64 lockdep_owner = owner_root; bool page_contig = true; @@ -3890,7 +3910,7 @@ reallocate: for (int i = 0; i < num_folios; i++) { struct folio *folio; - ret = attach_eb_folio_to_filemap(eb, i, &existing_eb); + ret = attach_eb_folio_to_filemap(eb, i, prealloc, &existing_eb); if (ret > 0) { ASSERT(existing_eb); goto out; @@ -3927,24 +3947,6 @@ reallocate: * and free the allocated page. */ folio = eb->folios[i]; - eb->folio_size = folio_size(folio); - eb->folio_shift = folio_shift(folio); - spin_lock(&mapping->i_private_lock); - /* Should not fail, as we have preallocated the memory */ - ret = attach_extent_buffer_folio(eb, folio, prealloc); - ASSERT(!ret); - /* - * To inform we have extra eb under allocation, so that - * detach_extent_buffer_page() won't release the folio private - * when the eb hasn't yet been inserted into radix tree. - * - * The ref will be decreased when the eb released the page, in - * detach_extent_buffer_page(). - * Thus needs no special handling in error path. - */ - btrfs_folio_inc_eb_refs(fs_info, folio); - spin_unlock(&mapping->i_private_lock); - WARN_ON(btrfs_folio_test_dirty(fs_info, folio, eb->start, eb->len)); /* diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e764ac3f22e2..d90138683a0a 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1885,6 +1885,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ if (full_sync || btrfs_is_zoned(fs_info)) { ret = btrfs_wait_ordered_range(inode, start, len); + clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &BTRFS_I(inode)->runtime_flags); } else { /* * Get our ordered extents as soon as possible to avoid doing @@ -1894,6 +1895,21 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) btrfs_get_ordered_extents_for_logging(BTRFS_I(inode), &ctx.ordered_extents); ret = filemap_fdatawait_range(inode->i_mapping, start, end); + if (ret) + goto out_release_extents; + + /* + * Check and clear the BTRFS_INODE_COW_WRITE_ERROR now after + * starting and waiting for writeback, because for buffered IO + * it may have been set during the end IO callback + * (end_bbio_data_write() -> btrfs_finish_ordered_extent()) in + * case an error happened and we need to wait for ordered + * extents to complete so that any extent maps that point to + * unwritten locations are dropped and we don't log them. + */ + if (test_and_clear_bit(BTRFS_INODE_COW_WRITE_ERROR, + &BTRFS_I(inode)->runtime_flags)) + ret = btrfs_wait_ordered_range(inode, start, len); } if (ret) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index c5bdd674f55c..35a413ce935d 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -388,6 +388,37 @@ bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, ret = can_finish_ordered_extent(ordered, page, file_offset, len, uptodate); spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); + /* + * If this is a COW write it means we created new extent maps for the + * range and they point to unwritten locations if we got an error either + * before submitting a bio or during IO. + * + * We have marked the ordered extent with BTRFS_ORDERED_IOERR, and we + * are queuing its completion below. During completion, at + * btrfs_finish_one_ordered(), we will drop the extent maps for the + * unwritten extents. + * + * However because completion runs in a work queue we can end up having + * a fast fsync running before that. In the case of direct IO, once we + * unlock the inode the fsync might start, and we queue the completion + * before unlocking the inode. In the case of buffered IO when writeback + * finishes (end_bbio_data_write()) we queue the completion, so if the + * writeback was triggered by a fast fsync, the fsync might start + * logging before ordered extent completion runs in the work queue. + * + * The fast fsync will log file extent items based on the extent maps it + * finds, so if by the time it collects extent maps the ordered extent + * completion didn't happen yet, it will log file extent items that + * point to unwritten extents, resulting in a corruption if a crash + * happens and the log tree is replayed. Note that a fast fsync does not + * wait for completion of ordered extents in order to reduce latency. + * + * Set a flag in the inode so that the next fast fsync will wait for + * ordered extents to complete before starting to log. + */ + if (!uptodate && !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) + set_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags); + if (ret) btrfs_queue_ordered_fn(ordered); return ret; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 5146387b416b..26a2e5aa08e9 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4860,18 +4860,23 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, path->slots[0]++; continue; } - if (!dropped_extents) { - /* - * Avoid logging extent items logged in past fsync calls - * and leading to duplicate keys in the log tree. - */ + /* + * Avoid overlapping items in the log tree. The first time we + * get here, get rid of everything from a past fsync. After + * that, if the current extent starts before the end of the last + * extent we copied, truncate the last one. This can happen if + * an ordered extent completion modifies the subvolume tree + * while btrfs_next_leaf() has the tree unlocked. + */ + if (!dropped_extents || key.offset < truncate_offset) { ret = truncate_inode_items(trans, root->log_root, inode, - truncate_offset, + min(key.offset, truncate_offset), BTRFS_EXTENT_DATA_KEY); if (ret) goto out; dropped_extents = true; } + truncate_offset = btrfs_file_extent_end(path); if (ins_nr == 0) start_slot = slot; ins_nr++; diff --git a/fs/dcache.c b/fs/dcache.c index 1ee6404b430b..407095188f83 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2360,17 +2360,19 @@ EXPORT_SYMBOL(d_hash_and_lookup); * - unhash this dentry and free it. * * Usually, we want to just turn this into - * a negative dentry, but certain workloads can - * generate a large number of negative dentries. - * Therefore, it would be better to simply - * unhash it. + * a negative dentry, but if anybody else is + * currently using the dentry or the inode + * we can't do that and we fall back on removing + * it from the hash queues and waiting for + * it to be deleted later when it has no users */ /** * d_delete - delete a dentry * @dentry: The dentry to delete * - * Remove the dentry from the hash queues so it can be deleted later. + * Turn the dentry into a negative dentry if possible, otherwise + * remove it from the hash queues so it can be deleted later */ void d_delete(struct dentry * dentry) @@ -2379,8 +2381,6 @@ void d_delete(struct dentry * dentry) spin_lock(&inode->i_lock); spin_lock(&dentry->d_lock); - __d_drop(dentry); - /* * Are we the only user? */ @@ -2388,6 +2388,7 @@ void d_delete(struct dentry * dentry) dentry->d_flags &= ~DCACHE_CANT_MOUNT; dentry_unlink_inode(dentry); } else { + __d_drop(dentry); spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); } diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 41c8f0c68ef5..c5802a459334 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -898,11 +898,11 @@ static bool iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) { loff_t length = iomap_length(iter); - size_t chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER; loff_t pos = iter->pos; ssize_t total_written = 0; long status = 0; struct address_space *mapping = iter->inode->i_mapping; + size_t chunk = mapping_max_folio_size(mapping); unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0; do { diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index 1121601536d1..07bc1fd43530 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -181,7 +181,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, struct folio *folio, *writethrough = NULL; enum netfs_how_to_modify howto; enum netfs_folio_trace trace; - unsigned int bdp_flags = (iocb->ki_flags & IOCB_SYNC) ? 0: BDP_ASYNC; + unsigned int bdp_flags = (iocb->ki_flags & IOCB_NOWAIT) ? BDP_ASYNC : 0; ssize_t written = 0, ret, ret2; loff_t i_size, pos = iocb->ki_pos, from, to; size_t max_chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER; diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c index f516460e994e..e14cd53ac9fd 100644 --- a/fs/netfs/direct_write.c +++ b/fs/netfs/direct_write.c @@ -12,7 +12,7 @@ static void netfs_cleanup_dio_write(struct netfs_io_request *wreq) { struct inode *inode = wreq->inode; - unsigned long long end = wreq->start + wreq->len; + unsigned long long end = wreq->start + wreq->transferred; if (!wreq->error && i_size_read(inode) < end) { diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c index c90d482b1650..f4a642727479 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -72,6 +72,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, } } + atomic_inc(&ctx->io_count); trace_netfs_rreq_ref(rreq->debug_id, 1, netfs_rreq_trace_new); netfs_proc_add_rreq(rreq); netfs_stat(&netfs_n_rh_rreq); @@ -124,6 +125,7 @@ static void netfs_free_request(struct work_struct *work) { struct netfs_io_request *rreq = container_of(work, struct netfs_io_request, work); + struct netfs_inode *ictx = netfs_inode(rreq->inode); unsigned int i; trace_netfs_rreq(rreq, netfs_rreq_trace_free); @@ -142,6 +144,9 @@ static void netfs_free_request(struct work_struct *work) } kvfree(rreq->direct_bv); } + + if (atomic_dec_and_test(&ictx->io_count)) + wake_up_var(&ictx->io_count); call_rcu(&rreq->rcu, netfs_free_request_rcu); } diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c index 60112e4b2c5e..426cf87aaf2e 100644 --- a/fs/netfs/write_collect.c +++ b/fs/netfs/write_collect.c @@ -510,7 +510,7 @@ reassess_streams: * stream has a gap that can be jumped. */ if (notes & SOME_EMPTY) { - unsigned long long jump_to = wreq->start + wreq->len; + unsigned long long jump_to = wreq->start + READ_ONCE(wreq->submitted); for (s = 0; s < NR_IO_STREAMS; s++) { stream = &wreq->io_streams[s]; @@ -690,10 +690,11 @@ void netfs_write_collection_worker(struct work_struct *work) wake_up_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS); if (wreq->iocb) { - wreq->iocb->ki_pos += wreq->transferred; + size_t written = min(wreq->transferred, wreq->len); + wreq->iocb->ki_pos += written; if (wreq->iocb->ki_complete) wreq->iocb->ki_complete( - wreq->iocb, wreq->error ? wreq->error : wreq->transferred); + wreq->iocb, wreq->error ? wreq->error : written); wreq->iocb = VFS_PTR_POISON; } diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index e190043bc0da..3aa86e268f40 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -254,7 +254,7 @@ static void netfs_issue_write(struct netfs_io_request *wreq, stream->construct = NULL; if (subreq->start + subreq->len > wreq->start + wreq->submitted) - wreq->len = wreq->submitted = subreq->start + subreq->len - wreq->start; + WRITE_ONCE(wreq->submitted, subreq->start + subreq->len - wreq->start); netfs_do_issue_write(stream, subreq); } @@ -636,7 +636,12 @@ int netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_contr mutex_unlock(&ictx->wb_lock); - ret = wreq->error; + if (wreq->iocb) { + ret = -EIOCBQUEUED; + } else { + wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS, TASK_UNINTERRUPTIBLE); + ret = wreq->error; + } netfs_put_request(wreq, false, netfs_rreq_trace_put_return); return ret; } diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index a002a44ff161..52e50b1b7f22 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -607,7 +607,7 @@ int nilfs_empty_dir(struct inode *inode) kaddr = nilfs_get_folio(inode, i, &folio); if (IS_ERR(kaddr)) - continue; + return 0; de = (struct nilfs_dir_entry *)kaddr; kaddr += nilfs_last_byte(inode, i) - NILFS_DIR_REC_LEN(1); diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 60d4f59f7665..6ea81f1d5094 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1652,6 +1652,7 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci) if (bh->b_folio != bd_folio) { if (bd_folio) { folio_lock(bd_folio); + folio_wait_writeback(bd_folio); folio_clear_dirty_for_io(bd_folio); folio_start_writeback(bd_folio); folio_unlock(bd_folio); @@ -1665,6 +1666,7 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci) if (bh == segbuf->sb_super_root) { if (bh->b_folio != bd_folio) { folio_lock(bd_folio); + folio_wait_writeback(bd_folio); folio_clear_dirty_for_io(bd_folio); folio_start_writeback(bd_folio); folio_unlock(bd_folio); @@ -1681,6 +1683,7 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci) } if (bd_folio) { folio_lock(bd_folio); + folio_wait_writeback(bd_folio); folio_clear_dirty_for_io(bd_folio); folio_start_writeback(bd_folio); folio_unlock(bd_folio); diff --git a/fs/proc/base.c b/fs/proc/base.c index 18550c071d71..72a1acd03675 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3214,7 +3214,7 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns, mm = get_task_mm(task); if (mm) { seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items); - seq_printf(m, "ksm_zero_pages %lu\n", mm->ksm_zero_pages); + seq_printf(m, "ksm_zero_pages %ld\n", mm_ksm_zero_pages(mm)); seq_printf(m, "ksm_merging_pages %lu\n", mm->ksm_merging_pages); seq_printf(m, "ksm_process_profit %ld\n", ksm_process_profit(mm)); mmput(mm); diff --git a/fs/signalfd.c b/fs/signalfd.c index 4a5614442dbf..ec7b2da2477a 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -282,14 +282,10 @@ static int do_signalfd4(int ufd, sigset_t *mask, int flags) if (IS_ERR(file)) { put_unused_fd(ufd); kfree(ctx); - return ufd; + return PTR_ERR(file); } file->f_mode |= FMODE_NOWAIT; - /* - * When we call this, the initialization must be complete, since - * anon_inode_getfd() will install the fd. - */ fd_install(ufd, file); } else { struct fd f = fdget(ufd); diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index a665aac9be9f..bb86fc0641d8 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -431,6 +431,7 @@ cifs_free_inode(struct inode *inode) static void cifs_evict_inode(struct inode *inode) { + netfs_wait_for_outstanding_io(inode); truncate_inode_pages_final(&inode->i_data); if (inode->i_state & I_PINNING_NETFS_WB) cifs_fscache_unuse_inode_cookie(inode, true); diff --git a/fs/smb/client/cifspdu.h b/fs/smb/client/cifspdu.h index c46d418c1c0c..a2072ab9e586 100644 --- a/fs/smb/client/cifspdu.h +++ b/fs/smb/client/cifspdu.h @@ -2574,7 +2574,7 @@ typedef struct { struct win_dev { - unsigned char type[8]; /* IntxCHR or IntxBLK or LnxFIFO*/ + unsigned char type[8]; /* IntxCHR or IntxBLK or LnxFIFO or LnxSOCK */ __le64 major; __le64 minor; } __attribute__((packed)); diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 262576573eb5..4a8aa1de9522 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -606,6 +606,10 @@ cifs_sfu_type(struct cifs_fattr *fattr, const char *path, mnr = le64_to_cpu(*(__le64 *)(pbuf+16)); fattr->cf_rdev = MKDEV(mjr, mnr); } + } else if (memcmp("LnxSOCK", pbuf, 8) == 0) { + cifs_dbg(FYI, "Socket\n"); + fattr->cf_mode |= S_IFSOCK; + fattr->cf_dtype = DT_SOCK; } else if (memcmp("IntxLNK", pbuf, 7) == 0) { cifs_dbg(FYI, "Symlink\n"); fattr->cf_mode |= S_IFLNK; diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 4ce6c3121a7e..c8e536540895 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -4997,6 +4997,9 @@ static int __cifs_sfu_make_node(unsigned int xid, struct inode *inode, pdev.major = cpu_to_le64(MAJOR(dev)); pdev.minor = cpu_to_le64(MINOR(dev)); break; + case S_IFSOCK: + strscpy(pdev.type, "LnxSOCK"); + break; case S_IFIFO: strscpy(pdev.type, "LnxFIFO"); break; diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 993ac36c3d58..38a06e8a0f90 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -4577,8 +4577,6 @@ smb2_readv_callback(struct mid_q_entry *mid) if (rdata->subreq.start < rdata->subreq.rreq->i_size) rdata->result = 0; } - if (rdata->result == 0 || rdata->result == -EAGAIN) - iov_iter_advance(&rdata->subreq.io_iter, rdata->got_bytes); rdata->credits.value = 0; netfs_subreq_terminated(&rdata->subreq, (rdata->result == 0 || rdata->result == -EAGAIN) ? @@ -4789,7 +4787,6 @@ smb2_writev_callback(struct mid_q_entry *mid) wdata->result = -ENOSPC; else wdata->subreq.len = written; - iov_iter_advance(&wdata->subreq.io_iter, written); break; case MID_REQUEST_SUBMITTED: case MID_RETRY_NEEDED: diff --git a/fs/smb/client/smb2transport.c b/fs/smb/client/smb2transport.c index 02135a605305..1476c445cadc 100644 --- a/fs/smb/client/smb2transport.c +++ b/fs/smb/client/smb2transport.c @@ -216,8 +216,8 @@ smb2_find_smb_tcon(struct TCP_Server_Info *server, __u64 ses_id, __u32 tid) } tcon = smb2_find_smb_sess_tcon_unlocked(ses, tid); if (!tcon) { - cifs_put_smb_ses(ses); spin_unlock(&cifs_tcp_ses_lock); + cifs_put_smb_ses(ses); return NULL; } spin_unlock(&cifs_tcp_ses_lock); diff --git a/fs/smb/common/cifs_arc4.c b/fs/smb/common/cifs_arc4.c index 043e4cb839fa..df360ca47826 100644 --- a/fs/smb/common/cifs_arc4.c +++ b/fs/smb/common/cifs_arc4.c @@ -10,6 +10,7 @@ #include <linux/module.h> #include "arc4.h" +MODULE_DESCRIPTION("ARC4 Cipher Algorithm"); MODULE_LICENSE("GPL"); int cifs_arc4_setkey(struct arc4_ctx *ctx, const u8 *in_key, unsigned int key_len) diff --git a/fs/smb/common/cifs_md4.c b/fs/smb/common/cifs_md4.c index 50f78cfc6ce9..7ee7f4dad90c 100644 --- a/fs/smb/common/cifs_md4.c +++ b/fs/smb/common/cifs_md4.c @@ -24,6 +24,7 @@ #include <asm/byteorder.h> #include "md4.h" +MODULE_DESCRIPTION("MD4 Message Digest Algorithm (RFC1320)"); MODULE_LICENSE("GPL"); static inline u32 lshift(u32 x, unsigned int s) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 6cb8b2ddc541..6c55a6e88eba 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -1008,13 +1008,12 @@ xfs_alloc_cur_finish( struct xfs_alloc_arg *args, struct xfs_alloc_cur *acur) { - struct xfs_agf __maybe_unused *agf = args->agbp->b_addr; int error; ASSERT(acur->cnt && acur->bnolt); ASSERT(acur->bno >= acur->rec_bno); ASSERT(acur->bno + acur->len <= acur->rec_bno + acur->rec_len); - ASSERT(acur->rec_bno + acur->rec_len <= be32_to_cpu(agf->agf_length)); + ASSERT(xfs_verify_agbext(args->pag, acur->rec_bno, acur->rec_len)); error = xfs_alloc_fixup_trees(acur->cnt, acur->bnolt, acur->rec_bno, acur->rec_len, acur->bno, acur->len, 0); @@ -1217,7 +1216,6 @@ STATIC int /* error */ xfs_alloc_ag_vextent_exact( xfs_alloc_arg_t *args) /* allocation argument structure */ { - struct xfs_agf __maybe_unused *agf = args->agbp->b_addr; struct xfs_btree_cur *bno_cur;/* by block-number btree cursor */ struct xfs_btree_cur *cnt_cur;/* by count btree cursor */ int error; @@ -1297,7 +1295,7 @@ xfs_alloc_ag_vextent_exact( */ cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, args->agbp, args->pag); - ASSERT(args->agbno + args->len <= be32_to_cpu(agf->agf_length)); + ASSERT(xfs_verify_agbext(args->pag, args->agbno, args->len)); error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno, args->len, XFSA_FIXUP_BNO_OK); if (error) { diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 430cd3244c14..f30bcc64100d 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -329,26 +329,20 @@ xfs_attr_calc_size( return nblks; } -/* Initialize transaction reservation for attr operations */ -void -xfs_init_attr_trans( - struct xfs_da_args *args, - struct xfs_trans_res *tres, - unsigned int *total) +/* Initialize transaction reservation for an xattr set/replace/upsert */ +inline struct xfs_trans_res +xfs_attr_set_resv( + const struct xfs_da_args *args) { - struct xfs_mount *mp = args->dp->i_mount; - - if (args->value) { - tres->tr_logres = M_RES(mp)->tr_attrsetm.tr_logres + - M_RES(mp)->tr_attrsetrt.tr_logres * - args->total; - tres->tr_logcount = XFS_ATTRSET_LOG_COUNT; - tres->tr_logflags = XFS_TRANS_PERM_LOG_RES; - *total = args->total; - } else { - *tres = M_RES(mp)->tr_attrrm; - *total = XFS_ATTRRM_SPACE_RES(mp); - } + struct xfs_mount *mp = args->dp->i_mount; + struct xfs_trans_res ret = { + .tr_logres = M_RES(mp)->tr_attrsetm.tr_logres + + M_RES(mp)->tr_attrsetrt.tr_logres * args->total, + .tr_logcount = XFS_ATTRSET_LOG_COUNT, + .tr_logflags = XFS_TRANS_PERM_LOG_RES, + }; + + return ret; } /* @@ -1006,7 +1000,7 @@ xfs_attr_set( struct xfs_trans_res tres; int error, local; int rmt_blks = 0; - unsigned int total; + unsigned int total = 0; ASSERT(!args->trans); @@ -1033,10 +1027,15 @@ xfs_attr_set( if (!local) rmt_blks = xfs_attr3_rmt_blocks(mp, args->valuelen); + + tres = xfs_attr_set_resv(args); + total = args->total; break; case XFS_ATTRUPDATE_REMOVE: XFS_STATS_INC(mp, xs_attr_remove); rmt_blks = xfs_attr3_max_rmt_blocks(mp); + tres = M_RES(mp)->tr_attrrm; + total = XFS_ATTRRM_SPACE_RES(mp); break; } @@ -1044,7 +1043,6 @@ xfs_attr_set( * Root fork attributes can use reserved data blocks for this * operation if necessary */ - xfs_init_attr_trans(args, &tres, &total); error = xfs_trans_alloc_inode(dp, &tres, total, 0, rsvd, &args->trans); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 088cb7b30168..0e51d0723f9a 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -565,8 +565,7 @@ bool xfs_attr_check_namespace(unsigned int attr_flags); bool xfs_attr_namecheck(unsigned int attr_flags, const void *name, size_t length); int xfs_attr_calc_size(struct xfs_da_args *args, int *local); -void xfs_init_attr_trans(struct xfs_da_args *args, struct xfs_trans_res *tres, - unsigned int *total); +struct xfs_trans_res xfs_attr_set_resv(const struct xfs_da_args *args); /* * Check to see if the attr should be upgraded from non-existent or shortform to diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 3b3206d312d6..c101cf266bc4 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -6383,6 +6383,7 @@ xfs_bunmapi_range( error = xfs_defer_finish(tpp); if (error) goto out; + cond_resched(); } out: return error; diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index d79002343d0b..e7a7bfbe75b4 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -374,17 +374,37 @@ xfs_dinode_verify_fork( /* * For fork types that can contain local data, check that the fork * format matches the size of local data contained within the fork. - * - * For all types, check that when the size says the should be in extent - * or btree format, the inode isn't claiming it is in local format. */ if (whichfork == XFS_DATA_FORK) { - if (S_ISDIR(mode) || S_ISLNK(mode)) { + /* + * A directory small enough to fit in the inode must be stored + * in local format. The directory sf <-> extents conversion + * code updates the directory size accordingly. + */ + if (S_ISDIR(mode)) { + if (be64_to_cpu(dip->di_size) <= fork_size && + fork_format != XFS_DINODE_FMT_LOCAL) + return __this_address; + } + + /* + * A symlink with a target small enough to fit in the inode can + * be stored in extents format if xattrs were added (thus + * converting the data fork from shortform to remote format) + * and then removed. + */ + if (S_ISLNK(mode)) { if (be64_to_cpu(dip->di_size) <= fork_size && + fork_format != XFS_DINODE_FMT_EXTENTS && fork_format != XFS_DINODE_FMT_LOCAL) return __this_address; } + /* + * For all types, check that when the size says the fork should + * be in extent or btree format, the inode isn't claiming to be + * in local format. + */ if (be64_to_cpu(dip->di_size) > fork_size && fork_format == XFS_DINODE_FMT_LOCAL) return __this_address; diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index c013f0ba4f36..4cbcf7a86dbe 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -856,7 +856,7 @@ xfs_ioc_scrubv_metadata( if (vec_bytes > PAGE_SIZE) return -ENOMEM; - uvectors = (void __user *)(uintptr_t)head.svh_vectors; + uvectors = u64_to_user_ptr(head.svh_vectors); vectors = memdup_user(uvectors, vec_bytes); if (IS_ERR(vectors)) return PTR_ERR(vectors); diff --git a/fs/xfs/scrub/xfarray.c b/fs/xfs/scrub/xfarray.c index 9185ae7088d4..cdd13ed9c569 100644 --- a/fs/xfs/scrub/xfarray.c +++ b/fs/xfs/scrub/xfarray.c @@ -822,12 +822,14 @@ xfarray_sort_scan( /* Grab the first folio that backs this array element. */ if (!si->folio) { + struct folio *folio; loff_t next_pos; - si->folio = xfile_get_folio(si->array->xfile, idx_pos, + folio = xfile_get_folio(si->array->xfile, idx_pos, si->array->obj_size, XFILE_ALLOC); - if (IS_ERR(si->folio)) - return PTR_ERR(si->folio); + if (IS_ERR(folio)) + return PTR_ERR(folio); + si->folio = folio; si->first_folio_idx = xfarray_idx(si->array, folio_pos(si->folio) + si->array->obj_size - 1); @@ -1048,6 +1050,7 @@ xfarray_sort( out_free: trace_xfarray_sort_stats(si, error); + xfarray_sort_scan_done(si); kvfree(si); return error; } diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 2b10ac4c5fce..f683b7a9323f 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -746,7 +746,7 @@ xfs_attr_recover_work( struct xfs_attri_log_format *attrp; struct xfs_attri_log_nameval *nv = attrip->attri_nameval; int error; - int total; + unsigned int total = 0; /* * First check the validity of the attr described by the ATTRI. If any @@ -763,7 +763,20 @@ xfs_attr_recover_work( return PTR_ERR(attr); args = attr->xattri_da_args; - xfs_init_attr_trans(args, &resv, &total); + switch (xfs_attr_intent_op(attr)) { + case XFS_ATTRI_OP_FLAGS_PPTR_SET: + case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE: + case XFS_ATTRI_OP_FLAGS_SET: + case XFS_ATTRI_OP_FLAGS_REPLACE: + resv = xfs_attr_set_resv(args); + total = args->total; + break; + case XFS_ATTRI_OP_FLAGS_PPTR_REMOVE: + case XFS_ATTRI_OP_FLAGS_REMOVE: + resv = M_RES(mp)->tr_attrrm; + total = XFS_ATTRRM_SPACE_RES(mp); + break; + } resv = xlog_recover_resv(&resv); error = xfs_trans_alloc(mp, &resv, total, 0, XFS_TRANS_RESERVE, &tp); if (error) diff --git a/fs/xfs/xfs_handle.c b/fs/xfs/xfs_handle.c index c8785ed59543..a3f16e9b6fe5 100644 --- a/fs/xfs/xfs_handle.c +++ b/fs/xfs/xfs_handle.c @@ -773,11 +773,6 @@ xfs_getparents_expand_lastrec( trace_xfs_getparents_expand_lastrec(gpx->ip, gp, &gpx->context, gpr); } -static inline void __user *u64_to_uptr(u64 val) -{ - return (void __user *)(uintptr_t)val; -} - /* Retrieve the parent pointers for a given inode. */ STATIC int xfs_getparents( @@ -862,7 +857,7 @@ xfs_getparents( ASSERT(gpx->context.firstu <= gpx->gph.gph_request.gp_bufsize); /* Copy the records to userspace. */ - if (copy_to_user(u64_to_uptr(gpx->gph.gph_request.gp_buffer), + if (copy_to_user(u64_to_user_ptr(gpx->gph.gph_request.gp_buffer), gpx->krecords, gpx->context.firstu)) error = -EFAULT; diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c index 730c8d48da28..86f14ec7c31f 100644 --- a/fs/xfs/xfs_iwalk.c +++ b/fs/xfs/xfs_iwalk.c @@ -351,7 +351,6 @@ xfs_iwalk_run_callbacks( int *has_more) { struct xfs_mount *mp = iwag->mp; - struct xfs_inobt_rec_incore *irec; xfs_agino_t next_agino; int error; @@ -361,8 +360,8 @@ xfs_iwalk_run_callbacks( /* Delete cursor but remember the last record we cached... */ xfs_iwalk_del_inobt(iwag->tp, curpp, agi_bpp, 0); - irec = &iwag->recs[iwag->nr_recs - 1]; - ASSERT(next_agino >= irec->ir_startino + XFS_INODES_PER_CHUNK); + ASSERT(next_agino >= iwag->recs[iwag->nr_recs - 1].ir_startino + + XFS_INODES_PER_CHUNK); if (iwag->drop_trans) { xfs_trans_cancel(iwag->tp); diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 063a2e00d169..265a2a418bc7 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1387,6 +1387,7 @@ xfs_reflink_remap_blocks( destoff += imap.br_blockcount; len -= imap.br_blockcount; remapped_len += imap.br_blockcount; + cond_resched(); } if (error) |