diff options
Diffstat (limited to 'fs')
295 files changed, 7621 insertions, 4941 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 22f7ccd58d38..0f628041e3f7 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -460,8 +460,10 @@ static int __init init_v9fs(void) ret = v9fs_mux_global_init(); if (!ret) - ret = register_filesystem(&v9fs_fs_type); - + return ret; + ret = register_filesystem(&v9fs_fs_type); + if (!ret) + v9fs_mux_global_exit(); return ret; } diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index eae50c9d6dc4..7a7ec2d1d2f4 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -204,7 +204,6 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode) inode->i_mode = mode; inode->i_uid = current->fsuid; inode->i_gid = current->fsgid; - inode->i_blksize = sb->s_blocksize; inode->i_blocks = 0; inode->i_rdev = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; @@ -950,9 +949,8 @@ v9fs_stat2inode(struct v9fs_stat *stat, struct inode *inode, inode->i_size = stat->length; - inode->i_blksize = sb->s_blocksize; inode->i_blocks = - (inode->i_size + inode->i_blksize - 1) >> sb->s_blocksize_bits; + (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; } /** diff --git a/fs/Kconfig b/fs/Kconfig index 530581628311..4fd9efac29ab 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -826,6 +826,25 @@ config PROC_VMCORE help Exports the dump image of crashed kernel in ELF format. +config PROC_SYSCTL + bool "Sysctl support (/proc/sys)" if EMBEDDED + depends on PROC_FS + select SYSCTL + default y + ---help--- + The sysctl interface provides a means of dynamically changing + certain kernel parameters and variables on the fly without requiring + a recompile of the kernel or reboot of the system. The primary + interface is through /proc/sys. If you say Y here a tree of + modifiable sysctl entries will be generated beneath the + /proc/sys directory. They are explained in the files + in <file:Documentation/sysctl/>. Note that enabling this + option will enlarge the kernel by at least 8 KB. + + As it is generally a good thing, you should say Y here unless + building a kernel for install/rescue disks or your system is very + limited in memory. + config SYSFS bool "sysfs file system support" if EMBEDDED default y @@ -862,6 +881,19 @@ config TMPFS See <file:Documentation/filesystems/tmpfs.txt> for details. +config TMPFS_POSIX_ACL + bool "Tmpfs POSIX Access Control Lists" + depends on TMPFS + select GENERIC_ACL + help + POSIX Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the POSIX ACLs for + Linux website <http://acl.bestbits.at/>. + + If you don't know what Access Control Lists are, say N. + config HUGETLBFS bool "HugeTLB file system support" depends X86 || IA64 || PPC64 || SPARC64 || SUPERH || BROKEN @@ -1471,8 +1503,8 @@ config NFS_V4 If unsure, say N. config NFS_DIRECTIO - bool "Allow direct I/O on NFS files (EXPERIMENTAL)" - depends on NFS_FS && EXPERIMENTAL + bool "Allow direct I/O on NFS files" + depends on NFS_FS help This option enables applications to perform uncached I/O on files in NFS file systems using the O_DIRECT open() flag. When O_DIRECT @@ -1921,6 +1953,10 @@ config 9P_FS If unsure, say N. +config GENERIC_ACL + bool + select FS_POSIX_ACL + endmenu menu "Partition Types" diff --git a/fs/Makefile b/fs/Makefile index 89135428a539..46b8cfe497b2 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -35,6 +35,7 @@ obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o obj-$(CONFIG_FS_MBCACHE) += mbcache.o obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o obj-$(CONFIG_NFS_COMMON) += nfs_common/ +obj-$(CONFIG_GENERIC_ACL) += generic_acl.o obj-$(CONFIG_QUOTA) += dquot.o obj-$(CONFIG_QFMT_V1) += quota_v1.o diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index 534f3eecc985..7e7a04be1278 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -269,7 +269,6 @@ adfs_iget(struct super_block *sb, struct object_info *obj) inode->i_ino = obj->file_id; inode->i_size = obj->size; inode->i_nlink = 2; - inode->i_blksize = PAGE_SIZE; inode->i_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 82011019494c..9ade139086fc 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -251,8 +251,7 @@ static int init_inodecache(void) static void destroy_inodecache(void) { - if (kmem_cache_destroy(adfs_inode_cachep)) - printk(KERN_INFO "adfs_inode_cache: not all structures were freed\n"); + kmem_cache_destroy(adfs_inode_cachep); } static struct super_operations adfs_sops = { @@ -339,11 +338,10 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_flags |= MS_NODIRATIME; - asb = kmalloc(sizeof(*asb), GFP_KERNEL); + asb = kzalloc(sizeof(*asb), GFP_KERNEL); if (!asb) return -ENOMEM; sb->s_fs_info = asb; - memset(asb, 0, sizeof(*asb)); /* set default options */ asb->s_uid = 0; diff --git a/fs/affs/affs.h b/fs/affs/affs.h index 0ddd4cc0d1a0..1dc8438ef389 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -1,7 +1,6 @@ #include <linux/types.h> #include <linux/fs.h> #include <linux/buffer_head.h> -#include <linux/affs_fs.h> #include <linux/amigaffs.h> /* AmigaOS allows file names with up to 30 characters length. diff --git a/fs/affs/super.c b/fs/affs/super.c index 5200f4938df0..5ea72c3a16c3 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -14,6 +14,7 @@ #include <linux/init.h> #include <linux/statfs.h> #include <linux/parser.h> +#include <linux/magic.h> #include "affs.h" extern struct timezone sys_tz; @@ -108,8 +109,7 @@ static int init_inodecache(void) static void destroy_inodecache(void) { - if (kmem_cache_destroy(affs_inode_cachep)) - printk(KERN_INFO "affs_inode_cache: not all structures were freed\n"); + kmem_cache_destroy(affs_inode_cachep); } static struct super_operations affs_sops = { @@ -279,11 +279,10 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) sb->s_op = &affs_sops; sb->s_flags |= MS_NODIRATIME; - sbi = kmalloc(sizeof(struct affs_sb_info), GFP_KERNEL); + sbi = kzalloc(sizeof(struct affs_sb_info), GFP_KERNEL); if (!sbi) return -ENOMEM; sb->s_fs_info = sbi; - memset(sbi, 0, sizeof(*sbi)); init_MUTEX(&sbi->s_bmlock); if (!parse_options(data,&uid,&gid,&i,&reserved,&root_block, diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 4ebb30a50ed5..6f37754906c2 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -72,7 +72,6 @@ static int afs_inode_map_status(struct afs_vnode *vnode) inode->i_ctime.tv_sec = vnode->status.mtime_server; inode->i_ctime.tv_nsec = 0; inode->i_atime = inode->i_mtime = inode->i_ctime; - inode->i_blksize = PAGE_CACHE_SIZE; inode->i_blocks = 0; inode->i_version = vnode->fid.unique; inode->i_mapping->a_ops = &afs_fs_aops; diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 101d21b6c037..86463ec9ccb4 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -775,6 +775,7 @@ static int afs_proc_cell_servers_release(struct inode *inode, * first item */ static void *afs_proc_cell_servers_start(struct seq_file *m, loff_t *_pos) + __acquires(m->private->sv_lock) { struct list_head *_p; struct afs_cell *cell = m->private; @@ -823,6 +824,7 @@ static void *afs_proc_cell_servers_next(struct seq_file *p, void *v, * clean up after reading from the cells list */ static void afs_proc_cell_servers_stop(struct seq_file *p, void *v) + __releases(p->private->sv_lock) { struct afs_cell *cell = p->private; diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c index 331f730a1fb3..782ee7c600ca 100644 --- a/fs/afs/vlocation.c +++ b/fs/afs/vlocation.c @@ -281,11 +281,10 @@ int afs_vlocation_lookup(struct afs_cell *cell, spin_unlock(&cell->vl_gylock); /* not in the cell's in-memory lists - create a new record */ - vlocation = kmalloc(sizeof(struct afs_vlocation), GFP_KERNEL); + vlocation = kzalloc(sizeof(struct afs_vlocation), GFP_KERNEL); if (!vlocation) return -ENOMEM; - memset(vlocation, 0, sizeof(struct afs_vlocation)); atomic_set(&vlocation->usage, 1); INIT_LIST_HEAD(&vlocation->link); rwlock_init(&vlocation->lock); diff --git a/fs/afs/volume.c b/fs/afs/volume.c index 0ff4b86476e3..768c6dbd323a 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -186,11 +186,10 @@ int afs_volume_lookup(const char *name, struct afs_cell *cell, int rwpath, _debug("creating new volume record"); ret = -ENOMEM; - volume = kmalloc(sizeof(struct afs_volume), GFP_KERNEL); + volume = kzalloc(sizeof(struct afs_volume), GFP_KERNEL); if (!volume) goto error_up; - memset(volume, 0, sizeof(struct afs_volume)); atomic_set(&volume->usage, 1); volume->type = type; volume->type_force = force; diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h index a62327f1bdff..c7700d9b3f96 100644 --- a/fs/autofs/autofs_i.h +++ b/fs/autofs/autofs_i.h @@ -37,8 +37,6 @@ #define DPRINTK(D) ((void)0) #endif -#define AUTOFS_SUPER_MAGIC 0x0187 - /* * If the daemon returns a negative response (AUTOFS_IOC_FAIL) then the * kernel will keep the negative response cached for up to the time given diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c index 65e5ed42190e..2c9759baad61 100644 --- a/fs/autofs/inode.c +++ b/fs/autofs/inode.c @@ -16,6 +16,7 @@ #include <linux/file.h> #include <linux/parser.h> #include <linux/bitops.h> +#include <linux/magic.h> #include "autofs_i.h" #include <linux/module.h> @@ -128,10 +129,9 @@ int autofs_fill_super(struct super_block *s, void *data, int silent) struct autofs_sb_info *sbi; int minproto, maxproto; - sbi = kmalloc(sizeof(*sbi), GFP_KERNEL); + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if ( !sbi ) goto fail_unlock; - memset(sbi, 0, sizeof(*sbi)); DPRINTK(("autofs: starting up, sbi = %p\n",sbi)); s->s_fs_info = sbi; @@ -216,7 +216,6 @@ static void autofs_read_inode(struct inode *inode) inode->i_nlink = 2; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; inode->i_blocks = 0; - inode->i_blksize = 1024; if ( ino == AUTOFS_ROOT_INO ) { inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR; @@ -241,7 +240,7 @@ static void autofs_read_inode(struct inode *inode) inode->i_op = &autofs_symlink_inode_operations; sl = &sbi->symlink[n]; - inode->u.generic_ip = sl; + inode->i_private = sl; inode->i_mode = S_IFLNK | S_IRWXUGO; inode->i_mtime.tv_sec = inode->i_ctime.tv_sec = sl->mtime; inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = 0; diff --git a/fs/autofs/symlink.c b/fs/autofs/symlink.c index 52e8772b066e..c74f2eb65775 100644 --- a/fs/autofs/symlink.c +++ b/fs/autofs/symlink.c @@ -15,7 +15,7 @@ /* Nothing to release.. */ static void *autofs_follow_link(struct dentry *dentry, struct nameidata *nd) { - char *s=((struct autofs_symlink *)dentry->d_inode->u.generic_ip)->data; + char *s=((struct autofs_symlink *)dentry->d_inode->i_private)->data; nd_set_link(nd, s); return NULL; } diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index d6603d02304c..480ab178cba5 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -40,8 +40,6 @@ #define DPRINTK(fmt,args...) do {} while(0) #endif -#define AUTOFS_SUPER_MAGIC 0x0187 - /* Unified info structure. This is pointed to by both the dentry and inode structures. Each file in the filesystem has an instance of this structure. It holds a reference to the dentry, so dentries are never diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 8dbd44f10e9d..d96e5c14a9ca 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -32,7 +32,7 @@ static inline int autofs4_can_expire(struct dentry *dentry, if (!do_now) { /* Too young to die */ - if (time_after(ino->last_used + timeout, now)) + if (!timeout || time_after(ino->last_used + timeout, now)) return 0; /* update last_used here :- @@ -253,7 +253,7 @@ static struct dentry *autofs4_expire_direct(struct super_block *sb, struct dentry *root = dget(sb->s_root); int do_now = how & AUTOFS_EXP_IMMEDIATE; - if (!sbi->exp_timeout || !root) + if (!root) return NULL; now = jiffies; @@ -293,7 +293,7 @@ static struct dentry *autofs4_expire_indirect(struct super_block *sb, int do_now = how & AUTOFS_EXP_IMMEDIATE; int exp_leaves = how & AUTOFS_EXP_LEAVES; - if ( !sbi->exp_timeout || !root ) + if (!root) return NULL; now = jiffies; diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index fde78b110ddd..800ce876caec 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -19,6 +19,7 @@ #include <linux/parser.h> #include <linux/bitops.h> #include <linux/smp_lock.h> +#include <linux/magic.h> #include "autofs_i.h" #include <linux/module.h> @@ -446,7 +447,6 @@ struct inode *autofs4_get_inode(struct super_block *sb, inode->i_uid = 0; inode->i_gid = 0; } - inode->i_blksize = PAGE_CACHE_SIZE; inode->i_blocks = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 5100f984783f..563ef9d7da9f 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -137,7 +137,9 @@ static int autofs4_dir_open(struct inode *inode, struct file *file) nd.flags = LOOKUP_DIRECTORY; ret = (dentry->d_op->d_revalidate)(dentry, &nd); - if (!ret) { + if (ret <= 0) { + if (ret < 0) + status = ret; dcache_dir_close(inode, file); goto out; } @@ -279,9 +281,6 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags) DPRINTK("mount done status=%d", status); - if (status && dentry->d_inode) - return status; /* Try to get the kernel to invalidate this dentry */ - /* Turn this into a real negative dentry? */ if (status == -ENOENT) { spin_lock(&dentry->d_lock); @@ -357,7 +356,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) * don't try to mount it again. */ spin_lock(&dcache_lock); - if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) { + if (!d_mountpoint(dentry) && __simple_empty(dentry)) { spin_unlock(&dcache_lock); status = try_to_fill_dentry(dentry, 0); @@ -400,13 +399,23 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd) struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); int oz_mode = autofs4_oz_mode(sbi); int flags = nd ? nd->flags : 0; - int status = 0; + int status = 1; /* Pending dentry */ if (autofs4_ispending(dentry)) { - if (!oz_mode) - status = try_to_fill_dentry(dentry, flags); - return !status; + /* The daemon never causes a mount to trigger */ + if (oz_mode) + return 1; + + /* + * A zero status is success otherwise we have a + * negative error code. + */ + status = try_to_fill_dentry(dentry, flags); + if (status == 0) + return 1; + + return status; } /* Negative dentry.. invalidate if "old" */ @@ -421,9 +430,19 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd) DPRINTK("dentry=%p %.*s, emptydir", dentry, dentry->d_name.len, dentry->d_name.name); spin_unlock(&dcache_lock); - if (!oz_mode) - status = try_to_fill_dentry(dentry, flags); - return !status; + /* The daemon never causes a mount to trigger */ + if (oz_mode) + return 1; + + /* + * A zero status is success otherwise we have a + * negative error code. + */ + status = try_to_fill_dentry(dentry, flags); + if (status == 0) + return 1; + + return status; } spin_unlock(&dcache_lock); @@ -518,6 +537,9 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s return ERR_PTR(-ERESTARTNOINTR); } } + spin_lock(&dentry->d_lock); + dentry->d_flags &= ~DCACHE_AUTOFS_PENDING; + spin_unlock(&dentry->d_lock); } /* diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 50cfca5c7efd..57020c7a7e65 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -365,7 +365,6 @@ befs_read_inode(struct inode *inode) inode->i_mtime.tv_nsec = 0; /* lower 16 bits are not a time */ inode->i_ctime = inode->i_mtime; inode->i_atime = inode->i_mtime; - inode->i_blksize = befs_sb->block_size; befs_ino->i_inode_num = fsrun_to_cpu(sb, raw_inode->inode_num); befs_ino->i_parent = fsrun_to_cpu(sb, raw_inode->parent); @@ -446,9 +445,7 @@ befs_init_inodecache(void) static void befs_destroy_inodecache(void) { - if (kmem_cache_destroy(befs_inode_cachep)) - printk(KERN_ERR "befs_destroy_inodecache: " - "not all structures were freed\n"); + kmem_cache_destroy(befs_inode_cachep); } /* diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index 26fad9621738..dcf04cb13283 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -102,7 +102,7 @@ static int bfs_create(struct inode * dir, struct dentry * dentry, int mode, inode->i_uid = current->fsuid; inode->i_gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current->fsgid; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; - inode->i_blocks = inode->i_blksize = 0; + inode->i_blocks = 0; inode->i_op = &bfs_file_inops; inode->i_fop = &bfs_file_operations; inode->i_mapping->a_ops = &bfs_aops; diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index cf74f3d4d966..ed27ffb3459e 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -76,7 +76,6 @@ static void bfs_read_inode(struct inode * inode) inode->i_size = BFS_FILESIZE(di); inode->i_blocks = BFS_FILEBLOCKS(di); if (inode->i_size || inode->i_blocks) dprintf("Registered inode with %lld size, %ld blocks\n", inode->i_size, inode->i_blocks); - inode->i_blksize = PAGE_SIZE; inode->i_atime.tv_sec = le32_to_cpu(di->i_atime); inode->i_mtime.tv_sec = le32_to_cpu(di->i_mtime); inode->i_ctime.tv_sec = le32_to_cpu(di->i_ctime); @@ -268,8 +267,7 @@ static int init_inodecache(void) static void destroy_inodecache(void) { - if (kmem_cache_destroy(bfs_inode_cachep)) - printk(KERN_INFO "bfs_inode_cache: not all structures were freed\n"); + kmem_cache_destroy(bfs_inode_cachep); } static struct super_operations bfs_sops = { @@ -311,11 +309,10 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) unsigned i, imap_len; struct bfs_sb_info * info; - info = kmalloc(sizeof(*info), GFP_KERNEL); + info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return -ENOMEM; s->s_fs_info = info; - memset(info, 0, sizeof(*info)); sb_set_blocksize(s, BFS_BSIZE); @@ -338,10 +335,9 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) + BFS_ROOT_INO - 1; imap_len = info->si_lasti/8 + 1; - info->si_imap = kmalloc(imap_len, GFP_KERNEL); + info->si_imap = kzalloc(imap_len, GFP_KERNEL); if (!info->si_imap) goto out; - memset(info->si_imap, 0, imap_len); for (i=0; i<BFS_ROOT_INO; i++) set_bit(i, info->si_imap); diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index f312103434d4..517e111bb7ef 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -278,6 +278,13 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) return -ENOEXEC; } + /* + * Requires a mmap handler. This prevents people from using a.out + * as part of an exploit attack against /proc-related vulnerabilities. + */ + if (!bprm->file->f_op || !bprm->file->f_op->mmap) + return -ENOEXEC; + fd_offset = N_TXTOFF(ex); /* Check initial limits. This avoids letting people circumvent @@ -476,6 +483,13 @@ static int load_aout_library(struct file *file) goto out; } + /* + * Requires a mmap handler. This prevents people from using a.out + * as part of an exploit attack against /proc-related vulnerabilities. + */ + if (!file->f_op || !file->f_op->mmap) + goto out; + if (N_FLAGS(ex)) goto out; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 672a3b90bc55..6eb48e1446ec 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -515,7 +515,8 @@ static unsigned long randomize_stack_top(unsigned long stack_top) { unsigned int random_variable = 0; - if (current->flags & PF_RANDOMIZE) { + if ((current->flags & PF_RANDOMIZE) && + !(current->personality & ADDR_NO_RANDOMIZE)) { random_variable = get_random_int() & STACK_RND_MASK; random_variable <<= PAGE_SHIFT; } @@ -1037,10 +1038,8 @@ out_free_interp: out_free_file: sys_close(elf_exec_fileno); out_free_fh: - if (files) { - put_files_struct(current->files); - current->files = files; - } + if (files) + reset_files_struct(current, files); out_free_ph: kfree(elf_phdata); goto out; @@ -1262,7 +1261,7 @@ static void fill_elf_header(struct elfhdr *elf, int segs) return; } -static void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, off_t offset) +static void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offset) { phdr->p_type = PT_NOTE; phdr->p_offset = offset; @@ -1428,7 +1427,7 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file) int i; struct vm_area_struct *vma; struct elfhdr *elf = NULL; - off_t offset = 0, dataoff; + loff_t offset = 0, dataoff; unsigned long limit = current->signal->rlim[RLIMIT_CORE].rlim_cur; int numnote; struct memelfnote *notes = NULL; @@ -1480,20 +1479,19 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file) if (signr) { struct elf_thread_status *tmp; - read_lock(&tasklist_lock); + rcu_read_lock(); do_each_thread(g,p) if (current->mm == p->mm && current != p) { tmp = kzalloc(sizeof(*tmp), GFP_ATOMIC); if (!tmp) { - read_unlock(&tasklist_lock); + rcu_read_unlock(); goto cleanup; } - INIT_LIST_HEAD(&tmp->list); tmp->thread = p; list_add(&tmp->list, &thread_list); } while_each_thread(g,p); - read_unlock(&tasklist_lock); + rcu_read_unlock(); list_for_each(t, &thread_list) { struct elf_thread_status *tmp; int sz; @@ -1661,11 +1659,11 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file) ELF_CORE_WRITE_EXTRA_DATA; #endif - if ((off_t)file->f_pos != offset) { + if (file->f_pos != offset) { /* Sanity check */ printk(KERN_WARNING - "elf_core_dump: file->f_pos (%ld) != offset (%ld)\n", - (off_t)file->f_pos, offset); + "elf_core_dump: file->f_pos (%Ld) != offset (%Ld)\n", + file->f_pos, offset); } end_coredump: diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 2f3365829229..f86d5c9ce5eb 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1597,20 +1597,19 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, if (signr) { struct elf_thread_status *tmp; - read_lock(&tasklist_lock); + rcu_read_lock(); do_each_thread(g,p) if (current->mm == p->mm && current != p) { tmp = kzalloc(sizeof(*tmp), GFP_ATOMIC); if (!tmp) { - read_unlock(&tasklist_lock); + rcu_read_unlock(); goto cleanup; } - INIT_LIST_HEAD(&tmp->list); tmp->thread = p; list_add(&tmp->list, &thread_list); } while_each_thread(g,p); - read_unlock(&tasklist_lock); + rcu_read_unlock(); list_for_each(t, &thread_list) { struct elf_thread_status *tmp; int sz; diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 34ebbc191e46..1713c48fef54 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -215,10 +215,8 @@ _error: bprm->interp_flags = 0; bprm->interp_data = 0; _unshare: - if (files) { - put_files_struct(current->files); - current->files = files; - } + if (files) + reset_files_struct(current, files); goto _ret; } @@ -507,7 +505,6 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode) inode->i_mode = mode; inode->i_uid = 0; inode->i_gid = 0; - inode->i_blksize = PAGE_CACHE_SIZE; inode->i_blocks = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb); @@ -517,7 +514,7 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode) static void bm_clear_inode(struct inode *inode) { - kfree(inode->u.generic_ip); + kfree(inode->i_private); } static void kill_node(Node *e) @@ -545,7 +542,7 @@ static void kill_node(Node *e) static ssize_t bm_entry_read(struct file * file, char __user * buf, size_t nbytes, loff_t *ppos) { - Node *e = file->f_dentry->d_inode->u.generic_ip; + Node *e = file->f_dentry->d_inode->i_private; loff_t pos = *ppos; ssize_t res; char *page; @@ -579,7 +576,7 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { struct dentry *root; - Node *e = file->f_dentry->d_inode->u.generic_ip; + Node *e = file->f_dentry->d_inode->i_private; int res = parse_command(buffer, count); switch (res) { @@ -646,7 +643,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, } e->dentry = dget(dentry); - inode->u.generic_ip = e; + inode->i_private = e; inode->i_fop = &bm_entry_operations; d_instantiate(dentry, inode); diff --git a/fs/block_dev.c b/fs/block_dev.c index 045f98854f14..4346468139e8 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -543,11 +543,11 @@ static struct kobject *bdev_get_holder(struct block_device *bdev) return kobject_get(bdev->bd_disk->holder_dir); } -static void add_symlink(struct kobject *from, struct kobject *to) +static int add_symlink(struct kobject *from, struct kobject *to) { if (!from || !to) - return; - sysfs_create_link(from, to, kobject_name(to)); + return 0; + return sysfs_create_link(from, to, kobject_name(to)); } static void del_symlink(struct kobject *from, struct kobject *to) @@ -648,30 +648,38 @@ static void free_bd_holder(struct bd_holder *bo) * If there is no matching entry with @bo in @bdev->bd_holder_list, * add @bo to the list, create symlinks. * - * Returns 1 if @bo was added to the list. - * Returns 0 if @bo wasn't used by any reason and should be freed. + * Returns 0 if symlinks are created or already there. + * Returns -ve if something fails and @bo can be freed. */ static int add_bd_holder(struct block_device *bdev, struct bd_holder *bo) { struct bd_holder *tmp; + int ret; if (!bo) - return 0; + return -EINVAL; list_for_each_entry(tmp, &bdev->bd_holder_list, list) { if (tmp->sdir == bo->sdir) { tmp->count++; + /* We've already done what we need to do here. */ + free_bd_holder(bo); return 0; } } if (!bd_holder_grab_dirs(bdev, bo)) - return 0; + return -EBUSY; - add_symlink(bo->sdir, bo->sdev); - add_symlink(bo->hdir, bo->hdev); - list_add_tail(&bo->list, &bdev->bd_holder_list); - return 1; + ret = add_symlink(bo->sdir, bo->sdev); + if (ret == 0) { + ret = add_symlink(bo->hdir, bo->hdev); + if (ret) + del_symlink(bo->sdir, bo->sdev); + } + if (ret == 0) + list_add_tail(&bo->list, &bdev->bd_holder_list); + return ret; } /** @@ -741,7 +749,9 @@ static int bd_claim_by_kobject(struct block_device *bdev, void *holder, mutex_lock_nested(&bdev->bd_mutex, BD_MUTEX_PARTITION); res = bd_claim(bdev, holder); - if (res || !add_bd_holder(bdev, bo)) + if (res == 0) + res = add_bd_holder(bdev, bo); + if (res) free_bd_holder(bo); mutex_unlock(&bdev->bd_mutex); @@ -1021,7 +1031,7 @@ do_open(struct block_device *bdev, struct file *file, unsigned int subclass) rescan_partitions(bdev->bd_disk, bdev); } else { mutex_lock_nested(&bdev->bd_contains->bd_mutex, - BD_MUTEX_PARTITION); + BD_MUTEX_WHOLE); bdev->bd_contains->bd_part_count++; mutex_unlock(&bdev->bd_contains->bd_mutex); } diff --git a/fs/buffer.c b/fs/buffer.c index 71649ef9b658..3b6d701073e7 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2987,6 +2987,7 @@ int try_to_free_buffers(struct page *page) spin_lock(&mapping->private_lock); ret = drop_buffers(page, &buffers_to_free); + spin_unlock(&mapping->private_lock); if (ret) { /* * If the filesystem writes its buffers by hand (eg ext3) @@ -2998,7 +2999,6 @@ int try_to_free_buffers(struct page *page) */ clear_page_dirty(page); } - spin_unlock(&mapping->private_lock); out: if (buffers_to_free) { struct buffer_head *bh = buffers_to_free; diff --git a/fs/char_dev.c b/fs/char_dev.c index 3483d3cf8087..1f3285affa39 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -19,11 +19,30 @@ #include <linux/kobj_map.h> #include <linux/cdev.h> #include <linux/mutex.h> +#include <linux/backing-dev.h> #ifdef CONFIG_KMOD #include <linux/kmod.h> #endif +/* + * capabilities for /dev/mem, /dev/kmem and similar directly mappable character + * devices + * - permits shared-mmap for read, write and/or exec + * - does not permit private mmap in NOMMU mode (can't do COW) + * - no readahead or I/O queue unplugging required + */ +struct backing_dev_info directly_mappable_cdev_bdi = { + .capabilities = ( +#ifdef CONFIG_MMU + /* permit private copies of the data to be taken */ + BDI_CAP_MAP_COPY | +#endif + /* permit direct mmap, for read, write or exec */ + BDI_CAP_MAP_DIRECT | + BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP), +}; + static struct kobj_map *cdev_map; static DEFINE_MUTEX(chrdevs_lock); @@ -109,13 +128,31 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor, for (cp = &chrdevs[i]; *cp; cp = &(*cp)->next) if ((*cp)->major > major || - ((*cp)->major == major && (*cp)->baseminor >= baseminor)) + ((*cp)->major == major && + (((*cp)->baseminor >= baseminor) || + ((*cp)->baseminor + (*cp)->minorct > baseminor)))) break; - if (*cp && (*cp)->major == major && - (*cp)->baseminor < baseminor + minorct) { - ret = -EBUSY; - goto out; + + /* Check for overlapping minor ranges. */ + if (*cp && (*cp)->major == major) { + int old_min = (*cp)->baseminor; + int old_max = (*cp)->baseminor + (*cp)->minorct - 1; + int new_min = baseminor; + int new_max = baseminor + minorct - 1; + + /* New driver overlaps from the left. */ + if (new_max >= old_min && new_max <= old_max) { + ret = -EBUSY; + goto out; + } + + /* New driver overlaps from the right. */ + if (new_min <= old_max && new_min >= old_min) { + ret = -EBUSY; + goto out; + } } + cd->next = *cp; *cp = cd; mutex_unlock(&chrdevs_lock); @@ -146,6 +183,15 @@ __unregister_chrdev_region(unsigned major, unsigned baseminor, int minorct) return cd; } +/** + * register_chrdev_region() - register a range of device numbers + * @from: the first in the desired range of device numbers; must include + * the major number. + * @count: the number of consecutive device numbers required + * @name: the name of the device or driver. + * + * Return value is zero on success, a negative error code on failure. + */ int register_chrdev_region(dev_t from, unsigned count, const char *name) { struct char_device_struct *cd; @@ -171,6 +217,17 @@ fail: return PTR_ERR(cd); } +/** + * alloc_chrdev_region() - register a range of char device numbers + * @dev: output parameter for first assigned number + * @baseminor: first of the requested range of minor numbers + * @count: the number of minor numbers required + * @name: the name of the associated device or driver + * + * Allocates a range of char device numbers. The major number will be + * chosen dynamically, and returned (along with the first minor number) + * in @dev. Returns zero or a negative error code. + */ int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count, const char *name) { @@ -240,6 +297,15 @@ out2: return err; } +/** + * unregister_chrdev_region() - return a range of device numbers + * @from: the first in the range of numbers to unregister + * @count: the number of device numbers to unregister + * + * This function will unregister a range of @count device numbers, + * starting with @from. The caller should normally be the one who + * allocated those numbers in the first place... + */ void unregister_chrdev_region(dev_t from, unsigned count) { dev_t to = from + count; @@ -377,6 +443,16 @@ static int exact_lock(dev_t dev, void *data) return cdev_get(p) ? 0 : -1; } +/** + * cdev_add() - add a char device to the system + * @p: the cdev structure for the device + * @dev: the first device number for which this device is responsible + * @count: the number of consecutive minor numbers corresponding to this + * device + * + * cdev_add() adds the device represented by @p to the system, making it + * live immediately. A negative error code is returned on failure. + */ int cdev_add(struct cdev *p, dev_t dev, unsigned count) { p->dev = dev; @@ -389,6 +465,13 @@ static void cdev_unmap(dev_t dev, unsigned count) kobj_unmap(cdev_map, dev, count); } +/** + * cdev_del() - remove a cdev from the system + * @p: the cdev structure to be removed + * + * cdev_del() removes @p from the system, possibly freeing the structure + * itself. + */ void cdev_del(struct cdev *p) { cdev_unmap(p->dev, p->count); @@ -417,6 +500,11 @@ static struct kobj_type ktype_cdev_dynamic = { .release = cdev_dynamic_release, }; +/** + * cdev_alloc() - allocate a cdev structure + * + * Allocates and returns a cdev structure, or NULL on failure. + */ struct cdev *cdev_alloc(void) { struct cdev *p = kzalloc(sizeof(struct cdev), GFP_KERNEL); @@ -428,6 +516,14 @@ struct cdev *cdev_alloc(void) return p; } +/** + * cdev_init() - initialize a cdev structure + * @cdev: the structure to initialize + * @fops: the file_operations for this device + * + * Initializes @cdev, remembering @fops, making it ready to add to the + * system with cdev_add(). + */ void cdev_init(struct cdev *cdev, const struct file_operations *fops) { memset(cdev, 0, sizeof *cdev); @@ -461,3 +557,4 @@ EXPORT_SYMBOL(cdev_del); EXPORT_SYMBOL(cdev_add); EXPORT_SYMBOL(register_chrdev); EXPORT_SYMBOL(unregister_chrdev); +EXPORT_SYMBOL(directly_mappable_cdev_bdi); diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index c3ef1c0d0e68..22bcf4d7e7ae 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -253,7 +253,6 @@ cifs_alloc_inode(struct super_block *sb) file data or metadata */ cifs_inode->clientCanCacheRead = FALSE; cifs_inode->clientCanCacheAll = FALSE; - cifs_inode->vfs_inode.i_blksize = CIFS_MAX_MSGSIZE; cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ cifs_inode->vfs_inode.i_flags = S_NOATIME | S_NOCMTIME; INIT_LIST_HEAD(&cifs_inode->openFileList); @@ -699,8 +698,7 @@ cifs_init_inodecache(void) static void cifs_destroy_inodecache(void) { - if (kmem_cache_destroy(cifs_inode_cachep)) - printk(KERN_WARNING "cifs_inode_cache: error freeing\n"); + kmem_cache_destroy(cifs_inode_cachep); } static int @@ -778,13 +776,9 @@ static void cifs_destroy_request_bufs(void) { mempool_destroy(cifs_req_poolp); - if (kmem_cache_destroy(cifs_req_cachep)) - printk(KERN_WARNING - "cifs_destroy_request_cache: error not all structures were freed\n"); + kmem_cache_destroy(cifs_req_cachep); mempool_destroy(cifs_sm_req_poolp); - if (kmem_cache_destroy(cifs_sm_req_cachep)) - printk(KERN_WARNING - "cifs_destroy_request_cache: cifs_small_rq free error\n"); + kmem_cache_destroy(cifs_sm_req_cachep); } static int @@ -819,13 +813,8 @@ static void cifs_destroy_mids(void) { mempool_destroy(cifs_mid_poolp); - if (kmem_cache_destroy(cifs_mid_cachep)) - printk(KERN_WARNING - "cifs_destroy_mids: error not all structures were freed\n"); - - if (kmem_cache_destroy(cifs_oplock_cachep)) - printk(KERN_WARNING - "error not all oplock structures were freed\n"); + kmem_cache_destroy(cifs_mid_cachep); + kmem_cache_destroy(cifs_oplock_cachep); } static int cifs_oplock_thread(void * dummyarg) diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 9aeb58a7d369..b27b34537bf2 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -216,10 +216,9 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type, if (allocation_size < end_of_file) cFYI(1, ("May be sparse file, allocation less than file size")); - cFYI(1, ("File Size %ld and blocks %llu and blocksize %ld", + cFYI(1, ("File Size %ld and blocks %llu", (unsigned long)tmp_inode->i_size, - (unsigned long long)tmp_inode->i_blocks, - tmp_inode->i_blksize)); + (unsigned long long)tmp_inode->i_blocks)); if (S_ISREG(tmp_inode->i_mode)) { cFYI(1, ("File inode")); tmp_inode->i_op = &cifs_file_inode_ops; diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c index 5597080cb811..95a54253c047 100644 --- a/fs/coda/coda_linux.c +++ b/fs/coda/coda_linux.c @@ -110,8 +110,6 @@ void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr) inode->i_nlink = attr->va_nlink; if (attr->va_size != -1) inode->i_size = attr->va_size; - if (attr->va_blocksize != -1) - inode->i_blksize = attr->va_blocksize; if (attr->va_size != -1) inode->i_blocks = (attr->va_size + 511) >> 9; if (attr->va_atime.tv_sec != -1) diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 71f2ea632e53..8651ea6a23b7 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -513,7 +513,7 @@ static int coda_venus_readdir(struct file *filp, filldir_t filldir, ino_t ino; int ret, i; - vdir = (struct venus_dirent *)kmalloc(sizeof(*vdir), GFP_KERNEL); + vdir = kmalloc(sizeof(*vdir), GFP_KERNEL); if (!vdir) return -ENOMEM; i = filp->f_pos; diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 87f1dc8aa24b..88d123321164 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -80,8 +80,7 @@ int coda_init_inodecache(void) void coda_destroy_inodecache(void) { - if (kmem_cache_destroy(coda_inode_cachep)) - printk(KERN_INFO "coda_inode_cache: not all structures were freed\n"); + kmem_cache_destroy(coda_inode_cachep); } static int coda_remount(struct super_block *sb, int *flags, char *data) diff --git a/fs/compat.c b/fs/compat.c index e31e9cf96647..ce982f6e8c80 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1855,7 +1855,7 @@ asmlinkage long compat_sys_pselect7(int n, compat_ulong_t __user *inp, } while (!ret && !timeout && tsp && (ts.tv_sec || ts.tv_nsec)); - if (tsp && !(current->personality & STICKY_TIMEOUTS)) { + if (ret == 0 && tsp && !(current->personality & STICKY_TIMEOUTS)) { struct compat_timespec rts; rts.tv_sec = timeout / HZ; @@ -1866,7 +1866,8 @@ asmlinkage long compat_sys_pselect7(int n, compat_ulong_t __user *inp, } if (compat_timespec_compare(&rts, &ts) >= 0) rts = ts; - copy_to_user(tsp, &rts, sizeof(rts)); + if (copy_to_user(tsp, &rts, sizeof(rts))) + ret = -EFAULT; } if (ret == -ERESTARTNOHAND) { diff --git a/fs/configfs/file.c b/fs/configfs/file.c index f499803743e0..85105e50f7db 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -274,9 +274,8 @@ static int check_perm(struct inode * inode, struct file * file) /* No error? Great, allocate a buffer for the file, and store it * it in file->private_data for easy access. */ - buffer = kmalloc(sizeof(struct configfs_buffer),GFP_KERNEL); + buffer = kzalloc(sizeof(struct configfs_buffer),GFP_KERNEL); if (buffer) { - memset(buffer,0,sizeof(struct configfs_buffer)); init_MUTEX(&buffer->sem); buffer->needs_read_fill = 1; buffer->ops = ops; diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index e14488ca6411..fb18917954a9 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -76,11 +76,10 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr) if (!sd_iattr) { /* setting attributes for the first time, allocate now */ - sd_iattr = kmalloc(sizeof(struct iattr), GFP_KERNEL); + sd_iattr = kzalloc(sizeof(struct iattr), GFP_KERNEL); if (!sd_iattr) return -ENOMEM; /* assign default attributes */ - memset(sd_iattr, 0, sizeof(struct iattr)); sd_iattr->ia_mode = sd->s_mode; sd_iattr->ia_uid = 0; sd_iattr->ia_gid = 0; @@ -136,7 +135,6 @@ struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent * sd) { struct inode * inode = new_inode(configfs_sb); if (inode) { - inode->i_blksize = PAGE_CACHE_SIZE; inode->i_blocks = 0; inode->i_mapping->a_ops = &configfs_aops; inode->i_mapping->backing_dev_info = &configfs_backing_dev_info; diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 223c0431042d..a624c3ec8189 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -73,7 +73,6 @@ static int cramfs_iget5_set(struct inode *inode, void *opaque) inode->i_uid = cramfs_inode->uid; inode->i_size = cramfs_inode->size; inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1; - inode->i_blksize = PAGE_CACHE_SIZE; inode->i_gid = cramfs_inode->gid; /* Struct copy intentional */ inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime; @@ -242,11 +241,10 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_flags |= MS_RDONLY; - sbi = kmalloc(sizeof(struct cramfs_sb_info), GFP_KERNEL); + sbi = kzalloc(sizeof(struct cramfs_sb_info), GFP_KERNEL); if (!sbi) return -ENOMEM; sb->s_fs_info = sbi; - memset(sbi, 0, sizeof(struct cramfs_sb_info)); /* Invalidate the read buffers on mount: think disk change.. */ mutex_lock(&read_mutex); @@ -545,8 +543,15 @@ static struct file_system_type cramfs_fs_type = { static int __init init_cramfs_fs(void) { - cramfs_uncompress_init(); - return register_filesystem(&cramfs_fs_type); + int rv; + + rv = cramfs_uncompress_init(); + if (rv < 0) + return rv; + rv = register_filesystem(&cramfs_fs_type); + if (rv < 0) + cramfs_uncompress_exit(); + return rv; } static void __exit exit_cramfs_fs(void) diff --git a/fs/cramfs/uncompress.c b/fs/cramfs/uncompress.c index 8def89f2c438..fc3ccb74626f 100644 --- a/fs/cramfs/uncompress.c +++ b/fs/cramfs/uncompress.c @@ -68,11 +68,10 @@ int cramfs_uncompress_init(void) return 0; } -int cramfs_uncompress_exit(void) +void cramfs_uncompress_exit(void) { if (!--initialized) { zlib_inflateEnd(&stream); vfree(stream.workspace); } - return 0; } diff --git a/fs/dcache.c b/fs/dcache.c index 1b4a3a34ec57..17b392a2049e 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -828,17 +828,19 @@ void d_instantiate(struct dentry *entry, struct inode * inode) * (or otherwise set) by the caller to indicate that it is now * in use by the dcache. */ -struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode) +static struct dentry *__d_instantiate_unique(struct dentry *entry, + struct inode *inode) { struct dentry *alias; int len = entry->d_name.len; const char *name = entry->d_name.name; unsigned int hash = entry->d_name.hash; - BUG_ON(!list_empty(&entry->d_alias)); - spin_lock(&dcache_lock); - if (!inode) - goto do_negative; + if (!inode) { + entry->d_inode = NULL; + return NULL; + } + list_for_each_entry(alias, &inode->i_dentry, d_alias) { struct qstr *qstr = &alias->d_name; @@ -851,19 +853,35 @@ struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode) if (memcmp(qstr->name, name, len)) continue; dget_locked(alias); - spin_unlock(&dcache_lock); - BUG_ON(!d_unhashed(alias)); - iput(inode); return alias; } + list_add(&entry->d_alias, &inode->i_dentry); -do_negative: entry->d_inode = inode; fsnotify_d_instantiate(entry, inode); - spin_unlock(&dcache_lock); - security_d_instantiate(entry, inode); return NULL; } + +struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode) +{ + struct dentry *result; + + BUG_ON(!list_empty(&entry->d_alias)); + + spin_lock(&dcache_lock); + result = __d_instantiate_unique(entry, inode); + spin_unlock(&dcache_lock); + + if (!result) { + security_d_instantiate(entry, inode); + return NULL; + } + + BUG_ON(!d_unhashed(result)); + iput(inode); + return result; +} + EXPORT_SYMBOL(d_instantiate_unique); /** @@ -1235,6 +1253,11 @@ static void __d_rehash(struct dentry * entry, struct hlist_head *list) hlist_add_head_rcu(&entry->d_hash, list); } +static void _d_rehash(struct dentry * entry) +{ + __d_rehash(entry, d_hash(entry->d_parent, entry->d_name.hash)); +} + /** * d_rehash - add an entry back to the hash * @entry: dentry to add to the hash @@ -1244,11 +1267,9 @@ static void __d_rehash(struct dentry * entry, struct hlist_head *list) void d_rehash(struct dentry * entry) { - struct hlist_head *list = d_hash(entry->d_parent, entry->d_name.hash); - spin_lock(&dcache_lock); spin_lock(&entry->d_lock); - __d_rehash(entry, list); + _d_rehash(entry); spin_unlock(&entry->d_lock); spin_unlock(&dcache_lock); } @@ -1386,6 +1407,120 @@ already_unhashed: spin_unlock(&dcache_lock); } +/* + * Prepare an anonymous dentry for life in the superblock's dentry tree as a + * named dentry in place of the dentry to be replaced. + */ +static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon) +{ + struct dentry *dparent, *aparent; + + switch_names(dentry, anon); + do_switch(dentry->d_name.len, anon->d_name.len); + do_switch(dentry->d_name.hash, anon->d_name.hash); + + dparent = dentry->d_parent; + aparent = anon->d_parent; + + dentry->d_parent = (aparent == anon) ? dentry : aparent; + list_del(&dentry->d_u.d_child); + if (!IS_ROOT(dentry)) + list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs); + else + INIT_LIST_HEAD(&dentry->d_u.d_child); + + anon->d_parent = (dparent == dentry) ? anon : dparent; + list_del(&anon->d_u.d_child); + if (!IS_ROOT(anon)) + list_add(&anon->d_u.d_child, &anon->d_parent->d_subdirs); + else + INIT_LIST_HEAD(&anon->d_u.d_child); + + anon->d_flags &= ~DCACHE_DISCONNECTED; +} + +/** + * d_materialise_unique - introduce an inode into the tree + * @dentry: candidate dentry + * @inode: inode to bind to the dentry, to which aliases may be attached + * + * Introduces an dentry into the tree, substituting an extant disconnected + * root directory alias in its place if there is one + */ +struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode) +{ + struct dentry *alias, *actual; + + BUG_ON(!d_unhashed(dentry)); + + spin_lock(&dcache_lock); + + if (!inode) { + actual = dentry; + dentry->d_inode = NULL; + goto found_lock; + } + + /* See if a disconnected directory already exists as an anonymous root + * that we should splice into the tree instead */ + if (S_ISDIR(inode->i_mode) && (alias = __d_find_alias(inode, 1))) { + spin_lock(&alias->d_lock); + + /* Is this a mountpoint that we could splice into our tree? */ + if (IS_ROOT(alias)) + goto connect_mountpoint; + + if (alias->d_name.len == dentry->d_name.len && + alias->d_parent == dentry->d_parent && + memcmp(alias->d_name.name, + dentry->d_name.name, + dentry->d_name.len) == 0) + goto replace_with_alias; + + spin_unlock(&alias->d_lock); + + /* Doh! Seem to be aliasing directories for some reason... */ + dput(alias); + } + + /* Add a unique reference */ + actual = __d_instantiate_unique(dentry, inode); + if (!actual) + actual = dentry; + else if (unlikely(!d_unhashed(actual))) + goto shouldnt_be_hashed; + +found_lock: + spin_lock(&actual->d_lock); +found: + _d_rehash(actual); + spin_unlock(&actual->d_lock); + spin_unlock(&dcache_lock); + + if (actual == dentry) { + security_d_instantiate(dentry, inode); + return NULL; + } + + iput(inode); + return actual; + + /* Convert the anonymous/root alias into an ordinary dentry */ +connect_mountpoint: + __d_materialise_dentry(dentry, alias); + + /* Replace the candidate dentry with the alias in the tree */ +replace_with_alias: + __d_drop(alias); + actual = alias; + goto found; + +shouldnt_be_hashed: + spin_unlock(&dcache_lock); + BUG(); + goto shouldnt_be_hashed; +} + /** * d_path - return the path of a dentry * @dentry: dentry to report @@ -1784,6 +1919,7 @@ EXPORT_SYMBOL(d_instantiate); EXPORT_SYMBOL(d_invalidate); EXPORT_SYMBOL(d_lookup); EXPORT_SYMBOL(d_move); +EXPORT_SYMBOL_GPL(d_materialise_unique); EXPORT_SYMBOL(d_path); EXPORT_SYMBOL(d_prune_aliases); EXPORT_SYMBOL(d_rehash); diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 39640fd03458..bf3901ab1744 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -32,8 +32,8 @@ static ssize_t default_write_file(struct file *file, const char __user *buf, static int default_open(struct inode *inode, struct file *file) { - if (inode->u.generic_ip) - file->private_data = inode->u.generic_ip; + if (inode->i_private) + file->private_data = inode->i_private; return 0; } @@ -55,12 +55,11 @@ static u64 debugfs_u8_get(void *data) DEFINE_SIMPLE_ATTRIBUTE(fops_u8, debugfs_u8_get, debugfs_u8_set, "%llu\n"); /** - * debugfs_create_u8 - create a file in the debugfs filesystem that is used to read and write an unsigned 8 bit value. - * + * debugfs_create_u8 - create a debugfs file that is used to read and write an unsigned 8-bit value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a - * directory dentry if set. If this paramater is NULL, then the + * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. @@ -72,11 +71,11 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u8, debugfs_u8_get, debugfs_u8_set, "%llu\n"); * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, NULL will be returned. + * you are responsible here.) If an error occurs, %NULL will be returned. * - * If debugfs is not enabled in the kernel, the value -ENODEV will be + * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned. It is not wise to check for this value, but rather, check for - * NULL or !NULL instead as to eliminate the need for #ifdef in the calling + * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling * code. */ struct dentry *debugfs_create_u8(const char *name, mode_t mode, @@ -97,12 +96,11 @@ static u64 debugfs_u16_get(void *data) DEFINE_SIMPLE_ATTRIBUTE(fops_u16, debugfs_u16_get, debugfs_u16_set, "%llu\n"); /** - * debugfs_create_u16 - create a file in the debugfs filesystem that is used to read and write an unsigned 16 bit value. - * + * debugfs_create_u16 - create a debugfs file that is used to read and write an unsigned 16-bit value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a - * directory dentry if set. If this paramater is NULL, then the + * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. @@ -114,11 +112,11 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u16, debugfs_u16_get, debugfs_u16_set, "%llu\n"); * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, NULL will be returned. + * you are responsible here.) If an error occurs, %NULL will be returned. * - * If debugfs is not enabled in the kernel, the value -ENODEV will be + * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned. It is not wise to check for this value, but rather, check for - * NULL or !NULL instead as to eliminate the need for #ifdef in the calling + * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling * code. */ struct dentry *debugfs_create_u16(const char *name, mode_t mode, @@ -139,12 +137,11 @@ static u64 debugfs_u32_get(void *data) DEFINE_SIMPLE_ATTRIBUTE(fops_u32, debugfs_u32_get, debugfs_u32_set, "%llu\n"); /** - * debugfs_create_u32 - create a file in the debugfs filesystem that is used to read and write an unsigned 32 bit value. - * + * debugfs_create_u32 - create a debugfs file that is used to read and write an unsigned 32-bit value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a - * directory dentry if set. If this paramater is NULL, then the + * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. @@ -156,11 +153,11 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u32, debugfs_u32_get, debugfs_u32_set, "%llu\n"); * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, NULL will be returned. + * you are responsible here.) If an error occurs, %NULL will be returned. * - * If debugfs is not enabled in the kernel, the value -ENODEV will be + * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned. It is not wise to check for this value, but rather, check for - * NULL or !NULL instead as to eliminate the need for #ifdef in the calling + * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling * code. */ struct dentry *debugfs_create_u32(const char *name, mode_t mode, @@ -219,12 +216,11 @@ static const struct file_operations fops_bool = { }; /** - * debugfs_create_bool - create a file in the debugfs filesystem that is used to read and write a boolean value. - * + * debugfs_create_bool - create a debugfs file that is used to read and write a boolean value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a - * directory dentry if set. If this paramater is NULL, then the + * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. @@ -236,11 +232,11 @@ static const struct file_operations fops_bool = { * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, NULL will be returned. + * you are responsible here.) If an error occurs, %NULL will be returned. * - * If debugfs is not enabled in the kernel, the value -ENODEV will be + * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned. It is not wise to check for this value, but rather, check for - * NULL or !NULL instead as to eliminate the need for #ifdef in the calling + * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling * code. */ struct dentry *debugfs_create_bool(const char *name, mode_t mode, @@ -264,13 +260,11 @@ static struct file_operations fops_blob = { }; /** - * debugfs_create_blob - create a file in the debugfs filesystem that is - * used to read and write a binary blob. - * + * debugfs_create_blob - create a debugfs file that is used to read and write a binary blob * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a - * directory dentry if set. If this paramater is NULL, then the + * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @blob: a pointer to a struct debugfs_blob_wrapper which contains a pointer * to the blob data and the size of the data. @@ -282,11 +276,11 @@ static struct file_operations fops_blob = { * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, NULL will be returned. + * you are responsible here.) If an error occurs, %NULL will be returned. * - * If debugfs is not enabled in the kernel, the value -ENODEV will be + * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned. It is not wise to check for this value, but rather, check for - * NULL or !NULL instead as to eliminate the need for #ifdef in the calling + * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling * code. */ struct dentry *debugfs_create_blob(const char *name, mode_t mode, diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index e8ae3042b806..269e649e6dc6 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -40,7 +40,6 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d inode->i_mode = mode; inode->i_uid = 0; inode->i_gid = 0; - inode->i_blksize = PAGE_CACHE_SIZE; inode->i_blocks = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; switch (mode & S_IFMT) { @@ -162,14 +161,13 @@ static int debugfs_create_by_name(const char *name, mode_t mode, /** * debugfs_create_file - create a file in the debugfs filesystem - * * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this paramater is NULL, then the * file will be created in the root of the debugfs filesystem. * @data: a pointer to something that the caller will want to get to later - * on. The inode.u.generic_ip pointer will point to this value on + * on. The inode.i_private pointer will point to this value on * the open() call. * @fops: a pointer to a struct file_operations that should be used for * this file. @@ -182,11 +180,11 @@ static int debugfs_create_by_name(const char *name, mode_t mode, * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, NULL will be returned. + * you are responsible here.) If an error occurs, %NULL will be returned. * - * If debugfs is not enabled in the kernel, the value -ENODEV will be + * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned. It is not wise to check for this value, but rather, check for - * NULL or !NULL instead as to eliminate the need for #ifdef in the calling + * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling * code. */ struct dentry *debugfs_create_file(const char *name, mode_t mode, @@ -210,7 +208,7 @@ struct dentry *debugfs_create_file(const char *name, mode_t mode, if (dentry->d_inode) { if (data) - dentry->d_inode->u.generic_ip = data; + dentry->d_inode->i_private = data; if (fops) dentry->d_inode->i_fop = fops; } @@ -221,7 +219,6 @@ EXPORT_SYMBOL_GPL(debugfs_create_file); /** * debugfs_create_dir - create a directory in the debugfs filesystem - * * @name: a pointer to a string containing the name of the directory to * create. * @parent: a pointer to the parent dentry for this file. This should be a @@ -233,11 +230,11 @@ EXPORT_SYMBOL_GPL(debugfs_create_file); * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, NULL will be returned. + * you are responsible here.) If an error occurs, %NULL will be returned. * - * If debugfs is not enabled in the kernel, the value -ENODEV will be + * If debugfs is not enabled in the kernel, the value -%ENODEV will be * returned. It is not wise to check for this value, but rather, check for - * NULL or !NULL instead as to eliminate the need for #ifdef in the calling + * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling * code. */ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent) @@ -250,7 +247,6 @@ EXPORT_SYMBOL_GPL(debugfs_create_dir); /** * debugfs_remove - removes a file or directory from the debugfs filesystem - * * @dentry: a pointer to a the dentry of the file or directory to be * removed. * diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index f7aef5bb584a..5f7b5a6025bf 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -113,7 +113,6 @@ devpts_fill_super(struct super_block *s, void *data, int silent) inode->i_ino = 1; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; inode->i_blocks = 0; - inode->i_blksize = 1024; inode->i_uid = inode->i_gid = 0; inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR; inode->i_op = &simple_dir_inode_operations; @@ -172,12 +171,11 @@ int devpts_pty_new(struct tty_struct *tty) return -ENOMEM; inode->i_ino = number+2; - inode->i_blksize = 1024; inode->i_uid = config.setuid ? config.uid : current->fsuid; inode->i_gid = config.setgid ? config.gid : current->fsgid; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; init_special_inode(inode, S_IFCHR|config.mode, device); - inode->u.generic_ip = tty; + inode->i_private = tty; dentry = get_node(number); if (!IS_ERR(dentry) && !dentry->d_inode) @@ -196,7 +194,7 @@ struct tty_struct *devpts_get_tty(int number) tty = NULL; if (!IS_ERR(dentry)) { if (dentry->d_inode) - tty = dentry->d_inode->u.generic_ip; + tty = dentry->d_inode->i_private; dput(dentry); } diff --git a/fs/dquot.c b/fs/dquot.c index 0122a279106a..9af789567e51 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -834,6 +834,9 @@ static void print_warning(struct dquot *dquot, const char warntype) if (!need_print_warning(dquot) || (flag && test_and_set_bit(flag, &dquot->dq_flags))) return; + mutex_lock(&tty_mutex); + if (!current->signal->tty) + goto out_lock; tty_write_message(current->signal->tty, dquot->dq_sb->s_id); if (warntype == ISOFTWARN || warntype == BSOFTWARN) tty_write_message(current->signal->tty, ": warning, "); @@ -861,6 +864,8 @@ static void print_warning(struct dquot *dquot, const char warntype) break; } tty_write_message(current->signal->tty, msg); +out_lock: + mutex_unlock(&tty_mutex); } static inline void flush_warnings(struct dquot **dquots, char *warntype) diff --git a/fs/efs/super.c b/fs/efs/super.c index 8ac2462ae5dd..b3f50651eb6b 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -90,8 +90,7 @@ static int init_inodecache(void) static void destroy_inodecache(void) { - if (kmem_cache_destroy(efs_inode_cachep)) - printk(KERN_INFO "efs_inode_cache: not all structures were freed\n"); + kmem_cache_destroy(efs_inode_cachep); } static void efs_put_super(struct super_block *s) @@ -248,11 +247,10 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) struct buffer_head *bh; struct inode *root; - sb = kmalloc(sizeof(struct efs_sb_info), GFP_KERNEL); + sb = kzalloc(sizeof(struct efs_sb_info), GFP_KERNEL); if (!sb) return -ENOMEM; s->s_fs_info = sb; - memset(sb, 0, sizeof(struct efs_sb_info)); s->s_magic = EFS_SUPER_MAGIC; if (!sb_set_blocksize(s, EFS_BLOCKSIZE)) { diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 3a3567433b92..8d544334bcd2 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1590,7 +1590,6 @@ static struct inode *ep_eventpoll_inode(void) inode->i_uid = current->fsuid; inode->i_gid = current->fsgid; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; - inode->i_blksize = PAGE_SIZE; return inode; eexit_1: diff --git a/fs/exec.c b/fs/exec.c index 54135df2a966..a8efe35176b0 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -595,7 +595,7 @@ static int de_thread(struct task_struct *tsk) if (!newsighand) return -ENOMEM; - if (thread_group_empty(current)) + if (thread_group_empty(tsk)) goto no_thread_group; /* @@ -620,17 +620,17 @@ static int de_thread(struct task_struct *tsk) * Reparenting needs write_lock on tasklist_lock, * so it is safe to do it under read_lock. */ - if (unlikely(current->group_leader == child_reaper)) - child_reaper = current; + if (unlikely(tsk->group_leader == child_reaper)) + child_reaper = tsk; - zap_other_threads(current); + zap_other_threads(tsk); read_unlock(&tasklist_lock); /* * Account for the thread group leader hanging around: */ count = 1; - if (!thread_group_leader(current)) { + if (!thread_group_leader(tsk)) { count = 2; /* * The SIGALRM timer survives the exec, but needs to point @@ -639,14 +639,14 @@ static int de_thread(struct task_struct *tsk) * synchronize with any firing (by calling del_timer_sync) * before we can safely let the old group leader die. */ - sig->tsk = current; + sig->tsk = tsk; spin_unlock_irq(lock); if (hrtimer_cancel(&sig->real_timer)) hrtimer_restart(&sig->real_timer); spin_lock_irq(lock); } while (atomic_read(&sig->count) > count) { - sig->group_exit_task = current; + sig->group_exit_task = tsk; sig->notify_count = count; __set_current_state(TASK_UNINTERRUPTIBLE); spin_unlock_irq(lock); @@ -662,13 +662,13 @@ static int de_thread(struct task_struct *tsk) * do is to wait for the thread group leader to become inactive, * and to assume its PID: */ - if (!thread_group_leader(current)) { + if (!thread_group_leader(tsk)) { /* * Wait for the thread group leader to be a zombie. * It should already be zombie at this point, most * of the time. */ - leader = current->group_leader; + leader = tsk->group_leader; while (leader->exit_state != EXIT_ZOMBIE) yield(); @@ -682,12 +682,12 @@ static int de_thread(struct task_struct *tsk) * When we take on its identity by switching to its PID, we * also take its birthdate (always earlier than our own). */ - current->start_time = leader->start_time; + tsk->start_time = leader->start_time; write_lock_irq(&tasklist_lock); - BUG_ON(leader->tgid != current->tgid); - BUG_ON(current->pid == current->tgid); + BUG_ON(leader->tgid != tsk->tgid); + BUG_ON(tsk->pid == tsk->tgid); /* * An exec() starts a new thread group with the * TGID of the previous thread group. Rehash the @@ -696,24 +696,21 @@ static int de_thread(struct task_struct *tsk) */ /* Become a process group leader with the old leader's pid. - * Note: The old leader also uses thispid until release_task + * The old leader becomes a thread of the this thread group. + * Note: The old leader also uses this pid until release_task * is called. Odd but simple and correct. */ - detach_pid(current, PIDTYPE_PID); - current->pid = leader->pid; - attach_pid(current, PIDTYPE_PID, current->pid); - attach_pid(current, PIDTYPE_PGID, current->signal->pgrp); - attach_pid(current, PIDTYPE_SID, current->signal->session); - list_replace_rcu(&leader->tasks, ¤t->tasks); + detach_pid(tsk, PIDTYPE_PID); + tsk->pid = leader->pid; + attach_pid(tsk, PIDTYPE_PID, tsk->pid); + transfer_pid(leader, tsk, PIDTYPE_PGID); + transfer_pid(leader, tsk, PIDTYPE_SID); + list_replace_rcu(&leader->tasks, &tsk->tasks); - current->group_leader = current; - leader->group_leader = current; + tsk->group_leader = tsk; + leader->group_leader = tsk; - /* Reduce leader to a thread */ - detach_pid(leader, PIDTYPE_PGID); - detach_pid(leader, PIDTYPE_SID); - - current->exit_signal = SIGCHLD; + tsk->exit_signal = SIGCHLD; BUG_ON(leader->exit_state != EXIT_ZOMBIE); leader->exit_state = EXIT_DEAD; @@ -753,7 +750,7 @@ no_thread_group: spin_lock(&oldsighand->siglock); spin_lock_nested(&newsighand->siglock, SINGLE_DEPTH_NESTING); - rcu_assign_pointer(current->sighand, newsighand); + rcu_assign_pointer(tsk->sighand, newsighand); recalc_sigpending(); spin_unlock(&newsighand->siglock); @@ -764,7 +761,7 @@ no_thread_group: kmem_cache_free(sighand_cachep, oldsighand); } - BUG_ON(!thread_group_leader(current)); + BUG_ON(!thread_group_leader(tsk)); return 0; } @@ -901,8 +898,7 @@ int flush_old_exec(struct linux_binprm * bprm) return 0; mmap_failed: - put_files_struct(current->files); - current->files = files; + reset_files_struct(current, files); out: return retval; } diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index da52b4a5db64..7c420b800c34 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -89,8 +89,8 @@ ext2_acl_to_disk(const struct posix_acl *acl, size_t *size) size_t n; *size = ext2_acl_size(acl->a_count); - ext_acl = (ext2_acl_header *)kmalloc(sizeof(ext2_acl_header) + - acl->a_count * sizeof(ext2_acl_entry), GFP_KERNEL); + ext_acl = kmalloc(sizeof(ext2_acl_header) + acl->a_count * + sizeof(ext2_acl_entry), GFP_KERNEL); if (!ext_acl) return ERR_PTR(-ENOMEM); ext_acl->a_version = cpu_to_le32(EXT2_ACL_VERSION); diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 695f69ccf908..2cb545bf0f3c 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -574,7 +574,6 @@ got: inode->i_mode = mode; inode->i_ino = ino; - inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size (for stat), not the fs block size */ inode->i_blocks = 0; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; memset(ei->i_data, 0, sizeof(ei->i_data)); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index fb4d3220eb8d..dd4e14c221e0 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1094,7 +1094,6 @@ void ext2_read_inode (struct inode * inode) brelse (bh); goto bad_inode; } - inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size (for stat), not the fs block size */ inode->i_blocks = le32_to_cpu(raw_inode->i_blocks); ei->i_flags = le32_to_cpu(raw_inode->i_flags); ei->i_faddr = le32_to_cpu(raw_inode->i_faddr); diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 4286ff6330b6..513cd421ac0b 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -184,8 +184,7 @@ static int init_inodecache(void) static void destroy_inodecache(void) { - if (kmem_cache_destroy(ext2_inode_cachep)) - printk(KERN_INFO "ext2_inode_cache: not all structures were freed\n"); + kmem_cache_destroy(ext2_inode_cachep); } static void ext2_clear_inode(struct inode *inode) @@ -544,17 +543,24 @@ static int ext2_check_descriptors (struct super_block * sb) int i; int desc_block = 0; struct ext2_sb_info *sbi = EXT2_SB(sb); - unsigned long block = le32_to_cpu(sbi->s_es->s_first_data_block); + unsigned long first_block = le32_to_cpu(sbi->s_es->s_first_data_block); + unsigned long last_block; struct ext2_group_desc * gdp = NULL; ext2_debug ("Checking group descriptors"); for (i = 0; i < sbi->s_groups_count; i++) { + if (i == sbi->s_groups_count - 1) + last_block = le32_to_cpu(sbi->s_es->s_blocks_count) - 1; + else + last_block = first_block + + (EXT2_BLOCKS_PER_GROUP(sb) - 1); + if ((i % EXT2_DESC_PER_BLOCK(sb)) == 0) gdp = (struct ext2_group_desc *) sbi->s_group_desc[desc_block++]->b_data; - if (le32_to_cpu(gdp->bg_block_bitmap) < block || - le32_to_cpu(gdp->bg_block_bitmap) >= block + EXT2_BLOCKS_PER_GROUP(sb)) + if (le32_to_cpu(gdp->bg_block_bitmap) < first_block || + le32_to_cpu(gdp->bg_block_bitmap) > last_block) { ext2_error (sb, "ext2_check_descriptors", "Block bitmap for group %d" @@ -562,8 +568,8 @@ static int ext2_check_descriptors (struct super_block * sb) i, (unsigned long) le32_to_cpu(gdp->bg_block_bitmap)); return 0; } - if (le32_to_cpu(gdp->bg_inode_bitmap) < block || - le32_to_cpu(gdp->bg_inode_bitmap) >= block + EXT2_BLOCKS_PER_GROUP(sb)) + if (le32_to_cpu(gdp->bg_inode_bitmap) < first_block || + le32_to_cpu(gdp->bg_inode_bitmap) > last_block) { ext2_error (sb, "ext2_check_descriptors", "Inode bitmap for group %d" @@ -571,9 +577,9 @@ static int ext2_check_descriptors (struct super_block * sb) i, (unsigned long) le32_to_cpu(gdp->bg_inode_bitmap)); return 0; } - if (le32_to_cpu(gdp->bg_inode_table) < block || - le32_to_cpu(gdp->bg_inode_table) + sbi->s_itb_per_group >= - block + EXT2_BLOCKS_PER_GROUP(sb)) + if (le32_to_cpu(gdp->bg_inode_table) < first_block || + le32_to_cpu(gdp->bg_inode_table) + sbi->s_itb_per_group > + last_block) { ext2_error (sb, "ext2_check_descriptors", "Inode table for group %d" @@ -581,7 +587,7 @@ static int ext2_check_descriptors (struct super_block * sb) i, (unsigned long) le32_to_cpu(gdp->bg_inode_table)); return 0; } - block += EXT2_BLOCKS_PER_GROUP(sb); + first_block += EXT2_BLOCKS_PER_GROUP(sb); gdp++; } return 1; @@ -648,11 +654,10 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) int i, j; __le32 features; - sbi = kmalloc(sizeof(*sbi), GFP_KERNEL); + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) return -ENOMEM; sb->s_fs_info = sbi; - memset(sbi, 0, sizeof(*sbi)); /* * See what the current blocksize for the device is, and @@ -861,10 +866,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) if (EXT2_BLOCKS_PER_GROUP(sb) == 0) goto cantfind_ext2; - sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) - - le32_to_cpu(es->s_first_data_block) + - EXT2_BLOCKS_PER_GROUP(sb) - 1) / - EXT2_BLOCKS_PER_GROUP(sb); + sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) - + le32_to_cpu(es->s_first_data_block) - 1) + / EXT2_BLOCKS_PER_GROUP(sb)) + 1; db_count = (sbi->s_groups_count + EXT2_DESC_PER_BLOCK(sb) - 1) / EXT2_DESC_PER_BLOCK(sb); sbi->s_group_desc = kmalloc (db_count * sizeof (struct buffer_head *), GFP_KERNEL); diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 86ae8e93adb9..af52a7f8b291 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -521,11 +521,10 @@ bad_block: ext2_error(sb, "ext2_xattr_set", } } else { /* Allocate a buffer where we construct the new block. */ - header = kmalloc(sb->s_blocksize, GFP_KERNEL); + header = kzalloc(sb->s_blocksize, GFP_KERNEL); error = -ENOMEM; if (header == NULL) goto cleanup; - memset(header, 0, sb->s_blocksize); end = (char *)header + sb->s_blocksize; header->h_magic = cpu_to_le32(EXT2_XATTR_MAGIC); header->h_blocks = header->h_refcount = cpu_to_le32(1); diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index 0d21d558b87a..1e5038d9a01b 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -90,8 +90,8 @@ ext3_acl_to_disk(const struct posix_acl *acl, size_t *size) size_t n; *size = ext3_acl_size(acl->a_count); - ext_acl = (ext3_acl_header *)kmalloc(sizeof(ext3_acl_header) + - acl->a_count * sizeof(ext3_acl_entry), GFP_KERNEL); + ext_acl = kmalloc(sizeof(ext3_acl_header) + acl->a_count * + sizeof(ext3_acl_entry), GFP_KERNEL); if (!ext_acl) return ERR_PTR(-ENOMEM); ext_acl->a_version = cpu_to_le32(EXT3_ACL_VERSION); @@ -258,7 +258,7 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type, default: return -EINVAL; } - if (acl) { + if (acl) { value = ext3_acl_to_disk(acl, &size); if (IS_ERR(value)) return (int)PTR_ERR(value); diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index 063d994bda0b..b41a7d7e20f0 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -38,6 +38,13 @@ #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) +/** + * ext3_get_group_desc() -- load group descriptor from disk + * @sb: super block + * @block_group: given block group + * @bh: pointer to the buffer head to store the block + * group descriptor + */ struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb, unsigned int block_group, struct buffer_head ** bh) @@ -73,8 +80,12 @@ struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb, return desc + offset; } -/* - * Read the bitmap for a given block_group, reading into the specified +/** + * read_block_bitmap() + * @sb: super block + * @block_group: given block group + * + * Read the bitmap for a given block_group, reading into the specified * slot in the superblock's bitmap cache. * * Return buffer_head on success or NULL in case of failure. @@ -103,15 +114,22 @@ error_out: * Operations include: * dump, find, add, remove, is_empty, find_next_reservable_window, etc. * - * We use sorted double linked list for the per-filesystem reservation - * window list. (like in vm_region). + * We use a red-black tree to represent per-filesystem reservation + * windows. + * + */ + +/** + * __rsv_window_dump() -- Dump the filesystem block allocation reservation map + * @rb_root: root of per-filesystem reservation rb tree + * @verbose: verbose mode + * @fn: function which wishes to dump the reservation map * - * Initially, we keep those small operations in the abstract functions, - * so later if we need a better searching tree than double linked-list, - * we could easily switch to that without changing too much - * code. + * If verbose is turned on, it will print the whole block reservation + * windows(start, end). Otherwise, it will only print out the "bad" windows, + * those windows that overlap with their immediate neighbors. */ -#if 0 +#if 1 static void __rsv_window_dump(struct rb_root *root, int verbose, const char *fn) { @@ -129,7 +147,7 @@ restart: rsv = list_entry(n, struct ext3_reserve_window_node, rsv_node); if (verbose) printk("reservation window 0x%p " - "start: %d, end: %d\n", + "start: %lu, end: %lu\n", rsv, rsv->rsv_start, rsv->rsv_end); if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) { printk("Bad reservation %p (start >= end)\n", @@ -161,6 +179,22 @@ restart: #define rsv_window_dump(root, verbose) do {} while (0) #endif +/** + * goal_in_my_reservation() + * @rsv: inode's reservation window + * @grp_goal: given goal block relative to the allocation block group + * @group: the current allocation block group + * @sb: filesystem super block + * + * Test if the given goal block (group relative) is within the file's + * own block reservation window range. + * + * If the reservation window is outside the goal allocation group, return 0; + * grp_goal (given goal block) could be -1, which means no specific + * goal block. In this case, always return 1. + * If the goal block is within the reservation window, return 1; + * otherwise, return 0; + */ static int goal_in_my_reservation(struct ext3_reserve_window *rsv, ext3_grpblk_t grp_goal, unsigned int group, struct super_block * sb) @@ -168,7 +202,7 @@ goal_in_my_reservation(struct ext3_reserve_window *rsv, ext3_grpblk_t grp_goal, ext3_fsblk_t group_first_block, group_last_block; group_first_block = ext3_group_first_block_no(sb, group); - group_last_block = group_first_block + EXT3_BLOCKS_PER_GROUP(sb) - 1; + group_last_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1); if ((rsv->_rsv_start > group_last_block) || (rsv->_rsv_end < group_first_block)) @@ -179,7 +213,11 @@ goal_in_my_reservation(struct ext3_reserve_window *rsv, ext3_grpblk_t grp_goal, return 1; } -/* +/** + * search_reserve_window() + * @rb_root: root of reservation tree + * @goal: target allocation block + * * Find the reserved window which includes the goal, or the previous one * if the goal is not in any window. * Returns NULL if there are no windows or if all windows start after the goal. @@ -216,6 +254,13 @@ search_reserve_window(struct rb_root *root, ext3_fsblk_t goal) return rsv; } +/** + * ext3_rsv_window_add() -- Insert a window to the block reservation rb tree. + * @sb: super block + * @rsv: reservation window to add + * + * Must be called with rsv_lock hold. + */ void ext3_rsv_window_add(struct super_block *sb, struct ext3_reserve_window_node *rsv) { @@ -236,14 +281,25 @@ void ext3_rsv_window_add(struct super_block *sb, p = &(*p)->rb_left; else if (start > this->rsv_end) p = &(*p)->rb_right; - else + else { + rsv_window_dump(root, 1); BUG(); + } } rb_link_node(node, parent, p); rb_insert_color(node, root); } +/** + * ext3_rsv_window_remove() -- unlink a window from the reservation rb tree + * @sb: super block + * @rsv: reservation window to remove + * + * Mark the block reservation window as not allocated, and unlink it + * from the filesystem reservation window rb tree. Must be called with + * rsv_lock hold. + */ static void rsv_window_remove(struct super_block *sb, struct ext3_reserve_window_node *rsv) { @@ -253,11 +309,39 @@ static void rsv_window_remove(struct super_block *sb, rb_erase(&rsv->rsv_node, &EXT3_SB(sb)->s_rsv_window_root); } +/* + * rsv_is_empty() -- Check if the reservation window is allocated. + * @rsv: given reservation window to check + * + * returns 1 if the end block is EXT3_RESERVE_WINDOW_NOT_ALLOCATED. + */ static inline int rsv_is_empty(struct ext3_reserve_window *rsv) { /* a valid reservation end block could not be 0 */ - return (rsv->_rsv_end == EXT3_RESERVE_WINDOW_NOT_ALLOCATED); + return rsv->_rsv_end == EXT3_RESERVE_WINDOW_NOT_ALLOCATED; } + +/** + * ext3_init_block_alloc_info() + * @inode: file inode structure + * + * Allocate and initialize the reservation window structure, and + * link the window to the ext3 inode structure at last + * + * The reservation window structure is only dynamically allocated + * and linked to ext3 inode the first time the open file + * needs a new block. So, before every ext3_new_block(s) call, for + * regular files, we should check whether the reservation window + * structure exists or not. In the latter case, this function is called. + * Fail to do so will result in block reservation being turned off for that + * open file. + * + * This function is called from ext3_get_blocks_handle(), also called + * when setting the reservation window size through ioctl before the file + * is open for write (needs block allocation). + * + * Needs truncate_mutex protection prior to call this function. + */ void ext3_init_block_alloc_info(struct inode *inode) { struct ext3_inode_info *ei = EXT3_I(inode); @@ -271,7 +355,7 @@ void ext3_init_block_alloc_info(struct inode *inode) rsv->rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; rsv->rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED; - /* + /* * if filesystem is mounted with NORESERVATION, the goal * reservation window size is set to zero to indicate * block reservation is off @@ -287,6 +371,19 @@ void ext3_init_block_alloc_info(struct inode *inode) ei->i_block_alloc_info = block_i; } +/** + * ext3_discard_reservation() + * @inode: inode + * + * Discard(free) block reservation window on last file close, or truncate + * or at last iput(). + * + * It is being called in three cases: + * ext3_release_file(): last writer close the file + * ext3_clear_inode(): last iput(), when nobody link to this file. + * ext3_truncate(): when the block indirect map is about to change. + * + */ void ext3_discard_reservation(struct inode *inode) { struct ext3_inode_info *ei = EXT3_I(inode); @@ -306,7 +403,14 @@ void ext3_discard_reservation(struct inode *inode) } } -/* Free given blocks, update quota and i_blocks field */ +/** + * ext3_free_blocks_sb() -- Free given blocks and update quota + * @handle: handle to this transaction + * @sb: super block + * @block: start physcial block to free + * @count: number of blocks to free + * @pdquot_freed_blocks: pointer to quota + */ void ext3_free_blocks_sb(handle_t *handle, struct super_block *sb, ext3_fsblk_t block, unsigned long count, unsigned long *pdquot_freed_blocks) @@ -419,8 +523,8 @@ do_more: } /* @@@ This prevents newly-allocated data from being * freed and then reallocated within the same - * transaction. - * + * transaction. + * * Ideally we would want to allow that to happen, but to * do so requires making journal_forget() capable of * revoking the queued write of a data block, which @@ -433,7 +537,7 @@ do_more: * safe not to set the allocation bit in the committed * bitmap, because we know that there is no outstanding * activity on the buffer any more and so it is safe to - * reallocate it. + * reallocate it. */ BUFFER_TRACE(bitmap_bh, "set in b_committed_data"); J_ASSERT_BH(bitmap_bh, @@ -490,7 +594,13 @@ error_return: return; } -/* Free given blocks, update quota and i_blocks field */ +/** + * ext3_free_blocks() -- Free given blocks and update quota + * @handle: handle for this transaction + * @inode: inode + * @block: start physical block to free + * @count: number of blocks to count + */ void ext3_free_blocks(handle_t *handle, struct inode *inode, ext3_fsblk_t block, unsigned long count) { @@ -508,7 +618,11 @@ void ext3_free_blocks(handle_t *handle, struct inode *inode, return; } -/* +/** + * ext3_test_allocatable() + * @nr: given allocation block group + * @bh: bufferhead contains the bitmap of the given block group + * * For ext3 allocations, we must not reuse any blocks which are * allocated in the bitmap buffer's "last committed data" copy. This * prevents deletes from freeing up the page for reuse until we have @@ -518,7 +632,7 @@ void ext3_free_blocks(handle_t *handle, struct inode *inode, * data would allow the old block to be overwritten before the * transaction committed (because we force data to disk before commit). * This would lead to corruption if we crashed between overwriting the - * data and committing the delete. + * data and committing the delete. * * @@@ We may want to make this allocation behaviour conditional on * data-writes at some point, and disable it for metadata allocations or @@ -541,6 +655,16 @@ static int ext3_test_allocatable(ext3_grpblk_t nr, struct buffer_head *bh) return ret; } +/** + * bitmap_search_next_usable_block() + * @start: the starting block (group relative) of the search + * @bh: bufferhead contains the block group bitmap + * @maxblocks: the ending block (group relative) of the reservation + * + * The bitmap search --- search forward alternately through the actual + * bitmap on disk and the last-committed copy in journal, until we find a + * bit free in both bitmaps. + */ static ext3_grpblk_t bitmap_search_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh, ext3_grpblk_t maxblocks) @@ -548,11 +672,6 @@ bitmap_search_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh, ext3_grpblk_t next; struct journal_head *jh = bh2jh(bh); - /* - * The bitmap search --- search forward alternately through the actual - * bitmap and the last-committed copy until we find a bit free in - * both - */ while (start < maxblocks) { next = ext3_find_next_zero_bit(bh->b_data, maxblocks, start); if (next >= maxblocks) @@ -562,14 +681,20 @@ bitmap_search_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh, jbd_lock_bh_state(bh); if (jh->b_committed_data) start = ext3_find_next_zero_bit(jh->b_committed_data, - maxblocks, next); + maxblocks, next); jbd_unlock_bh_state(bh); } return -1; } -/* - * Find an allocatable block in a bitmap. We honour both the bitmap and +/** + * find_next_usable_block() + * @start: the starting block (group relative) to find next + * allocatable block in bitmap. + * @bh: bufferhead contains the block group bitmap + * @maxblocks: the ending block (group relative) for the search + * + * Find an allocatable block in a bitmap. We honor both the bitmap and * its last-committed copy (if that exists), and perform the "most * appropriate allocation" algorithm of looking for a free block near * the initial goal; then for a free byte somewhere in the bitmap; then @@ -584,7 +709,7 @@ find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh, if (start > 0) { /* - * The goal was occupied; search forward for a free + * The goal was occupied; search forward for a free * block within the next XX blocks. * * end_goal is more or less random, but it has to be @@ -620,7 +745,11 @@ find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh, return here; } -/* +/** + * claim_block() + * @block: the free block (group relative) to allocate + * @bh: the bufferhead containts the block group bitmap + * * We think we can allocate this block in this bitmap. Try to set the bit. * If that succeeds then check that nobody has allocated and then freed the * block since we saw that is was not marked in b_committed_data. If it _was_ @@ -646,7 +775,26 @@ claim_block(spinlock_t *lock, ext3_grpblk_t block, struct buffer_head *bh) return ret; } -/* +/** + * ext3_try_to_allocate() + * @sb: superblock + * @handle: handle to this transaction + * @group: given allocation block group + * @bitmap_bh: bufferhead holds the block bitmap + * @grp_goal: given target block within the group + * @count: target number of blocks to allocate + * @my_rsv: reservation window + * + * Attempt to allocate blocks within a give range. Set the range of allocation + * first, then find the first free bit(s) from the bitmap (within the range), + * and at last, allocate the blocks by claiming the found free bit as allocated. + * + * To set the range of this allocation: + * if there is a reservation window, only try to allocate block(s) from the + * file's own reservation window; + * Otherwise, the allocation range starts from the give goal block, ends at + * the block group's last block. + * * If we failed to allocate the desired block then we may end up crossing to a * new bitmap. In that case we must release write access to the old one via * ext3_journal_release_buffer(), else we'll run out of credits. @@ -703,7 +851,8 @@ repeat: } start = grp_goal; - if (!claim_block(sb_bgl_lock(EXT3_SB(sb), group), grp_goal, bitmap_bh)) { + if (!claim_block(sb_bgl_lock(EXT3_SB(sb), group), + grp_goal, bitmap_bh)) { /* * The block was allocated by another thread, or it was * allocated and then freed by another thread @@ -718,7 +867,8 @@ repeat: grp_goal++; while (num < *count && grp_goal < end && ext3_test_allocatable(grp_goal, bitmap_bh) - && claim_block(sb_bgl_lock(EXT3_SB(sb), group), grp_goal, bitmap_bh)) { + && claim_block(sb_bgl_lock(EXT3_SB(sb), group), + grp_goal, bitmap_bh)) { num++; grp_goal++; } @@ -730,12 +880,12 @@ fail_access: } /** - * find_next_reservable_window(): + * find_next_reservable_window(): * find a reservable space within the given range. * It does not allocate the reservation window for now: * alloc_new_reservation() will do the work later. * - * @search_head: the head of the searching list; + * @search_head: the head of the searching list; * This is not necessarily the list head of the whole filesystem * * We have both head and start_block to assist the search @@ -743,12 +893,12 @@ fail_access: * but we will shift to the place where start_block is, * then start from there, when looking for a reservable space. * - * @size: the target new reservation window size + * @size: the target new reservation window size * - * @group_first_block: the first block we consider to start + * @group_first_block: the first block we consider to start * the real search from * - * @last_block: + * @last_block: * the maximum block number that our goal reservable space * could start from. This is normally the last block in this * group. The search will end when we found the start of next @@ -756,10 +906,10 @@ fail_access: * This could handle the cross boundary reservation window * request. * - * basically we search from the given range, rather than the whole - * reservation double linked list, (start_block, last_block) - * to find a free region that is of my size and has not - * been reserved. + * basically we search from the given range, rather than the whole + * reservation double linked list, (start_block, last_block) + * to find a free region that is of my size and has not + * been reserved. * */ static int find_next_reservable_window( @@ -812,7 +962,7 @@ static int find_next_reservable_window( /* * Found a reserveable space big enough. We could * have a reservation across the group boundary here - */ + */ break; } } @@ -848,7 +998,7 @@ static int find_next_reservable_window( } /** - * alloc_new_reservation()--allocate a new reservation window + * alloc_new_reservation()--allocate a new reservation window * * To make a new reservation, we search part of the filesystem * reservation list (the list that inside the group). We try to @@ -897,7 +1047,7 @@ static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv, spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock; group_first_block = ext3_group_first_block_no(sb, group); - group_end_block = group_first_block + EXT3_BLOCKS_PER_GROUP(sb) - 1; + group_end_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1); if (grp_goal < 0) start_block = group_first_block; @@ -929,9 +1079,10 @@ static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv, if ((my_rsv->rsv_alloc_hit > (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) { /* - * if we previously allocation hit ration is greater than half - * we double the size of reservation window next time - * otherwise keep the same + * if the previously allocation hit ratio is + * greater than 1/2, then we double the size of + * the reservation window the next time, + * otherwise we keep the same size window */ size = size * 2; if (size > EXT3_MAX_RESERVE_BLOCKS) @@ -1010,6 +1161,23 @@ retry: goto retry; } +/** + * try_to_extend_reservation() + * @my_rsv: given reservation window + * @sb: super block + * @size: the delta to extend + * + * Attempt to expand the reservation window large enough to have + * required number of free blocks + * + * Since ext3_try_to_allocate() will always allocate blocks within + * the reservation window range, if the window size is too small, + * multiple blocks allocation has to stop at the end of the reservation + * window. To make this more efficient, given the total number of + * blocks needed and the current size of the window, we try to + * expand the reservation window size if necessary on a best-effort + * basis before ext3_new_blocks() tries to allocate blocks, + */ static void try_to_extend_reservation(struct ext3_reserve_window_node *my_rsv, struct super_block *sb, int size) { @@ -1035,7 +1203,17 @@ static void try_to_extend_reservation(struct ext3_reserve_window_node *my_rsv, spin_unlock(rsv_lock); } -/* +/** + * ext3_try_to_allocate_with_rsv() + * @sb: superblock + * @handle: handle to this transaction + * @group: given allocation block group + * @bitmap_bh: bufferhead holds the block bitmap + * @grp_goal: given target block within the group + * @count: target number of blocks to allocate + * @my_rsv: reservation window + * @errp: pointer to store the error code + * * This is the main function used to allocate a new block and its reservation * window. * @@ -1051,9 +1229,7 @@ static void try_to_extend_reservation(struct ext3_reserve_window_node *my_rsv, * reservation), and there are lots of free blocks, but they are all * being reserved. * - * We use a sorted double linked list for the per-filesystem reservation list. - * The insert, remove and find a free space(non-reserved) operations for the - * sorted double linked list should be fast. + * We use a red-black tree for the per-filesystem reservation list. * */ static ext3_grpblk_t @@ -1063,7 +1239,7 @@ ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle, struct ext3_reserve_window_node * my_rsv, unsigned long *count, int *errp) { - ext3_fsblk_t group_first_block; + ext3_fsblk_t group_first_block, group_last_block; ext3_grpblk_t ret = 0; int fatal; unsigned long num = *count; @@ -1100,6 +1276,7 @@ ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle, * first block is the block number of the first block in this group */ group_first_block = ext3_group_first_block_no(sb, group); + group_last_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1); /* * Basically we will allocate a new block from inode's reservation @@ -1118,7 +1295,8 @@ ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle, */ while (1) { if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) || - !goal_in_my_reservation(&my_rsv->rsv_window, grp_goal, group, sb)) { + !goal_in_my_reservation(&my_rsv->rsv_window, + grp_goal, group, sb)) { if (my_rsv->rsv_goal_size < *count) my_rsv->rsv_goal_size = *count; ret = alloc_new_reservation(my_rsv, grp_goal, sb, @@ -1126,17 +1304,21 @@ ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle, if (ret < 0) break; /* failed */ - if (!goal_in_my_reservation(&my_rsv->rsv_window, grp_goal, group, sb)) + if (!goal_in_my_reservation(&my_rsv->rsv_window, + grp_goal, group, sb)) grp_goal = -1; - } else if (grp_goal > 0 && (my_rsv->rsv_end-grp_goal+1) < *count) + } else if (grp_goal > 0 && + (my_rsv->rsv_end-grp_goal+1) < *count) try_to_extend_reservation(my_rsv, sb, *count-my_rsv->rsv_end + grp_goal - 1); - if ((my_rsv->rsv_start >= group_first_block + EXT3_BLOCKS_PER_GROUP(sb)) - || (my_rsv->rsv_end < group_first_block)) + if ((my_rsv->rsv_start > group_last_block) || + (my_rsv->rsv_end < group_first_block)) { + rsv_window_dump(&EXT3_SB(sb)->s_rsv_window_root, 1); BUG(); - ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, grp_goal, - &num, &my_rsv->rsv_window); + } + ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, + grp_goal, &num, &my_rsv->rsv_window); if (ret >= 0) { my_rsv->rsv_alloc_hit += num; *count = num; @@ -1161,6 +1343,12 @@ out: return ret; } +/** + * ext3_has_free_blocks() + * @sbi: in-core super block structure. + * + * Check if filesystem has at least 1 free block available for allocation. + */ static int ext3_has_free_blocks(struct ext3_sb_info *sbi) { ext3_fsblk_t free_blocks, root_blocks; @@ -1175,11 +1363,17 @@ static int ext3_has_free_blocks(struct ext3_sb_info *sbi) return 1; } -/* +/** + * ext3_should_retry_alloc() + * @sb: super block + * @retries number of attemps has been made + * * ext3_should_retry_alloc() is called when ENOSPC is returned, and if * it is profitable to retry the operation, this function will wait * for the current or commiting transaction to complete, and then * return TRUE. + * + * if the total number of retries exceed three times, return FALSE. */ int ext3_should_retry_alloc(struct super_block *sb, int *retries) { @@ -1191,13 +1385,19 @@ int ext3_should_retry_alloc(struct super_block *sb, int *retries) return journal_force_commit_nested(EXT3_SB(sb)->s_journal); } -/* - * ext3_new_block uses a goal block to assist allocation. If the goal is - * free, or there is a free block within 32 blocks of the goal, that block - * is allocated. Otherwise a forward search is made for a free block; within - * each block group the search first looks for an entire free byte in the block - * bitmap, and then for any free bit if that fails. - * This function also updates quota and i_blocks field. +/** + * ext3_new_blocks() -- core block(s) allocation function + * @handle: handle to this transaction + * @inode: file inode + * @goal: given target block(filesystem wide) + * @count: target number of blocks to allocate + * @errp: error code + * + * ext3_new_blocks uses a goal block to assist allocation. It tries to + * allocate block(s) from the block group contains the goal block first. If that + * fails, it will try to allocate block(s) from other block groups without + * any specific goal block. + * */ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode, ext3_fsblk_t goal, unsigned long *count, int *errp) @@ -1303,7 +1503,7 @@ retry_alloc: smp_rmb(); /* - * Now search the rest of the groups. We assume that + * Now search the rest of the groups. We assume that * i and gdp correctly point to the last group visited. */ for (bgi = 0; bgi < ngroups; bgi++) { @@ -1428,7 +1628,7 @@ allocated: spin_lock(sb_bgl_lock(sbi, group_no)); gdp->bg_free_blocks_count = - cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) - num); + cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)-num); spin_unlock(sb_bgl_lock(sbi, group_no)); percpu_counter_mod(&sbi->s_freeblocks_counter, -num); @@ -1471,6 +1671,12 @@ ext3_fsblk_t ext3_new_block(handle_t *handle, struct inode *inode, return ext3_new_blocks(handle, inode, goal, &count, errp); } +/** + * ext3_count_free_blocks() -- count filesystem free blocks + * @sb: superblock + * + * Adds up the number of free blocks from each block group. + */ ext3_fsblk_t ext3_count_free_blocks(struct super_block *sb) { ext3_fsblk_t desc_count; diff --git a/fs/ext3/bitmap.c b/fs/ext3/bitmap.c index ce4f82b9e528..b9176eed98d1 100644 --- a/fs/ext3/bitmap.c +++ b/fs/ext3/bitmap.c @@ -20,7 +20,7 @@ unsigned long ext3_count_free (struct buffer_head * map, unsigned int numchars) unsigned int i; unsigned long sum = 0; - if (!map) + if (!map) return (0); for (i = 0; i < numchars; i++) sum += nibblemap[map->b_data[i] & 0xf] + diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index fbb0d4ed07d4..429acbb4e064 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c @@ -59,7 +59,7 @@ static unsigned char get_dtype(struct super_block *sb, int filetype) return (ext3_filetype_table[filetype]); } - + int ext3_check_dir_entry (const char * function, struct inode * dir, struct ext3_dir_entry_2 * de, @@ -67,7 +67,7 @@ int ext3_check_dir_entry (const char * function, struct inode * dir, unsigned long offset) { const char * error_msg = NULL; - const int rlen = le16_to_cpu(de->rec_len); + const int rlen = le16_to_cpu(de->rec_len); if (rlen < EXT3_DIR_REC_LEN(1)) error_msg = "rec_len is smaller than minimal"; @@ -162,7 +162,7 @@ revalidate: * to make sure. */ if (filp->f_version != inode->i_version) { for (i = 0; i < sb->s_blocksize && i < offset; ) { - de = (struct ext3_dir_entry_2 *) + de = (struct ext3_dir_entry_2 *) (bh->b_data + i); /* It's too expensive to do a full * dirent test each time round this @@ -181,7 +181,7 @@ revalidate: filp->f_version = inode->i_version; } - while (!error && filp->f_pos < inode->i_size + while (!error && filp->f_pos < inode->i_size && offset < sb->s_blocksize) { de = (struct ext3_dir_entry_2 *) (bh->b_data + offset); if (!ext3_check_dir_entry ("ext3_readdir", inode, de, @@ -229,7 +229,7 @@ out: /* * These functions convert from the major/minor hash to an f_pos * value. - * + * * Currently we only use major hash numer. This is unfortunate, but * on 32-bit machines, the same VFS interface is used for lseek and * llseek, so if we use the 64 bit offset, then the 32-bit versions of @@ -250,7 +250,7 @@ out: struct fname { __u32 hash; __u32 minor_hash; - struct rb_node rb_hash; + struct rb_node rb_hash; struct fname *next; __u32 inode; __u8 name_len; @@ -343,10 +343,9 @@ int ext3_htree_store_dirent(struct file *dir_file, __u32 hash, /* Create and allocate the fname structure */ len = sizeof(struct fname) + dirent->name_len + 1; - new_fn = kmalloc(len, GFP_KERNEL); + new_fn = kzalloc(len, GFP_KERNEL); if (!new_fn) return -ENOMEM; - memset(new_fn, 0, len); new_fn->hash = hash; new_fn->minor_hash = minor_hash; new_fn->inode = le32_to_cpu(dirent->inode); @@ -410,7 +409,7 @@ static int call_filldir(struct file * filp, void * dirent, curr_pos = hash2pos(fname->hash, fname->minor_hash); while (fname) { error = filldir(dirent, fname->name, - fname->name_len, curr_pos, + fname->name_len, curr_pos, fname->inode, get_dtype(sb, fname->file_type)); if (error) { @@ -465,7 +464,7 @@ static int ext3_dx_readdir(struct file * filp, /* * Fill the rbtree if we have no more entries, * or the inode has changed since we last read in the - * cached entries. + * cached entries. */ if ((!info->curr_node) || (filp->f_version != inode->i_version)) { diff --git a/fs/ext3/file.c b/fs/ext3/file.c index 1efefb630ea9..994efd189f4e 100644 --- a/fs/ext3/file.c +++ b/fs/ext3/file.c @@ -100,7 +100,7 @@ ext3_file_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t force_commit: err = ext3_force_commit(inode->i_sb); - if (err) + if (err) return err; return ret; } diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c index 49382a208e05..dd1fd3c0fc05 100644 --- a/fs/ext3/fsync.c +++ b/fs/ext3/fsync.c @@ -8,14 +8,14 @@ * Universite Pierre et Marie Curie (Paris VI) * from * linux/fs/minix/truncate.c Copyright (C) 1991, 1992 Linus Torvalds - * + * * ext3fs fsync primitive * * Big-endian to little-endian byte-swapping/bitmaps by * David S. Miller (davem@caip.rutgers.edu), 1995 - * + * * Removed unnecessary code duplication for little endian machines - * and excessive __inline__s. + * and excessive __inline__s. * Andi Kleen, 1997 * * Major simplications and cleanup - we only need to do the metadata, because diff --git a/fs/ext3/hash.c b/fs/ext3/hash.c index 5a2d1235ead0..deeb27b5ba83 100644 --- a/fs/ext3/hash.c +++ b/fs/ext3/hash.c @@ -4,7 +4,7 @@ * Copyright (C) 2002 by Theodore Ts'o * * This file is released under the GPL v2. - * + * * This file may be redistributed under the terms of the GNU Public * License. */ @@ -80,11 +80,11 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) * Returns the hash of a filename. If len is 0 and name is NULL, then * this function can be used to test whether or not a hash version is * supported. - * + * * The seed is an 4 longword (32 bits) "secret" which can be used to * uniquify a hash. If the seed is all zero's, then some default seed * may be used. - * + * * A particular hash version specifies whether or not the seed is * represented, and whether or not the returned hash is 32 bits or 64 * bits. 32 bit hashes will return 0 for the minor hash. @@ -95,7 +95,7 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) __u32 minor_hash = 0; const char *p; int i; - __u32 in[8], buf[4]; + __u32 in[8], buf[4]; /* Initialize the default seed for the hash checksum functions */ buf[0] = 0x67452301; diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index 36546ed36a14..e45dbd651736 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -202,7 +202,7 @@ error_return: static int find_group_dir(struct super_block *sb, struct inode *parent) { int ngroups = EXT3_SB(sb)->s_groups_count; - int freei, avefreei; + unsigned int freei, avefreei; struct ext3_group_desc *desc, *best_desc = NULL; struct buffer_head *bh; int group, best_group = -1; @@ -216,7 +216,7 @@ static int find_group_dir(struct super_block *sb, struct inode *parent) continue; if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) continue; - if (!best_desc || + if (!best_desc || (le16_to_cpu(desc->bg_free_blocks_count) > le16_to_cpu(best_desc->bg_free_blocks_count))) { best_group = group; @@ -226,30 +226,30 @@ static int find_group_dir(struct super_block *sb, struct inode *parent) return best_group; } -/* - * Orlov's allocator for directories. - * +/* + * Orlov's allocator for directories. + * * We always try to spread first-level directories. * - * If there are blockgroups with both free inodes and free blocks counts - * not worse than average we return one with smallest directory count. - * Otherwise we simply return a random group. - * - * For the rest rules look so: - * - * It's OK to put directory into a group unless - * it has too many directories already (max_dirs) or - * it has too few free inodes left (min_inodes) or - * it has too few free blocks left (min_blocks) or - * it's already running too large debt (max_debt). - * Parent's group is prefered, if it doesn't satisfy these - * conditions we search cyclically through the rest. If none - * of the groups look good we just look for a group with more - * free inodes than average (starting at parent's group). - * - * Debt is incremented each time we allocate a directory and decremented - * when we allocate an inode, within 0--255. - */ + * If there are blockgroups with both free inodes and free blocks counts + * not worse than average we return one with smallest directory count. + * Otherwise we simply return a random group. + * + * For the rest rules look so: + * + * It's OK to put directory into a group unless + * it has too many directories already (max_dirs) or + * it has too few free inodes left (min_inodes) or + * it has too few free blocks left (min_blocks) or + * it's already running too large debt (max_debt). + * Parent's group is prefered, if it doesn't satisfy these + * conditions we search cyclically through the rest. If none + * of the groups look good we just look for a group with more + * free inodes than average (starting at parent's group). + * + * Debt is incremented each time we allocate a directory and decremented + * when we allocate an inode, within 0--255. + */ #define INODE_COST 64 #define BLOCK_COST 256 @@ -261,10 +261,10 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent) struct ext3_super_block *es = sbi->s_es; int ngroups = sbi->s_groups_count; int inodes_per_group = EXT3_INODES_PER_GROUP(sb); - int freei, avefreei; + unsigned int freei, avefreei; ext3_fsblk_t freeb, avefreeb; ext3_fsblk_t blocks_per_dir; - int ndirs; + unsigned int ndirs; int max_debt, max_dirs, min_inodes; ext3_grpblk_t min_blocks; int group = -1, i; @@ -454,7 +454,7 @@ struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode) group = find_group_dir(sb, dir); else group = find_group_orlov(sb, dir); - } else + } else group = find_group_other(sb, dir); err = -ENOSPC; @@ -559,7 +559,6 @@ got: inode->i_ino = ino; /* This is the optimal IO size (for stat), not the fs block size */ - inode->i_blksize = PAGE_SIZE; inode->i_blocks = 0; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 84be02e93652..dcf4f1dd108b 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -13,11 +13,11 @@ * Copyright (C) 1991, 1992 Linus Torvalds * * Goal-directed block allocation by Stephen Tweedie - * (sct@redhat.com), 1993, 1998 + * (sct@redhat.com), 1993, 1998 * Big-endian to little-endian byte-swapping/bitmaps by * David S. Miller (davem@caip.rutgers.edu), 1995 * 64-bit file support on 64-bit platforms by Jakub Jelinek - * (jj@sunsite.ms.mff.cuni.cz) + * (jj@sunsite.ms.mff.cuni.cz) * * Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000 */ @@ -55,7 +55,7 @@ static int ext3_inode_is_fast_symlink(struct inode *inode) /* * The ext3 forget function must perform a revoke if we are freeing data * which has been journaled. Metadata (eg. indirect blocks) must be - * revoked in all cases. + * revoked in all cases. * * "bh" may be NULL: a metadata block may have been freed from memory * but there may still be a record of it in the journal, and that record @@ -105,7 +105,7 @@ int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode, * Work out how many blocks we need to proceed with the next chunk of a * truncate transaction. */ -static unsigned long blocks_for_truncate(struct inode *inode) +static unsigned long blocks_for_truncate(struct inode *inode) { unsigned long needed; @@ -122,13 +122,13 @@ static unsigned long blocks_for_truncate(struct inode *inode) /* But we need to bound the transaction so we don't overflow the * journal. */ - if (needed > EXT3_MAX_TRANS_DATA) + if (needed > EXT3_MAX_TRANS_DATA) needed = EXT3_MAX_TRANS_DATA; return EXT3_DATA_TRANS_BLOCKS(inode->i_sb) + needed; } -/* +/* * Truncate transactions can be complex and absolutely huge. So we need to * be able to restart the transaction at a conventient checkpoint to make * sure we don't overflow the journal. @@ -136,9 +136,9 @@ static unsigned long blocks_for_truncate(struct inode *inode) * start_transaction gets us a new handle for a truncate transaction, * and extend_transaction tries to extend the existing one a bit. If * extend fails, we need to propagate the failure up and restart the - * transaction in the top-level truncate loop. --sct + * transaction in the top-level truncate loop. --sct */ -static handle_t *start_transaction(struct inode *inode) +static handle_t *start_transaction(struct inode *inode) { handle_t *result; @@ -215,12 +215,12 @@ void ext3_delete_inode (struct inode * inode) ext3_orphan_del(handle, inode); EXT3_I(inode)->i_dtime = get_seconds(); - /* + /* * One subtle ordering requirement: if anything has gone wrong * (transaction abort, IO errors, whatever), then we can still * do these next steps (the fs will already have been marked as * having errors), but we can't free the inode if the mark_dirty - * fails. + * fails. */ if (ext3_mark_inode_dirty(handle, inode)) /* If that failed, just do the required in-core inode clear. */ @@ -398,7 +398,7 @@ no_block: * + if there is a block to the left of our position - allocate near it. * + if pointer will live in indirect block - allocate near that block. * + if pointer will live in inode - allocate in the same - * cylinder group. + * cylinder group. * * In the latter case we colour the starting block by the callers PID to * prevent it from clashing with concurrent allocations for a different inode @@ -470,7 +470,7 @@ static ext3_fsblk_t ext3_find_goal(struct inode *inode, long block, * ext3_blks_to_allocate: Look up the block map and count the number * of direct blocks need to be allocated for the given branch. * - * @branch: chain of indirect blocks + * @branch: chain of indirect blocks * @k: number of blocks need for indirect blocks * @blks: number of data blocks to be mapped. * @blocks_to_boundary: the offset in the indirect block @@ -744,7 +744,7 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode, jbd_debug(5, "splicing indirect only\n"); BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata"); err = ext3_journal_dirty_metadata(handle, where->bh); - if (err) + if (err) goto err_out; } else { /* @@ -1098,7 +1098,7 @@ static int walk_page_buffers( handle_t *handle, for ( bh = head, block_start = 0; ret == 0 && (bh != head || !block_start); - block_start = block_end, bh = next) + block_start = block_end, bh = next) { next = bh->b_this_page; block_end = block_start + blocksize; @@ -1137,7 +1137,7 @@ static int walk_page_buffers( handle_t *handle, * So what we do is to rely on the fact that journal_stop/journal_start * will _not_ run commit under these circumstances because handle->h_ref * is elevated. We'll still have enough credits for the tiny quotafile - * write. + * write. */ static int do_journal_get_write_access(handle_t *handle, struct buffer_head *bh) @@ -1282,7 +1282,7 @@ static int ext3_journalled_commit_write(struct file *file, if (inode->i_size > EXT3_I(inode)->i_disksize) { EXT3_I(inode)->i_disksize = inode->i_size; ret2 = ext3_mark_inode_dirty(handle, inode); - if (!ret) + if (!ret) ret = ret2; } ret2 = ext3_journal_stop(handle); @@ -1291,7 +1291,7 @@ static int ext3_journalled_commit_write(struct file *file, return ret; } -/* +/* * bmap() is special. It gets used by applications such as lilo and by * the swapper to find the on-disk block of a specific piece of data. * @@ -1300,10 +1300,10 @@ static int ext3_journalled_commit_write(struct file *file, * filesystem and enables swap, then they may get a nasty shock when the * data getting swapped to that swapfile suddenly gets overwritten by * the original zero's written out previously to the journal and - * awaiting writeback in the kernel's buffer cache. + * awaiting writeback in the kernel's buffer cache. * * So, if we see any bmap calls here on a modified, data-journaled file, - * take extra steps to flush any blocks which might be in the cache. + * take extra steps to flush any blocks which might be in the cache. */ static sector_t ext3_bmap(struct address_space *mapping, sector_t block) { @@ -1312,16 +1312,16 @@ static sector_t ext3_bmap(struct address_space *mapping, sector_t block) int err; if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) { - /* + /* * This is a REALLY heavyweight approach, but the use of * bmap on dirty files is expected to be extremely rare: * only if we run lilo or swapon on a freshly made file - * do we expect this to happen. + * do we expect this to happen. * * (bmap requires CAP_SYS_RAWIO so this does not * represent an unprivileged user DOS attack --- we'd be * in trouble if mortal users could trigger this path at - * will.) + * will.) * * NB. EXT3_STATE_JDATA is not set on files other than * regular files. If somebody wants to bmap a directory @@ -1457,7 +1457,7 @@ static int ext3_ordered_writepage(struct page *page, */ /* - * And attach them to the current transaction. But only if + * And attach them to the current transaction. But only if * block_write_full_page() succeeded. Otherwise they are unmapped, * and generally junk. */ @@ -1644,7 +1644,7 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb, } } - ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, + ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, ext3_get_block, NULL); @@ -2025,7 +2025,7 @@ static void ext3_free_data(handle_t *handle, struct inode *inode, __le32 *first, __le32 *last) { ext3_fsblk_t block_to_free = 0; /* Starting block # of a run */ - unsigned long count = 0; /* Number of blocks in the run */ + unsigned long count = 0; /* Number of blocks in the run */ __le32 *block_to_free_p = NULL; /* Pointer into inode/ind corresponding to block_to_free */ @@ -2054,7 +2054,7 @@ static void ext3_free_data(handle_t *handle, struct inode *inode, } else if (nr == block_to_free + count) { count++; } else { - ext3_clear_blocks(handle, inode, this_bh, + ext3_clear_blocks(handle, inode, this_bh, block_to_free, count, block_to_free_p, p); block_to_free = nr; @@ -2115,7 +2115,7 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode, */ if (!bh) { ext3_error(inode->i_sb, "ext3_free_branches", - "Read failure, inode=%ld, block="E3FSBLK, + "Read failure, inode=%lu, block="E3FSBLK, inode->i_ino, nr); continue; } @@ -2184,7 +2184,7 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode, *p = 0; BUFFER_TRACE(parent_bh, "call ext3_journal_dirty_metadata"); - ext3_journal_dirty_metadata(handle, + ext3_journal_dirty_metadata(handle, parent_bh); } } @@ -2632,9 +2632,6 @@ void ext3_read_inode(struct inode * inode) * recovery code: that's fine, we're about to complete * the process of deleting those. */ } - inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size - * (for stat), not the fs block - * size */ inode->i_blocks = le32_to_cpu(raw_inode->i_blocks); ei->i_flags = le32_to_cpu(raw_inode->i_flags); #ifdef EXT3_FRAGMENTS @@ -2704,7 +2701,7 @@ void ext3_read_inode(struct inode * inode) if (raw_inode->i_block[0]) init_special_inode(inode, inode->i_mode, old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); - else + else init_special_inode(inode, inode->i_mode, new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); } @@ -2724,8 +2721,8 @@ bad_inode: * * The caller must have write access to iloc->bh. */ -static int ext3_do_update_inode(handle_t *handle, - struct inode *inode, +static int ext3_do_update_inode(handle_t *handle, + struct inode *inode, struct ext3_iloc *iloc) { struct ext3_inode *raw_inode = ext3_raw_inode(iloc); @@ -2900,7 +2897,7 @@ int ext3_write_inode(struct inode *inode, int wait) * commit will leave the blocks being flushed in an unused state on * disk. (On recovery, the inode will get truncated and the blocks will * be freed, so we have a strong guarantee that no future commit will - * leave these blocks visible to the user.) + * leave these blocks visible to the user.) * * Called with inode->sem down. */ @@ -3043,13 +3040,13 @@ int ext3_mark_iloc_dirty(handle_t *handle, return err; } -/* +/* * On success, We end up with an outstanding reference count against - * iloc->bh. This _must_ be cleaned up later. + * iloc->bh. This _must_ be cleaned up later. */ int -ext3_reserve_inode_write(handle_t *handle, struct inode *inode, +ext3_reserve_inode_write(handle_t *handle, struct inode *inode, struct ext3_iloc *iloc) { int err = 0; @@ -3139,7 +3136,7 @@ out: } #if 0 -/* +/* * Bind an inode's backing buffer_head into this transaction, to prevent * it from being flushed to disk early. Unlike * ext3_reserve_inode_write, this leaves behind no bh reference and @@ -3157,7 +3154,7 @@ static int ext3_pin_inode(handle_t *handle, struct inode *inode) BUFFER_TRACE(iloc.bh, "get_write_access"); err = journal_get_write_access(handle, iloc.bh); if (!err) - err = ext3_journal_dirty_metadata(handle, + err = ext3_journal_dirty_metadata(handle, iloc.bh); brelse(iloc.bh); } diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 2aa7101b27cd..85d132c37ee0 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -15,13 +15,13 @@ * Big-endian to little-endian byte-swapping/bitmaps by * David S. Miller (davem@caip.rutgers.edu), 1995 * Directory entry file type support and forward compatibility hooks - * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998 + * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998 * Hash Tree Directory indexing (c) - * Daniel Phillips, 2001 + * Daniel Phillips, 2001 * Hash Tree Directory indexing porting - * Christopher Li, 2002 + * Christopher Li, 2002 * Hash Tree Directory indexing cleanup - * Theodore Ts'o, 2002 + * Theodore Ts'o, 2002 */ #include <linux/fs.h> @@ -76,7 +76,7 @@ static struct buffer_head *ext3_append(handle_t *handle, #ifdef DX_DEBUG #define dxtrace(command) command #else -#define dxtrace(command) +#define dxtrace(command) #endif struct fake_dirent @@ -169,7 +169,7 @@ static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size); static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block); static int ext3_htree_next_block(struct inode *dir, __u32 hash, struct dx_frame *frame, - struct dx_frame *frames, + struct dx_frame *frames, __u32 *start_hash); static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, struct ext3_dir_entry_2 **res_dir, int *err); @@ -250,7 +250,7 @@ static void dx_show_index (char * label, struct dx_entry *entries) } struct stats -{ +{ unsigned names; unsigned space; unsigned bcount; @@ -278,7 +278,7 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3_dir_ent ((char *) de - base)); } space += EXT3_DIR_REC_LEN(de->name_len); - names++; + names++; } de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len)); } @@ -464,7 +464,7 @@ static void dx_release (struct dx_frame *frames) */ static int ext3_htree_next_block(struct inode *dir, __u32 hash, struct dx_frame *frame, - struct dx_frame *frames, + struct dx_frame *frames, __u32 *start_hash) { struct dx_frame *p; @@ -632,7 +632,7 @@ int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash, } count += ret; hashval = ~0; - ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS, + ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS, frame, frames, &hashval); *next_hash = hashval; if (ret < 0) { @@ -649,7 +649,7 @@ int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash, break; } dx_release(frames); - dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n", + dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n", count, *next_hash)); return count; errout: @@ -1050,7 +1050,7 @@ struct dentry *ext3_get_parent(struct dentry *child) parent = ERR_PTR(-ENOMEM); } return parent; -} +} #define S_SHIFT 12 static unsigned char ext3_type_by_mode[S_IFMT >> S_SHIFT] = { @@ -1198,7 +1198,7 @@ errout: * add_dirent_to_buf will attempt search the directory block for * space. It will return -ENOSPC if no space is available, and -EIO * and -EEXIST if directory entry already exists. - * + * * NOTE! bh is NOT released in the case where ENOSPC is returned. In * all other cases bh is released. */ @@ -1572,7 +1572,7 @@ cleanup: * ext3_delete_entry deletes a directory entry by merging it with the * previous entry */ -static int ext3_delete_entry (handle_t *handle, +static int ext3_delete_entry (handle_t *handle, struct inode * dir, struct ext3_dir_entry_2 * de_del, struct buffer_head * bh) @@ -1643,12 +1643,12 @@ static int ext3_add_nondir(handle_t *handle, * is so far negative - it has no inode. * * If the create succeeds, we fill in the inode information - * with d_instantiate(). + * with d_instantiate(). */ static int ext3_create (struct inode * dir, struct dentry * dentry, int mode, struct nameidata *nd) { - handle_t *handle; + handle_t *handle; struct inode * inode; int err, retries = 0; @@ -1688,7 +1688,7 @@ static int ext3_mknod (struct inode * dir, struct dentry *dentry, retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -1813,10 +1813,10 @@ static int empty_dir (struct inode * inode) de1 = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len)); if (le32_to_cpu(de->inode) != inode->i_ino || - !le32_to_cpu(de1->inode) || + !le32_to_cpu(de1->inode) || strcmp (".", de->name) || strcmp ("..", de1->name)) { - ext3_warning (inode->i_sb, "empty_dir", + ext3_warning (inode->i_sb, "empty_dir", "bad directory (dir #%lu) - no `.' or `..'", inode->i_ino); brelse (bh); @@ -1883,7 +1883,7 @@ int ext3_orphan_add(handle_t *handle, struct inode *inode) * being truncated, or files being unlinked. */ /* @@@ FIXME: Observation from aviro: - * I think I can trigger J_ASSERT in ext3_orphan_add(). We block + * I think I can trigger J_ASSERT in ext3_orphan_add(). We block * here (on lock_super()), so race with ext3_link() which might bump * ->i_nlink. For, say it, character device. Not a regular file, * not a directory, not a symlink and ->i_nlink > 0. @@ -1919,8 +1919,8 @@ int ext3_orphan_add(handle_t *handle, struct inode *inode) if (!err) list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan); - jbd_debug(4, "superblock will point to %ld\n", inode->i_ino); - jbd_debug(4, "orphan inode %ld will point to %d\n", + jbd_debug(4, "superblock will point to %lu\n", inode->i_ino); + jbd_debug(4, "orphan inode %lu will point to %d\n", inode->i_ino, NEXT_ORPHAN(inode)); out_unlock: unlock_super(sb); @@ -2129,7 +2129,7 @@ static int ext3_symlink (struct inode * dir, retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 + + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 + 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2227,7 +2227,7 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry, DQUOT_INIT(new_dentry->d_inode); handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS(old_dir->i_sb) + - EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2); + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2393,4 +2393,4 @@ struct inode_operations ext3_special_inode_operations = { .removexattr = generic_removexattr, #endif .permission = ext3_permission, -}; +}; diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index 5e1337fd878a..b73cba12f79c 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -336,7 +336,7 @@ static int verify_reserved_gdb(struct super_block *sb, unsigned five = 5; unsigned seven = 7; unsigned grp; - __u32 *p = (__u32 *)primary->b_data; + __le32 *p = (__le32 *)primary->b_data; int gdbackups = 0; while ((grp = ext3_list_backups(sb, &three, &five, &seven)) < end) { @@ -380,7 +380,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, struct buffer_head *dind; int gdbackups; struct ext3_iloc iloc; - __u32 *data; + __le32 *data; int err; if (test_opt(sb, DEBUG)) @@ -417,7 +417,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, goto exit_bh; } - data = (__u32 *)dind->b_data; + data = (__le32 *)dind->b_data; if (le32_to_cpu(data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)]) != gdblock) { ext3_warning(sb, __FUNCTION__, "new group %u GDT block "E3FSBLK" not reserved", @@ -439,8 +439,8 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, if ((err = ext3_reserve_inode_write(handle, inode, &iloc))) goto exit_dindj; - n_group_desc = (struct buffer_head **)kmalloc((gdb_num + 1) * - sizeof(struct buffer_head *), GFP_KERNEL); + n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *), + GFP_KERNEL); if (!n_group_desc) { err = -ENOMEM; ext3_warning (sb, __FUNCTION__, @@ -519,7 +519,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, struct buffer_head *dind; struct ext3_iloc iloc; ext3_fsblk_t blk; - __u32 *data, *end; + __le32 *data, *end; int gdbackups = 0; int res, i; int err; @@ -536,8 +536,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, } blk = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + EXT3_SB(sb)->s_gdb_count; - data = (__u32 *)dind->b_data + EXT3_SB(sb)->s_gdb_count; - end = (__u32 *)dind->b_data + EXT3_ADDR_PER_BLOCK(sb); + data = (__le32 *)dind->b_data + EXT3_SB(sb)->s_gdb_count; + end = (__le32 *)dind->b_data + EXT3_ADDR_PER_BLOCK(sb); /* Get each reserved primary GDT block and verify it holds backups */ for (res = 0; res < reserved_gdb; res++, blk++) { @@ -545,7 +545,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, ext3_warning(sb, __FUNCTION__, "reserved block "E3FSBLK " not at offset %ld", - blk, (long)(data - (__u32 *)dind->b_data)); + blk, + (long)(data - (__le32 *)dind->b_data)); err = -EINVAL; goto exit_bh; } @@ -560,7 +561,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, goto exit_bh; } if (++data >= end) - data = (__u32 *)dind->b_data; + data = (__le32 *)dind->b_data; } for (i = 0; i < reserved_gdb; i++) { @@ -584,7 +585,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, blk = input->group * EXT3_BLOCKS_PER_GROUP(sb); for (i = 0; i < reserved_gdb; i++) { int err2; - data = (__u32 *)primary[i]->b_data; + data = (__le32 *)primary[i]->b_data; /* printk("reserving backup %lu[%u] = %lu\n", primary[i]->b_blocknr, gdbackups, blk + primary[i]->b_blocknr); */ @@ -689,7 +690,7 @@ exit_err: "can't update backup for group %d (err %d), " "forcing fsck on next reboot", group, err); sbi->s_mount_state &= ~EXT3_VALID_FS; - sbi->s_es->s_state &= ~cpu_to_le16(EXT3_VALID_FS); + sbi->s_es->s_state &= cpu_to_le16(~EXT3_VALID_FS); mark_buffer_dirty(sbi->s_sbh); } } @@ -730,6 +731,18 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) return -EPERM; } + if (le32_to_cpu(es->s_blocks_count) + input->blocks_count < + le32_to_cpu(es->s_blocks_count)) { + ext3_warning(sb, __FUNCTION__, "blocks_count overflow\n"); + return -EINVAL; + } + + if (le32_to_cpu(es->s_inodes_count) + EXT3_INODES_PER_GROUP(sb) < + le32_to_cpu(es->s_inodes_count)) { + ext3_warning(sb, __FUNCTION__, "inodes_count overflow\n"); + return -EINVAL; + } + if (reserved_gdb || gdb_off == 0) { if (!EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_RESIZE_INODE)){ @@ -958,6 +971,11 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, add = EXT3_BLOCKS_PER_GROUP(sb) - last; + if (o_blocks_count + add < o_blocks_count) { + ext3_warning(sb, __FUNCTION__, "blocks_count overflow"); + return -EINVAL; + } + if (o_blocks_count + add > n_blocks_count) add = n_blocks_count - o_blocks_count; diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 3559086eee5f..8bfd56ef18ca 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -45,7 +45,7 @@ static int ext3_load_journal(struct super_block *, struct ext3_super_block *, unsigned long journal_devnum); static int ext3_create_journal(struct super_block *, struct ext3_super_block *, - int); + unsigned int); static void ext3_commit_super (struct super_block * sb, struct ext3_super_block * es, int sync); @@ -62,13 +62,13 @@ static void ext3_unlockfs(struct super_block *sb); static void ext3_write_super (struct super_block * sb); static void ext3_write_super_lockfs(struct super_block *sb); -/* +/* * Wrappers for journal_start/end. * * The only special thing we need to do here is to make sure that all * journal_end calls result in the superblock being marked dirty, so * that sync() will call the filesystem's write_super callback if - * appropriate. + * appropriate. */ handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks) { @@ -90,11 +90,11 @@ handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks) return journal_start(journal, nblocks); } -/* +/* * The only special thing we need to do here is to make sure that all * journal_stop calls result in the superblock being marked dirty, so * that sync() will call the filesystem's write_super callback if - * appropriate. + * appropriate. */ int __ext3_journal_stop(const char *where, handle_t *handle) { @@ -159,20 +159,21 @@ static void ext3_handle_error(struct super_block *sb) if (sb->s_flags & MS_RDONLY) return; - if (test_opt (sb, ERRORS_RO)) { - printk (KERN_CRIT "Remounting filesystem read-only\n"); - sb->s_flags |= MS_RDONLY; - } else { + if (!test_opt (sb, ERRORS_CONT)) { journal_t *journal = EXT3_SB(sb)->s_journal; EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT; if (journal) journal_abort(journal, -EIO); } + if (test_opt (sb, ERRORS_RO)) { + printk (KERN_CRIT "Remounting filesystem read-only\n"); + sb->s_flags |= MS_RDONLY; + } + ext3_commit_super(sb, es, 1); if (test_opt(sb, ERRORS_PANIC)) panic("EXT3-fs (device %s): panic forced after error\n", sb->s_id); - ext3_commit_super(sb, es, 1); } void ext3_error (struct super_block * sb, const char * function, @@ -369,16 +370,16 @@ static void dump_orphan_list(struct super_block *sb, struct ext3_sb_info *sbi) { struct list_head *l; - printk(KERN_ERR "sb orphan head is %d\n", + printk(KERN_ERR "sb orphan head is %d\n", le32_to_cpu(sbi->s_es->s_last_orphan)); printk(KERN_ERR "sb_info orphan list:\n"); list_for_each(l, &sbi->s_orphan) { struct inode *inode = orphan_list_entry(l); printk(KERN_ERR " " - "inode %s:%ld at %p: mode %o, nlink %d, next %d\n", + "inode %s:%lu at %p: mode %o, nlink %d, next %d\n", inode->i_sb->s_id, inode->i_ino, inode, - inode->i_mode, inode->i_nlink, + inode->i_mode, inode->i_nlink, NEXT_ORPHAN(inode)); } } @@ -475,7 +476,7 @@ static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags) inode_init_once(&ei->vfs_inode); } } - + static int init_inodecache(void) { ext3_inode_cachep = kmem_cache_create("ext3_inode_cache", @@ -490,8 +491,7 @@ static int init_inodecache(void) static void destroy_inodecache(void) { - if (kmem_cache_destroy(ext3_inode_cachep)) - printk(KERN_INFO "ext3_inode_cache: not all structures were freed\n"); + kmem_cache_destroy(ext3_inode_cachep); } static void ext3_clear_inode(struct inode *inode) @@ -733,8 +733,8 @@ static match_table_t tokens = { static ext3_fsblk_t get_sb_block(void **data) { - ext3_fsblk_t sb_block; - char *options = (char *) *data; + ext3_fsblk_t sb_block; + char *options = (char *) *data; if (!options || strncmp(options, "sb=", 3) != 0) return 1; /* Default location */ @@ -753,7 +753,7 @@ static ext3_fsblk_t get_sb_block(void **data) } static int parse_options (char *options, struct super_block *sb, - unsigned long *inum, unsigned long *journal_devnum, + unsigned int *inum, unsigned long *journal_devnum, ext3_fsblk_t *n_blocks_count, int is_remount) { struct ext3_sb_info *sbi = EXT3_SB(sb); @@ -1174,7 +1174,8 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es, static int ext3_check_descriptors (struct super_block * sb) { struct ext3_sb_info *sbi = EXT3_SB(sb); - ext3_fsblk_t block = le32_to_cpu(sbi->s_es->s_first_data_block); + ext3_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block); + ext3_fsblk_t last_block; struct ext3_group_desc * gdp = NULL; int desc_block = 0; int i; @@ -1183,12 +1184,17 @@ static int ext3_check_descriptors (struct super_block * sb) for (i = 0; i < sbi->s_groups_count; i++) { + if (i == sbi->s_groups_count - 1) + last_block = le32_to_cpu(sbi->s_es->s_blocks_count) - 1; + else + last_block = first_block + + (EXT3_BLOCKS_PER_GROUP(sb) - 1); + if ((i % EXT3_DESC_PER_BLOCK(sb)) == 0) gdp = (struct ext3_group_desc *) sbi->s_group_desc[desc_block++]->b_data; - if (le32_to_cpu(gdp->bg_block_bitmap) < block || - le32_to_cpu(gdp->bg_block_bitmap) >= - block + EXT3_BLOCKS_PER_GROUP(sb)) + if (le32_to_cpu(gdp->bg_block_bitmap) < first_block || + le32_to_cpu(gdp->bg_block_bitmap) > last_block) { ext3_error (sb, "ext3_check_descriptors", "Block bitmap for group %d" @@ -1197,9 +1203,8 @@ static int ext3_check_descriptors (struct super_block * sb) le32_to_cpu(gdp->bg_block_bitmap)); return 0; } - if (le32_to_cpu(gdp->bg_inode_bitmap) < block || - le32_to_cpu(gdp->bg_inode_bitmap) >= - block + EXT3_BLOCKS_PER_GROUP(sb)) + if (le32_to_cpu(gdp->bg_inode_bitmap) < first_block || + le32_to_cpu(gdp->bg_inode_bitmap) > last_block) { ext3_error (sb, "ext3_check_descriptors", "Inode bitmap for group %d" @@ -1208,9 +1213,9 @@ static int ext3_check_descriptors (struct super_block * sb) le32_to_cpu(gdp->bg_inode_bitmap)); return 0; } - if (le32_to_cpu(gdp->bg_inode_table) < block || - le32_to_cpu(gdp->bg_inode_table) + sbi->s_itb_per_group >= - block + EXT3_BLOCKS_PER_GROUP(sb)) + if (le32_to_cpu(gdp->bg_inode_table) < first_block || + le32_to_cpu(gdp->bg_inode_table) + sbi->s_itb_per_group > + last_block) { ext3_error (sb, "ext3_check_descriptors", "Inode table for group %d" @@ -1219,7 +1224,7 @@ static int ext3_check_descriptors (struct super_block * sb) le32_to_cpu(gdp->bg_inode_table)); return 0; } - block += EXT3_BLOCKS_PER_GROUP(sb); + first_block += EXT3_BLOCKS_PER_GROUP(sb); gdp++; } @@ -1301,17 +1306,17 @@ static void ext3_orphan_cleanup (struct super_block * sb, DQUOT_INIT(inode); if (inode->i_nlink) { printk(KERN_DEBUG - "%s: truncating inode %ld to %Ld bytes\n", + "%s: truncating inode %lu to %Ld bytes\n", __FUNCTION__, inode->i_ino, inode->i_size); - jbd_debug(2, "truncating inode %ld to %Ld bytes\n", + jbd_debug(2, "truncating inode %lu to %Ld bytes\n", inode->i_ino, inode->i_size); ext3_truncate(inode); nr_truncates++; } else { printk(KERN_DEBUG - "%s: deleting unreferenced inode %ld\n", + "%s: deleting unreferenced inode %lu\n", __FUNCTION__, inode->i_ino); - jbd_debug(2, "deleting unreferenced inode %ld\n", + jbd_debug(2, "deleting unreferenced inode %lu\n", inode->i_ino); nr_orphans++; } @@ -1390,7 +1395,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) ext3_fsblk_t sb_block = get_sb_block(&data); ext3_fsblk_t logic_sb_block; unsigned long offset = 0; - unsigned long journal_inum = 0; + unsigned int journal_inum = 0; unsigned long journal_devnum = 0; unsigned long def_mount_opts; struct inode *root; @@ -1401,11 +1406,10 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) int needs_recovery; __le32 features; - sbi = kmalloc(sizeof(*sbi), GFP_KERNEL); + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) return -ENOMEM; sb->s_fs_info = sbi; - memset(sbi, 0, sizeof(*sbi)); sbi->s_mount_opt = 0; sbi->s_resuid = EXT3_DEF_RESUID; sbi->s_resgid = EXT3_DEF_RESGID; @@ -1483,7 +1487,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) (EXT3_HAS_COMPAT_FEATURE(sb, ~0U) || EXT3_HAS_RO_COMPAT_FEATURE(sb, ~0U) || EXT3_HAS_INCOMPAT_FEATURE(sb, ~0U))) - printk(KERN_WARNING + printk(KERN_WARNING "EXT3-fs warning: feature flags set on rev 0 fs, " "running e2fsck is recommended\n"); /* @@ -1509,7 +1513,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) if (blocksize < EXT3_MIN_BLOCK_SIZE || blocksize > EXT3_MAX_BLOCK_SIZE) { - printk(KERN_ERR + printk(KERN_ERR "EXT3-fs: Unsupported filesystem blocksize %d on %s.\n", blocksize, sb->s_id); goto failed_mount; @@ -1533,14 +1537,14 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize; bh = sb_bread(sb, logic_sb_block); if (!bh) { - printk(KERN_ERR + printk(KERN_ERR "EXT3-fs: Can't read superblock on 2nd try.\n"); goto failed_mount; } es = (struct ext3_super_block *)(((char *)bh->b_data) + offset); sbi->s_es = es; if (es->s_magic != cpu_to_le16(EXT3_SUPER_MAGIC)) { - printk (KERN_ERR + printk (KERN_ERR "EXT3-fs: Magic mismatch, very weird !\n"); goto failed_mount; } @@ -1622,10 +1626,9 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) if (EXT3_BLOCKS_PER_GROUP(sb) == 0) goto cantfind_ext3; - sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) - - le32_to_cpu(es->s_first_data_block) + - EXT3_BLOCKS_PER_GROUP(sb) - 1) / - EXT3_BLOCKS_PER_GROUP(sb); + sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) - + le32_to_cpu(es->s_first_data_block) - 1) + / EXT3_BLOCKS_PER_GROUP(sb)) + 1; db_count = (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) / EXT3_DESC_PER_BLOCK(sb); sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *), @@ -1820,7 +1823,7 @@ out_fail: /* * Setup any per-fs journal parameters now. We'll do this both on * initial mount, once the journal has been initialised but before we've - * done any recovery; and again on any subsequent remount. + * done any recovery; and again on any subsequent remount. */ static void ext3_init_journal_params(struct super_block *sb, journal_t *journal) { @@ -1840,7 +1843,8 @@ static void ext3_init_journal_params(struct super_block *sb, journal_t *journal) spin_unlock(&journal->j_state_lock); } -static journal_t *ext3_get_journal(struct super_block *sb, int journal_inum) +static journal_t *ext3_get_journal(struct super_block *sb, + unsigned int journal_inum) { struct inode *journal_inode; journal_t *journal; @@ -1975,7 +1979,7 @@ static int ext3_load_journal(struct super_block *sb, unsigned long journal_devnum) { journal_t *journal; - int journal_inum = le32_to_cpu(es->s_journal_inum); + unsigned int journal_inum = le32_to_cpu(es->s_journal_inum); dev_t journal_dev; int err = 0; int really_read_only; @@ -2061,7 +2065,7 @@ static int ext3_load_journal(struct super_block *sb, static int ext3_create_journal(struct super_block * sb, struct ext3_super_block * es, - int journal_inum) + unsigned int journal_inum) { journal_t *journal; @@ -2074,7 +2078,7 @@ static int ext3_create_journal(struct super_block * sb, if (!(journal = ext3_get_journal(sb, journal_inum))) return -EINVAL; - printk(KERN_INFO "EXT3-fs: creating new journal on inode %d\n", + printk(KERN_INFO "EXT3-fs: creating new journal on inode %u\n", journal_inum); if (journal_create(journal)) { @@ -2342,10 +2346,8 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) */ ext3_clear_journal_err(sb, es); sbi->s_mount_state = le16_to_cpu(es->s_state); - if ((ret = ext3_group_extend(sb, es, n_blocks_count))) { - err = ret; + if ((err = ext3_group_extend(sb, es, n_blocks_count))) goto restore_opts; - } if (!ext3_setup_super (sb, es, 0)) sb->s_flags &= ~MS_RDONLY; } @@ -2734,7 +2736,7 @@ static int __init init_ext3_fs(void) out: destroy_inodecache(); out1: - exit_ext3_xattr(); + exit_ext3_xattr(); return err; } diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c index a44a0562203a..f86f2482f01d 100644 --- a/fs/ext3/xattr.c +++ b/fs/ext3/xattr.c @@ -75,7 +75,7 @@ #ifdef EXT3_XATTR_DEBUG # define ea_idebug(inode, f...) do { \ - printk(KERN_DEBUG "inode %s:%ld: ", \ + printk(KERN_DEBUG "inode %s:%lu: ", \ inode->i_sb->s_id, inode->i_ino); \ printk(f); \ printk("\n"); \ @@ -233,7 +233,7 @@ ext3_xattr_block_get(struct inode *inode, int name_index, const char *name, atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); if (ext3_xattr_check_block(bh)) { bad_block: ext3_error(inode->i_sb, __FUNCTION__, - "inode %ld: bad block "E3FSBLK, inode->i_ino, + "inode %lu: bad block "E3FSBLK, inode->i_ino, EXT3_I(inode)->i_file_acl); error = -EIO; goto cleanup; @@ -375,7 +375,7 @@ ext3_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size) atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); if (ext3_xattr_check_block(bh)) { ext3_error(inode->i_sb, __FUNCTION__, - "inode %ld: bad block "E3FSBLK, inode->i_ino, + "inode %lu: bad block "E3FSBLK, inode->i_ino, EXT3_I(inode)->i_file_acl); error = -EIO; goto cleanup; @@ -647,7 +647,7 @@ ext3_xattr_block_find(struct inode *inode, struct ext3_xattr_info *i, le32_to_cpu(BHDR(bs->bh)->h_refcount)); if (ext3_xattr_check_block(bs->bh)) { ext3_error(sb, __FUNCTION__, - "inode %ld: bad block "E3FSBLK, inode->i_ino, + "inode %lu: bad block "E3FSBLK, inode->i_ino, EXT3_I(inode)->i_file_acl); error = -EIO; goto cleanup; @@ -848,7 +848,7 @@ cleanup_dquot: bad_block: ext3_error(inode->i_sb, __FUNCTION__, - "inode %ld: bad block "E3FSBLK, inode->i_ino, + "inode %lu: bad block "E3FSBLK, inode->i_ino, EXT3_I(inode)->i_file_acl); goto cleanup; @@ -1077,14 +1077,14 @@ ext3_xattr_delete_inode(handle_t *handle, struct inode *inode) bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl); if (!bh) { ext3_error(inode->i_sb, __FUNCTION__, - "inode %ld: block "E3FSBLK" read error", inode->i_ino, + "inode %lu: block "E3FSBLK" read error", inode->i_ino, EXT3_I(inode)->i_file_acl); goto cleanup; } if (BHDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || BHDR(bh)->h_blocks != cpu_to_le32(1)) { ext3_error(inode->i_sb, __FUNCTION__, - "inode %ld: bad block "E3FSBLK, inode->i_ino, + "inode %lu: bad block "E3FSBLK, inode->i_ino, EXT3_I(inode)->i_file_acl); goto cleanup; } @@ -1211,7 +1211,7 @@ again: bh = sb_bread(inode->i_sb, ce->e_block); if (!bh) { ext3_error(inode->i_sb, __FUNCTION__, - "inode %ld: block %lu read error", + "inode %lu: block %lu read error", inode->i_ino, (unsigned long) ce->e_block); } else if (le32_to_cpu(BHDR(bh)->h_refcount) >= EXT3_XATTR_REFCOUNT_MAX) { diff --git a/fs/fat/cache.c b/fs/fat/cache.c index 97b967b84fc6..82cc4f59e3ba 100644 --- a/fs/fat/cache.c +++ b/fs/fat/cache.c @@ -58,8 +58,7 @@ int __init fat_cache_init(void) void fat_cache_destroy(void) { - if (kmem_cache_destroy(fat_cache_cachep)) - printk(KERN_INFO "fat_cache: not all structures were freed\n"); + kmem_cache_destroy(fat_cache_cachep); } static inline struct fat_cache *fat_cache_alloc(struct inode *inode) diff --git a/fs/fat/file.c b/fs/fat/file.c index 1ee25232e6af..d50fc47169c1 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -13,6 +13,7 @@ #include <linux/smp_lock.h> #include <linux/buffer_head.h> #include <linux/writeback.h> +#include <linux/blkdev.h> int fat_generic_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg) @@ -112,6 +113,16 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp, } } +static int fat_file_release(struct inode *inode, struct file *filp) +{ + if ((filp->f_mode & FMODE_WRITE) && + MSDOS_SB(inode->i_sb)->options.flush) { + fat_flush_inodes(inode->i_sb, inode, NULL); + blk_congestion_wait(WRITE, HZ/10); + } + return 0; +} + const struct file_operations fat_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, @@ -121,6 +132,7 @@ const struct file_operations fat_file_operations = { .aio_read = generic_file_aio_read, .aio_write = generic_file_aio_write, .mmap = generic_file_mmap, + .release = fat_file_release, .ioctl = fat_generic_ioctl, .fsync = file_fsync, .sendfile = generic_file_sendfile, @@ -289,6 +301,7 @@ void fat_truncate(struct inode *inode) lock_kernel(); fat_free(inode, nr_clusters); unlock_kernel(); + fat_flush_inodes(inode->i_sb, inode, NULL); } struct inode_operations fat_file_inode_operations = { diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 31b7174176ba..045738032a83 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -24,6 +24,7 @@ #include <linux/vfs.h> #include <linux/parser.h> #include <linux/uio.h> +#include <linux/writeback.h> #include <asm/unaligned.h> #ifndef CONFIG_FAT_DEFAULT_IOCHARSET @@ -50,14 +51,14 @@ static int fat_add_cluster(struct inode *inode) return err; } -static int __fat_get_blocks(struct inode *inode, sector_t iblock, - unsigned long *max_blocks, - struct buffer_head *bh_result, int create) +static inline int __fat_get_block(struct inode *inode, sector_t iblock, + unsigned long *max_blocks, + struct buffer_head *bh_result, int create) { struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); - sector_t phys; unsigned long mapped_blocks; + sector_t phys; int err, offset; err = fat_bmap(inode, iblock, &phys, &mapped_blocks); @@ -73,7 +74,7 @@ static int __fat_get_blocks(struct inode *inode, sector_t iblock, if (iblock != MSDOS_I(inode)->mmu_private >> sb->s_blocksize_bits) { fat_fs_panic(sb, "corrupted file size (i_pos %lld, %lld)", - MSDOS_I(inode)->i_pos, MSDOS_I(inode)->mmu_private); + MSDOS_I(inode)->i_pos, MSDOS_I(inode)->mmu_private); return -EIO; } @@ -93,34 +94,29 @@ static int __fat_get_blocks(struct inode *inode, sector_t iblock, err = fat_bmap(inode, iblock, &phys, &mapped_blocks); if (err) return err; + BUG_ON(!phys); BUG_ON(*max_blocks != mapped_blocks); set_buffer_new(bh_result); map_bh(bh_result, sb, phys); + return 0; } -static int fat_get_blocks(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) +static int fat_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) { struct super_block *sb = inode->i_sb; - int err; unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; + int err; - err = __fat_get_blocks(inode, iblock, &max_blocks, bh_result, create); + err = __fat_get_block(inode, iblock, &max_blocks, bh_result, create); if (err) return err; bh_result->b_size = max_blocks << sb->s_blocksize_bits; return 0; } -static int fat_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - unsigned long max_blocks = 1; - return __fat_get_blocks(inode, iblock, &max_blocks, bh_result, create); -} - static int fat_writepage(struct page *page, struct writeback_control *wbc) { return block_write_full_page(page, fat_get_block, wbc); @@ -188,7 +184,7 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb, * condition of fat_get_block() and ->truncate(). */ return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, - offset, nr_segs, fat_get_blocks, NULL); + offset, nr_segs, fat_get_block, NULL); } static sector_t _fat_bmap(struct address_space *mapping, sector_t block) @@ -375,8 +371,6 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) inode->i_flags |= S_IMMUTABLE; } MSDOS_I(inode)->i_attrs = de->attr & ATTR_UNUSED; - /* this is as close to the truth as we can get ... */ - inode->i_blksize = sbi->cluster_size; inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1)) & ~((loff_t)sbi->cluster_size - 1)) >> 9; inode->i_mtime.tv_sec = @@ -528,8 +522,7 @@ static int __init fat_init_inodecache(void) static void __exit fat_destroy_inodecache(void) { - if (kmem_cache_destroy(fat_inode_cachep)) - printk(KERN_INFO "fat_inode_cache: not all structures were freed\n"); + kmem_cache_destroy(fat_inode_cachep); } static int fat_remount(struct super_block *sb, int *flags, char *data) @@ -861,7 +854,7 @@ enum { Opt_charset, Opt_shortname_lower, Opt_shortname_win95, Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes, Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes, - Opt_obsolate, Opt_err, + Opt_obsolate, Opt_flush, Opt_err, }; static match_table_t fat_tokens = { @@ -893,7 +886,8 @@ static match_table_t fat_tokens = { {Opt_obsolate, "cvf_format=%20s"}, {Opt_obsolate, "cvf_options=%100s"}, {Opt_obsolate, "posix"}, - {Opt_err, NULL} + {Opt_flush, "flush"}, + {Opt_err, NULL}, }; static match_table_t msdos_tokens = { {Opt_nodots, "nodots"}, @@ -1034,6 +1028,9 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug, return 0; opts->codepage = option; break; + case Opt_flush: + opts->flush = 1; + break; /* msdos specific */ case Opt_dots: @@ -1137,7 +1134,6 @@ static int fat_read_root(struct inode *inode) MSDOS_I(inode)->i_start = 0; inode->i_size = sbi->dir_entries * sizeof(struct msdos_dir_entry); } - inode->i_blksize = sbi->cluster_size; inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1)) & ~((loff_t)sbi->cluster_size - 1)) >> 9; MSDOS_I(inode)->i_logstart = 0; @@ -1168,11 +1164,10 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, long error; char buf[50]; - sbi = kmalloc(sizeof(struct msdos_sb_info), GFP_KERNEL); + sbi = kzalloc(sizeof(struct msdos_sb_info), GFP_KERNEL); if (!sbi) return -ENOMEM; sb->s_fs_info = sbi; - memset(sbi, 0, sizeof(struct msdos_sb_info)); sb->s_flags |= MS_NODIRATIME; sb->s_magic = MSDOS_SUPER_MAGIC; @@ -1435,6 +1430,56 @@ out_fail: EXPORT_SYMBOL_GPL(fat_fill_super); +/* + * helper function for fat_flush_inodes. This writes both the inode + * and the file data blocks, waiting for in flight data blocks before + * the start of the call. It does not wait for any io started + * during the call + */ +static int writeback_inode(struct inode *inode) +{ + + int ret; + struct address_space *mapping = inode->i_mapping; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + .nr_to_write = 0, + }; + /* if we used WB_SYNC_ALL, sync_inode waits for the io for the + * inode to finish. So WB_SYNC_NONE is sent down to sync_inode + * and filemap_fdatawrite is used for the data blocks + */ + ret = sync_inode(inode, &wbc); + if (!ret) + ret = filemap_fdatawrite(mapping); + return ret; +} + +/* + * write data and metadata corresponding to i1 and i2. The io is + * started but we do not wait for any of it to finish. + * + * filemap_flush is used for the block device, so if there is a dirty + * page for a block already in flight, we will not wait and start the + * io over again + */ +int fat_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2) +{ + int ret = 0; + if (!MSDOS_SB(sb)->options.flush) + return 0; + if (i1) + ret = writeback_inode(i1); + if (!ret && i2) + ret = writeback_inode(i2); + if (!ret && sb) { + struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; + ret = filemap_flush(mapping); + } + return ret; +} +EXPORT_SYMBOL_GPL(fat_flush_inodes); + static int __init init_fat_fs(void) { int err; diff --git a/fs/file.c b/fs/file.c index b3c6b82e6a9d..8e81775c5dc8 100644 --- a/fs/file.c +++ b/fs/file.c @@ -281,80 +281,70 @@ static struct fdtable *alloc_fdtable(int nr) out2: nfds = fdt->max_fdset; out: - if (new_openset) - free_fdset(new_openset, nfds); - if (new_execset) - free_fdset(new_execset, nfds); + free_fdset(new_openset, nfds); + free_fdset(new_execset, nfds); kfree(fdt); return NULL; } /* - * Expands the file descriptor table - it will allocate a new fdtable and - * both fd array and fdset. It is expected to be called with the - * files_lock held. + * Expand the file descriptor table. + * This function will allocate a new fdtable and both fd array and fdset, of + * the given size. + * Return <0 error code on error; 1 on successful completion. + * The files->file_lock should be held on entry, and will be held on exit. */ static int expand_fdtable(struct files_struct *files, int nr) __releases(files->file_lock) __acquires(files->file_lock) { - int error = 0; - struct fdtable *fdt; - struct fdtable *nfdt = NULL; + struct fdtable *new_fdt, *cur_fdt; spin_unlock(&files->file_lock); - nfdt = alloc_fdtable(nr); - if (!nfdt) { - error = -ENOMEM; - spin_lock(&files->file_lock); - goto out; - } - + new_fdt = alloc_fdtable(nr); spin_lock(&files->file_lock); - fdt = files_fdtable(files); + if (!new_fdt) + return -ENOMEM; /* - * Check again since another task may have expanded the - * fd table while we dropped the lock + * Check again since another task may have expanded the fd table while + * we dropped the lock */ - if (nr >= fdt->max_fds || nr >= fdt->max_fdset) { - copy_fdtable(nfdt, fdt); + cur_fdt = files_fdtable(files); + if (nr >= cur_fdt->max_fds || nr >= cur_fdt->max_fdset) { + /* Continue as planned */ + copy_fdtable(new_fdt, cur_fdt); + rcu_assign_pointer(files->fdt, new_fdt); + free_fdtable(cur_fdt); } else { - /* Somebody expanded while we dropped file_lock */ - spin_unlock(&files->file_lock); - __free_fdtable(nfdt); - spin_lock(&files->file_lock); - goto out; + /* Somebody else expanded, so undo our attempt */ + __free_fdtable(new_fdt); } - rcu_assign_pointer(files->fdt, nfdt); - free_fdtable(fdt); -out: - return error; + return 1; } /* * Expand files. - * Return <0 on error; 0 nothing done; 1 files expanded, we may have blocked. - * Should be called with the files->file_lock spinlock held for write. + * This function will expand the file structures, if the requested size exceeds + * the current capacity and there is room for expansion. + * Return <0 error code on error; 0 when nothing done; 1 when files were + * expanded and execution may have blocked. + * The files->file_lock should be held on entry, and will be held on exit. */ int expand_files(struct files_struct *files, int nr) { - int err, expand = 0; struct fdtable *fdt; fdt = files_fdtable(files); - if (nr >= fdt->max_fdset || nr >= fdt->max_fds) { - if (fdt->max_fdset >= NR_OPEN || - fdt->max_fds >= NR_OPEN || nr >= NR_OPEN) { - err = -EMFILE; - goto out; - } - expand = 1; - if ((err = expand_fdtable(files, nr))) - goto out; - } - err = expand; -out: - return err; + /* Do we need to expand? */ + if (nr < fdt->max_fdset && nr < fdt->max_fds) + return 0; + /* Can we expand? */ + if (fdt->max_fdset >= NR_OPEN || fdt->max_fds >= NR_OPEN || + nr >= NR_OPEN) + return -EMFILE; + + /* All good, so we try */ + return expand_fdtable(files, nr); } static void __devinit fdtable_defer_list_init(int cpu) diff --git a/fs/file_table.c b/fs/file_table.c index 0131ba06e1ee..bc35a40417d7 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -169,7 +169,7 @@ void fastcall __fput(struct file *file) if (file->f_op && file->f_op->release) file->f_op->release(inode, file); security_file_free(file); - if (unlikely(inode->i_cdev != NULL)) + if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL)) cdev_put(inode->i_cdev); fops_put(file->f_op); if (file->f_mode & FMODE_WRITE) diff --git a/fs/filesystems.c b/fs/filesystems.c index 9f1072836c8e..e3fa77c6ed56 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -69,8 +69,6 @@ int register_filesystem(struct file_system_type * fs) int res = 0; struct file_system_type ** p; - if (!fs) - return -EINVAL; if (fs->next) return -EBUSY; INIT_LIST_HEAD(&fs->fs_supers); diff --git a/fs/freevxfs/vxfs.h b/fs/freevxfs/vxfs.h index d35979a58743..c8a92652612a 100644 --- a/fs/freevxfs/vxfs.h +++ b/fs/freevxfs/vxfs.h @@ -252,7 +252,7 @@ enum { * Get filesystem private data from VFS inode. */ #define VXFS_INO(ip) \ - ((struct vxfs_inode_info *)(ip)->u.generic_ip) + ((struct vxfs_inode_info *)(ip)->i_private) /* * Get filesystem private data from VFS superblock. diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c index ca6a39714771..4786d51ad3bd 100644 --- a/fs/freevxfs/vxfs_inode.c +++ b/fs/freevxfs/vxfs_inode.c @@ -239,11 +239,10 @@ vxfs_iinit(struct inode *ip, struct vxfs_inode_info *vip) ip->i_ctime.tv_nsec = 0; ip->i_mtime.tv_nsec = 0; - ip->i_blksize = PAGE_SIZE; ip->i_blocks = vip->vii_blocks; ip->i_generation = vip->vii_gen; - ip->u.generic_ip = (void *)vip; + ip->i_private = vip; } @@ -338,5 +337,5 @@ vxfs_read_inode(struct inode *ip) void vxfs_clear_inode(struct inode *ip) { - kmem_cache_free(vxfs_inode_cachep, ip->u.generic_ip); + kmem_cache_free(vxfs_inode_cachep, ip->i_private); } diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index b74b791fc23b..ac28b0835ffc 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -260,12 +260,17 @@ static struct file_system_type vxfs_fs_type = { static int __init vxfs_init(void) { + int rv; + vxfs_inode_cachep = kmem_cache_create("vxfs_inode", sizeof(struct vxfs_inode_info), 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL, NULL); - if (vxfs_inode_cachep) - return register_filesystem(&vxfs_fs_type); - return -ENOMEM; + if (!vxfs_inode_cachep) + return -ENOMEM; + rv = register_filesystem(&vxfs_fs_type); + if (rv < 0) + kmem_cache_destroy(vxfs_inode_cachep); + return rv; } static void __exit diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 46fe60b2da23..79ec1f23d4d2 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -23,7 +23,7 @@ static struct fuse_conn *fuse_ctl_file_conn_get(struct file *file) { struct fuse_conn *fc; mutex_lock(&fuse_mutex); - fc = file->f_dentry->d_inode->u.generic_ip; + fc = file->f_dentry->d_inode->i_private; if (fc) fc = fuse_conn_get(fc); mutex_unlock(&fuse_mutex); @@ -98,7 +98,7 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, inode->i_op = iop; inode->i_fop = fop; inode->i_nlink = nlink; - inode->u.generic_ip = fc; + inode->i_private = fc; d_add(dentry, inode); return dentry; } @@ -150,7 +150,7 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc) for (i = fc->ctl_ndents - 1; i >= 0; i--) { struct dentry *dentry = fc->ctl_dentry[i]; - dentry->d_inode->u.generic_ip = NULL; + dentry->d_inode->i_private = NULL; d_drop(dentry); dput(dentry); } diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 1e2006caf158..4fc557c40cc0 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -212,6 +212,7 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) * Called with fc->lock, unlocks it */ static void request_end(struct fuse_conn *fc, struct fuse_req *req) + __releases(fc->lock) { void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; req->end = NULL; @@ -640,6 +641,7 @@ static void request_wait(struct fuse_conn *fc) */ static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_req *req, const struct iovec *iov, unsigned long nr_segs) + __releases(fc->lock) { struct fuse_copy_state cs; struct fuse_in_header ih; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 409ce6a7cca4..f85b2a282f13 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -776,7 +776,7 @@ static int fuse_permission(struct inode *inode, int mask, struct nameidata *nd) if ((mask & MAY_EXEC) && !S_ISDIR(mode) && !(mode & S_IXUGO)) return -EACCES; - if (nd && (nd->flags & LOOKUP_ACCESS)) + if (nd && (nd->flags & (LOOKUP_ACCESS | LOOKUP_CHDIR))) return fuse_access(inode, mask); return 0; } diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 7d25092262ae..7d0a9aee01f2 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -118,7 +118,6 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr) inode->i_uid = attr->uid; inode->i_gid = attr->gid; i_size_write(inode, attr->size); - inode->i_blksize = PAGE_CACHE_SIZE; inode->i_blocks = attr->blocks; inode->i_atime.tv_sec = attr->atime; inode->i_atime.tv_nsec = attr->atimensec; @@ -252,6 +251,7 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) memset(&outarg, 0, sizeof(outarg)); req->in.numargs = 0; req->in.h.opcode = FUSE_STATFS; + req->in.h.nodeid = get_node_id(dentry->d_inode); req->out.numargs = 1; req->out.args[0].size = fc->minor < 4 ? FUSE_COMPAT_STATFS_SIZE : sizeof(outarg); diff --git a/fs/generic_acl.c b/fs/generic_acl.c new file mode 100644 index 000000000000..9ccb78947171 --- /dev/null +++ b/fs/generic_acl.c @@ -0,0 +1,197 @@ +/* + * fs/generic_acl.c + * + * (C) 2005 Andreas Gruenbacher <agruen@suse.de> + * + * This file is released under the GPL. + */ + +#include <linux/sched.h> +#include <linux/fs.h> +#include <linux/generic_acl.h> + +/** + * generic_acl_list - Generic xattr_handler->list() operation + * @ops: Filesystem specific getacl and setacl callbacks + */ +size_t +generic_acl_list(struct inode *inode, struct generic_acl_operations *ops, + int type, char *list, size_t list_size) +{ + struct posix_acl *acl; + const char *name; + size_t size; + + acl = ops->getacl(inode, type); + if (!acl) + return 0; + posix_acl_release(acl); + + switch(type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + break; + + case ACL_TYPE_DEFAULT: + name = POSIX_ACL_XATTR_DEFAULT; + break; + + default: + return 0; + } + size = strlen(name) + 1; + if (list && size <= list_size) + memcpy(list, name, size); + return size; +} + +/** + * generic_acl_get - Generic xattr_handler->get() operation + * @ops: Filesystem specific getacl and setacl callbacks + */ +int +generic_acl_get(struct inode *inode, struct generic_acl_operations *ops, + int type, void *buffer, size_t size) +{ + struct posix_acl *acl; + int error; + + acl = ops->getacl(inode, type); + if (!acl) + return -ENODATA; + error = posix_acl_to_xattr(acl, buffer, size); + posix_acl_release(acl); + + return error; +} + +/** + * generic_acl_set - Generic xattr_handler->set() operation + * @ops: Filesystem specific getacl and setacl callbacks + */ +int +generic_acl_set(struct inode *inode, struct generic_acl_operations *ops, + int type, const void *value, size_t size) +{ + struct posix_acl *acl = NULL; + int error; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + if (current->fsuid != inode->i_uid && !capable(CAP_FOWNER)) + return -EPERM; + if (value) { + acl = posix_acl_from_xattr(value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + } + if (acl) { + mode_t mode; + + error = posix_acl_valid(acl); + if (error) + goto failed; + switch(type) { + case ACL_TYPE_ACCESS: + mode = inode->i_mode; + error = posix_acl_equiv_mode(acl, &mode); + if (error < 0) + goto failed; + inode->i_mode = mode; + if (error == 0) { + posix_acl_release(acl); + acl = NULL; + } + break; + + case ACL_TYPE_DEFAULT: + if (!S_ISDIR(inode->i_mode)) { + error = -EINVAL; + goto failed; + } + break; + } + } + ops->setacl(inode, type, acl); + error = 0; +failed: + posix_acl_release(acl); + return error; +} + +/** + * generic_acl_init - Take care of acl inheritance at @inode create time + * @ops: Filesystem specific getacl and setacl callbacks + * + * Files created inside a directory with a default ACL inherit the + * directory's default ACL. + */ +int +generic_acl_init(struct inode *inode, struct inode *dir, + struct generic_acl_operations *ops) +{ + struct posix_acl *acl = NULL; + mode_t mode = inode->i_mode; + int error; + + inode->i_mode = mode & ~current->fs->umask; + if (!S_ISLNK(inode->i_mode)) + acl = ops->getacl(dir, ACL_TYPE_DEFAULT); + if (acl) { + struct posix_acl *clone; + + if (S_ISDIR(inode->i_mode)) { + clone = posix_acl_clone(acl, GFP_KERNEL); + error = -ENOMEM; + if (!clone) + goto cleanup; + ops->setacl(inode, ACL_TYPE_DEFAULT, clone); + posix_acl_release(clone); + } + clone = posix_acl_clone(acl, GFP_KERNEL); + error = -ENOMEM; + if (!clone) + goto cleanup; + error = posix_acl_create_masq(clone, &mode); + if (error >= 0) { + inode->i_mode = mode; + if (error > 0) + ops->setacl(inode, ACL_TYPE_ACCESS, clone); + } + posix_acl_release(clone); + } + error = 0; + +cleanup: + posix_acl_release(acl); + return error; +} + +/** + * generic_acl_chmod - change the access acl of @inode upon chmod() + * @ops: FIlesystem specific getacl and setacl callbacks + * + * A chmod also changes the permissions of the owner, group/mask, and + * other ACL entries. + */ +int +generic_acl_chmod(struct inode *inode, struct generic_acl_operations *ops) +{ + struct posix_acl *acl, *clone; + int error = 0; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + acl = ops->getacl(inode, ACL_TYPE_ACCESS); + if (acl) { + clone = posix_acl_clone(acl, GFP_KERNEL); + posix_acl_release(acl); + if (!clone) + return -ENOMEM; + error = posix_acl_chmod_masq(clone, inode->i_mode); + if (!error) + ops->setacl(inode, ACL_TYPE_ACCESS, clone); + posix_acl_release(clone); + } + return error; +} diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c index 13231dd5ce66..0d200068d0af 100644 --- a/fs/hfs/bnode.c +++ b/fs/hfs/bnode.c @@ -249,10 +249,9 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) sb = tree->inode->i_sb; size = sizeof(struct hfs_bnode) + tree->pages_per_bnode * sizeof(struct page *); - node = kmalloc(size, GFP_KERNEL); + node = kzalloc(size, GFP_KERNEL); if (!node) return NULL; - memset(node, 0, size); node->tree = tree; node->this = cnid; set_bit(HFS_BNODE_NEW, &node->flags); diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index 400357994319..5fd0ed71f923 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -21,10 +21,9 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke struct page *page; unsigned int size; - tree = kmalloc(sizeof(*tree), GFP_KERNEL); + tree = kzalloc(sizeof(*tree), GFP_KERNEL); if (!tree) return NULL; - memset(tree, 0, sizeof(*tree)); init_MUTEX(&tree->tree_lock); spin_lock_init(&tree->hash_lock); diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 315cf44a90b2..d05641c35fc9 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -154,7 +154,6 @@ struct inode *hfs_new_inode(struct inode *dir, struct qstr *name, int mode) inode->i_gid = current->fsgid; inode->i_nlink = 1; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; - inode->i_blksize = HFS_SB(sb)->alloc_blksz; HFS_I(inode)->flags = 0; HFS_I(inode)->rsrc_inode = NULL; HFS_I(inode)->fs_blocks = 0; @@ -284,7 +283,6 @@ static int hfs_read_inode(struct inode *inode, void *data) inode->i_uid = hsb->s_uid; inode->i_gid = hsb->s_gid; inode->i_nlink = 1; - inode->i_blksize = HFS_SB(inode->i_sb)->alloc_blksz; if (idata->key) HFS_I(inode)->cat_key = *idata->key; diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 34937ee83ab1..d43b4fcc8ad3 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -356,11 +356,10 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent) struct inode *root_inode; int res; - sbi = kmalloc(sizeof(struct hfs_sb_info), GFP_KERNEL); + sbi = kzalloc(sizeof(struct hfs_sb_info), GFP_KERNEL); if (!sbi) return -ENOMEM; sb->s_fs_info = sbi; - memset(sbi, 0, sizeof(struct hfs_sb_info)); INIT_HLIST_HEAD(&sbi->rsrc_inodes); res = -EINVAL; @@ -455,8 +454,7 @@ static int __init init_hfs_fs(void) static void __exit exit_hfs_fs(void) { unregister_filesystem(&hfs_fs_type); - if (kmem_cache_destroy(hfs_inode_cachep)) - printk(KERN_ERR "hfs_inode_cache: not all structures were freed\n"); + kmem_cache_destroy(hfs_inode_cachep); } module_init(init_hfs_fs) diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index 77bf434da679..29da6574ba77 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -409,10 +409,9 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) sb = tree->inode->i_sb; size = sizeof(struct hfs_bnode) + tree->pages_per_bnode * sizeof(struct page *); - node = kmalloc(size, GFP_KERNEL); + node = kzalloc(size, GFP_KERNEL); if (!node) return NULL; - memset(node, 0, size); node->tree = tree; node->this = cnid; set_bit(HFS_BNODE_NEW, &node->flags); diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c index cfc852fdd1b5..a9b9e872e29a 100644 --- a/fs/hfsplus/btree.c +++ b/fs/hfsplus/btree.c @@ -24,10 +24,9 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) struct page *page; unsigned int size; - tree = kmalloc(sizeof(*tree), GFP_KERNEL); + tree = kzalloc(sizeof(*tree), GFP_KERNEL); if (!tree) return NULL; - memset(tree, 0, sizeof(*tree)); init_MUTEX(&tree->tree_lock); spin_lock_init(&tree->hash_lock); diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 924ecdef8091..0eb1a6092668 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -304,7 +304,6 @@ struct inode *hfsplus_new_inode(struct super_block *sb, int mode) inode->i_gid = current->fsgid; inode->i_nlink = 1; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; - inode->i_blksize = HFSPLUS_SB(sb).alloc_blksz; INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); init_MUTEX(&HFSPLUS_I(inode).extents_lock); atomic_set(&HFSPLUS_I(inode).opencnt, 0); @@ -407,7 +406,6 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) type = hfs_bnode_read_u16(fd->bnode, fd->entryoffset); HFSPLUS_I(inode).dev = 0; - inode->i_blksize = HFSPLUS_SB(inode->i_sb).alloc_blksz; if (type == HFSPLUS_FOLDER) { struct hfsplus_cat_folder *folder = &entry.folder; diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index d279d5924f28..194eede52fa4 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -493,8 +493,7 @@ static int __init init_hfsplus_fs(void) static void __exit exit_hfsplus_fs(void) { unregister_filesystem(&hfsplus_fs_type); - if (kmem_cache_destroy(hfsplus_inode_cachep)) - printk(KERN_ERR "hfsplus_inode_cache: not all structures were freed\n"); + kmem_cache_destroy(hfsplus_inode_cachep); } module_init(init_hfsplus_fs) diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index b82e3d9c8790..322e876c35ed 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -156,7 +156,6 @@ static int read_name(struct inode *ino, char *name) ino->i_mode = i_mode; ino->i_nlink = i_nlink; ino->i_size = i_size; - ino->i_blksize = i_blksize; ino->i_blocks = i_blocks; return(0); } diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c index 2807aa833e62..b52b7381d10f 100644 --- a/fs/hpfs/buffer.c +++ b/fs/hpfs/buffer.c @@ -76,7 +76,7 @@ void *hpfs_map_4sectors(struct super_block *s, unsigned secno, struct quad_buffe return NULL; } - qbh->data = data = (char *)kmalloc(2048, GFP_NOFS); + qbh->data = data = kmalloc(2048, GFP_NOFS); if (!data) { printk("HPFS: hpfs_map_4sectors: out of memory\n"); goto bail; diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index f687d54ed442..32ab51e42b96 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -12,7 +12,6 @@ #include <linux/mutex.h> #include <linux/pagemap.h> #include <linux/buffer_head.h> -#include <linux/hpfs_fs.h> #include <linux/slab.h> #include <linux/smp_lock.h> diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index 56f2c338c4d9..bcf6ee36e065 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -17,7 +17,6 @@ void hpfs_init_inode(struct inode *i) i->i_gid = hpfs_sb(sb)->sb_gid; i->i_mode = hpfs_sb(sb)->sb_mode; hpfs_inode->i_conv = hpfs_sb(sb)->sb_conv; - i->i_blksize = 512; i->i_size = -1; i->i_blocks = -1; diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index f798480a363f..450b5e0b4785 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -11,6 +11,7 @@ #include <linux/parser.h> #include <linux/init.h> #include <linux/statfs.h> +#include <linux/magic.h> /* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */ @@ -202,8 +203,7 @@ static int init_inodecache(void) static void destroy_inodecache(void) { - if (kmem_cache_destroy(hpfs_inode_cachep)) - printk(KERN_INFO "hpfs_inode_cache: not all structures were freed\n"); + kmem_cache_destroy(hpfs_inode_cachep); } /* @@ -461,11 +461,10 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) int o; - sbi = kmalloc(sizeof(*sbi), GFP_KERNEL); + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) return -ENOMEM; s->s_fs_info = sbi; - memset(sbi, 0, sizeof(*sbi)); sbi->sb_bmp_dir = NULL; sbi->sb_cp_table = NULL; diff --git a/fs/hppfs/hppfs_kern.c b/fs/hppfs/hppfs_kern.c index 3a9bdf58166f..dcb6d2e988b8 100644 --- a/fs/hppfs/hppfs_kern.c +++ b/fs/hppfs/hppfs_kern.c @@ -152,7 +152,6 @@ static void hppfs_read_inode(struct inode *ino) ino->i_mode = proc_ino->i_mode; ino->i_nlink = proc_ino->i_nlink; ino->i_size = proc_ino->i_size; - ino->i_blksize = proc_ino->i_blksize; ino->i_blocks = proc_ino->i_blocks; } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index c3920c96dadf..f5b8f329aca6 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -229,7 +229,7 @@ static void hugetlbfs_delete_inode(struct inode *inode) clear_inode(inode); } -static void hugetlbfs_forget_inode(struct inode *inode) +static void hugetlbfs_forget_inode(struct inode *inode) __releases(inode_lock) { struct super_block *sb = inode->i_sb; @@ -357,7 +357,6 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid, inode->i_mode = mode; inode->i_uid = uid; inode->i_gid = gid; - inode->i_blksize = HPAGE_SIZE; inode->i_blocks = 0; inode->i_mapping->a_ops = &hugetlbfs_aops; inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; diff --git a/fs/inode.c b/fs/inode.c index 0bf9f0444a96..abf77471e6c4 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -133,7 +133,6 @@ static struct inode *alloc_inode(struct super_block *sb) inode->i_bdev = NULL; inode->i_cdev = NULL; inode->i_rdev = 0; - inode->i_security = NULL; inode->dirtied_when = 0; if (security_inode_alloc(inode)) { if (inode->i_sb->s_op->destroy_inode) @@ -163,7 +162,7 @@ static struct inode *alloc_inode(struct super_block *sb) bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; mapping->backing_dev_info = bdi; } - memset(&inode->u, 0, sizeof(inode->u)); + inode->i_private = 0; inode->i_mapping = mapping; } return inode; @@ -254,9 +253,9 @@ void clear_inode(struct inode *inode) DQUOT_DROP(inode); if (inode->i_sb && inode->i_sb->s_op->clear_inode) inode->i_sb->s_op->clear_inode(inode); - if (inode->i_bdev) + if (S_ISBLK(inode->i_mode) && inode->i_bdev) bd_forget(inode); - if (inode->i_cdev) + if (S_ISCHR(inode->i_mode) && inode->i_cdev) cd_forget(inode); inode->i_state = I_CLEAR; } diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 14391361c886..c34b862cdbf2 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -96,9 +96,7 @@ static int init_inodecache(void) static void destroy_inodecache(void) { - if (kmem_cache_destroy(isofs_inode_cachep)) - printk(KERN_INFO "iso_inode_cache: not all structures were " - "freed\n"); + kmem_cache_destroy(isofs_inode_cachep); } static int isofs_remount(struct super_block *sb, int *flags, char *data) @@ -557,11 +555,10 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent) struct iso9660_options opt; struct isofs_sb_info * sbi; - sbi = kmalloc(sizeof(*sbi), GFP_KERNEL); + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) return -ENOMEM; s->s_fs_info = sbi; - memset(sbi, 0, sizeof(*sbi)); if (!parse_options((char *)data, &opt)) goto out_freesbi; @@ -963,30 +960,30 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s, goto abort; } - if (nextblk) { - while (b_off >= (offset + sect_size)) { - struct inode *ninode; - - offset += sect_size; - if (nextblk == 0) - goto abort; - ninode = isofs_iget(inode->i_sb, nextblk, nextoff); - if (!ninode) - goto abort; - firstext = ISOFS_I(ninode)->i_first_extent; - sect_size = ISOFS_I(ninode)->i_section_size >> ISOFS_BUFFER_BITS(ninode); - nextblk = ISOFS_I(ninode)->i_next_section_block; - nextoff = ISOFS_I(ninode)->i_next_section_offset; - iput(ninode); - - if (++section > 100) { - printk("isofs_get_blocks: More than 100 file sections ?!?, aborting...\n"); - printk("isofs_get_blocks: block=%ld firstext=%u sect_size=%u " - "nextblk=%lu nextoff=%lu\n", - iblock, firstext, (unsigned) sect_size, - nextblk, nextoff); - goto abort; - } + /* On the last section, nextblk == 0, section size is likely to + * exceed sect_size by a partial block, and access beyond the + * end of the file will reach beyond the section size, too. + */ + while (nextblk && (b_off >= (offset + sect_size))) { + struct inode *ninode; + + offset += sect_size; + ninode = isofs_iget(inode->i_sb, nextblk, nextoff); + if (!ninode) + goto abort; + firstext = ISOFS_I(ninode)->i_first_extent; + sect_size = ISOFS_I(ninode)->i_section_size >> ISOFS_BUFFER_BITS(ninode); + nextblk = ISOFS_I(ninode)->i_next_section_block; + nextoff = ISOFS_I(ninode)->i_next_section_offset; + iput(ninode); + + if (++section > 100) { + printk("isofs_get_blocks: More than 100 file sections ?!?, aborting...\n"); + printk("isofs_get_blocks: block=%ld firstext=%u sect_size=%u " + "nextblk=%lu nextoff=%lu\n", + iblock, firstext, (unsigned) sect_size, + nextblk, nextoff); + goto abort; } } @@ -1238,7 +1235,7 @@ static void isofs_read_inode(struct inode *inode) } inode->i_uid = sbi->s_uid; inode->i_gid = sbi->s_gid; - inode->i_blocks = inode->i_blksize = 0; + inode->i_blocks = 0; ei->i_format_parm[0] = 0; ei->i_format_parm[1] = 0; @@ -1294,7 +1291,6 @@ static void isofs_read_inode(struct inode *inode) isonum_711 (de->ext_attr_length)); /* Set the number of blocks for stat() - should be done before RR */ - inode->i_blksize = PAGE_CACHE_SIZE; /* For stat() only */ inode->i_blocks = (inode->i_size + 511) >> 9; /* diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c index 47678a26c13b..0208cc7ac5d0 100644 --- a/fs/jbd/checkpoint.c +++ b/fs/jbd/checkpoint.c @@ -1,6 +1,6 @@ /* * linux/fs/checkpoint.c - * + * * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 * * Copyright 1999 Red Hat Software --- All Rights Reserved @@ -9,8 +9,8 @@ * the terms of the GNU General Public License, version 2, or at your * option, any later version, incorporated herein by reference. * - * Checkpoint routines for the generic filesystem journaling code. - * Part of the ext2fs journaling system. + * Checkpoint routines for the generic filesystem journaling code. + * Part of the ext2fs journaling system. * * Checkpointing is the process of ensuring that a section of the log is * committed fully to disk, so that that portion of the log can be @@ -145,6 +145,7 @@ void __log_wait_for_space(journal_t *journal) * jbd_unlock_bh_state(). */ static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh) + __releases(journal->j_list_lock) { get_bh(bh); spin_unlock(&journal->j_list_lock); @@ -225,7 +226,7 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) * Try to flush one buffer from the checkpoint list to disk. * * Return 1 if something happened which requires us to abort the current - * scan of the checkpoint list. + * scan of the checkpoint list. * * Called with j_list_lock held and drops it if 1 is returned * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it @@ -269,7 +270,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, * possibly block, while still holding the journal lock. * We cannot afford to let the transaction logic start * messing around with this buffer before we write it to - * disk, as that would break recoverability. + * disk, as that would break recoverability. */ BUFFER_TRACE(bh, "queue"); get_bh(bh); @@ -292,7 +293,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, * Perform an actual checkpoint. We take the first transaction on the * list of transactions to be checkpointed and send all its buffers * to disk. We submit larger chunks of data at once. - * + * * The journal should be locked before calling this function. */ int log_do_checkpoint(journal_t *journal) @@ -303,10 +304,10 @@ int log_do_checkpoint(journal_t *journal) jbd_debug(1, "Start checkpoint\n"); - /* + /* * First thing: if there are any transactions in the log which * don't need checkpointing, just eliminate them from the - * journal straight away. + * journal straight away. */ result = cleanup_journal_tail(journal); jbd_debug(1, "cleanup_journal_tail returned %d\n", result); @@ -384,9 +385,9 @@ out: * we have already got rid of any since the last update of the log tail * in the journal superblock. If so, we can instantly roll the * superblock forward to remove those transactions from the log. - * + * * Return <0 on error, 0 on success, 1 if there was nothing to clean up. - * + * * Called with the journal lock held. * * This is the only part of the journaling code which really needs to be @@ -403,8 +404,8 @@ int cleanup_journal_tail(journal_t *journal) unsigned long blocknr, freed; /* OK, work out the oldest transaction remaining in the log, and - * the log block it starts at. - * + * the log block it starts at. + * * If the log is now empty, we need to work out which is the * next transaction ID we will write, and where it will * start. */ @@ -479,7 +480,7 @@ static int journal_clean_one_cp_list(struct journal_head *jh, int *released) if (!jh) return 0; - last_jh = jh->b_cpprev; + last_jh = jh->b_cpprev; do { jh = next_jh; next_jh = jh->b_cpnext; @@ -557,7 +558,7 @@ out: return ret; } -/* +/* * journal_remove_checkpoint: called after a buffer has been committed * to disk (either by being write-back flushed to disk, or being * committed to the log). @@ -635,7 +636,7 @@ out: * Called with the journal locked. * Called with j_list_lock held. */ -void __journal_insert_checkpoint(struct journal_head *jh, +void __journal_insert_checkpoint(struct journal_head *jh, transaction_t *transaction) { JBUFFER_TRACE(jh, "entry"); @@ -657,7 +658,7 @@ void __journal_insert_checkpoint(struct journal_head *jh, /* * We've finished with this transaction structure: adios... - * + * * The transaction must have no links except for the checkpoint by this * point. * diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 42da60784311..32a8caf0c41e 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -160,6 +160,117 @@ static int journal_write_commit_record(journal_t *journal, return (ret == -EIO); } +static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) +{ + int i; + + for (i = 0; i < bufs; i++) { + wbuf[i]->b_end_io = end_buffer_write_sync; + /* We use-up our safety reference in submit_bh() */ + submit_bh(WRITE, wbuf[i]); + } +} + +/* + * Submit all the data buffers to disk + */ +static void journal_submit_data_buffers(journal_t *journal, + transaction_t *commit_transaction) +{ + struct journal_head *jh; + struct buffer_head *bh; + int locked; + int bufs = 0; + struct buffer_head **wbuf = journal->j_wbuf; + + /* + * Whenever we unlock the journal and sleep, things can get added + * onto ->t_sync_datalist, so we have to keep looping back to + * write_out_data until we *know* that the list is empty. + * + * Cleanup any flushed data buffers from the data list. Even in + * abort mode, we want to flush this out as soon as possible. + */ +write_out_data: + cond_resched(); + spin_lock(&journal->j_list_lock); + + while (commit_transaction->t_sync_datalist) { + jh = commit_transaction->t_sync_datalist; + bh = jh2bh(jh); + locked = 0; + + /* Get reference just to make sure buffer does not disappear + * when we are forced to drop various locks */ + get_bh(bh); + /* If the buffer is dirty, we need to submit IO and hence + * we need the buffer lock. We try to lock the buffer without + * blocking. If we fail, we need to drop j_list_lock and do + * blocking lock_buffer(). + */ + if (buffer_dirty(bh)) { + if (test_set_buffer_locked(bh)) { + BUFFER_TRACE(bh, "needs blocking lock"); + spin_unlock(&journal->j_list_lock); + /* Write out all data to prevent deadlocks */ + journal_do_submit_data(wbuf, bufs); + bufs = 0; + lock_buffer(bh); + spin_lock(&journal->j_list_lock); + } + locked = 1; + } + /* We have to get bh_state lock. Again out of order, sigh. */ + if (!inverted_lock(journal, bh)) { + jbd_lock_bh_state(bh); + spin_lock(&journal->j_list_lock); + } + /* Someone already cleaned up the buffer? */ + if (!buffer_jbd(bh) + || jh->b_transaction != commit_transaction + || jh->b_jlist != BJ_SyncData) { + jbd_unlock_bh_state(bh); + if (locked) + unlock_buffer(bh); + BUFFER_TRACE(bh, "already cleaned up"); + put_bh(bh); + continue; + } + if (locked && test_clear_buffer_dirty(bh)) { + BUFFER_TRACE(bh, "needs writeout, adding to array"); + wbuf[bufs++] = bh; + __journal_file_buffer(jh, commit_transaction, + BJ_Locked); + jbd_unlock_bh_state(bh); + if (bufs == journal->j_wbufsize) { + spin_unlock(&journal->j_list_lock); + journal_do_submit_data(wbuf, bufs); + bufs = 0; + goto write_out_data; + } + } + else { + BUFFER_TRACE(bh, "writeout complete: unfile"); + __journal_unfile_buffer(jh); + jbd_unlock_bh_state(bh); + if (locked) + unlock_buffer(bh); + journal_remove_journal_head(bh); + /* Once for our safety reference, once for + * journal_remove_journal_head() */ + put_bh(bh); + put_bh(bh); + } + + if (lock_need_resched(&journal->j_list_lock)) { + spin_unlock(&journal->j_list_lock); + goto write_out_data; + } + } + spin_unlock(&journal->j_list_lock); + journal_do_submit_data(wbuf, bufs); +} + /* * journal_commit_transaction * @@ -313,80 +424,13 @@ void journal_commit_transaction(journal_t *journal) * Now start flushing things to disk, in the order they appear * on the transaction lists. Data blocks go first. */ - err = 0; - /* - * Whenever we unlock the journal and sleep, things can get added - * onto ->t_sync_datalist, so we have to keep looping back to - * write_out_data until we *know* that the list is empty. - */ - bufs = 0; - /* - * Cleanup any flushed data buffers from the data list. Even in - * abort mode, we want to flush this out as soon as possible. - */ -write_out_data: - cond_resched(); - spin_lock(&journal->j_list_lock); - - while (commit_transaction->t_sync_datalist) { - struct buffer_head *bh; - - jh = commit_transaction->t_sync_datalist; - commit_transaction->t_sync_datalist = jh->b_tnext; - bh = jh2bh(jh); - if (buffer_locked(bh)) { - BUFFER_TRACE(bh, "locked"); - if (!inverted_lock(journal, bh)) - goto write_out_data; - __journal_temp_unlink_buffer(jh); - __journal_file_buffer(jh, commit_transaction, - BJ_Locked); - jbd_unlock_bh_state(bh); - if (lock_need_resched(&journal->j_list_lock)) { - spin_unlock(&journal->j_list_lock); - goto write_out_data; - } - } else { - if (buffer_dirty(bh)) { - BUFFER_TRACE(bh, "start journal writeout"); - get_bh(bh); - wbuf[bufs++] = bh; - if (bufs == journal->j_wbufsize) { - jbd_debug(2, "submit %d writes\n", - bufs); - spin_unlock(&journal->j_list_lock); - ll_rw_block(SWRITE, bufs, wbuf); - journal_brelse_array(wbuf, bufs); - bufs = 0; - goto write_out_data; - } - } else { - BUFFER_TRACE(bh, "writeout complete: unfile"); - if (!inverted_lock(journal, bh)) - goto write_out_data; - __journal_unfile_buffer(jh); - jbd_unlock_bh_state(bh); - journal_remove_journal_head(bh); - put_bh(bh); - if (lock_need_resched(&journal->j_list_lock)) { - spin_unlock(&journal->j_list_lock); - goto write_out_data; - } - } - } - } - - if (bufs) { - spin_unlock(&journal->j_list_lock); - ll_rw_block(SWRITE, bufs, wbuf); - journal_brelse_array(wbuf, bufs); - spin_lock(&journal->j_list_lock); - } + journal_submit_data_buffers(journal, commit_transaction); /* * Wait for all previously submitted IO to complete. */ + spin_lock(&journal->j_list_lock); while (commit_transaction->t_locked_list) { struct buffer_head *bh; diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index f66724ce443a..7af6099c911c 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -181,7 +181,7 @@ loop: transaction->t_expires)) should_sleep = 0; if (journal->j_flags & JFS_UNMOUNT) - should_sleep = 0; + should_sleep = 0; if (should_sleep) { spin_unlock(&journal->j_state_lock); schedule(); @@ -271,7 +271,7 @@ static void journal_kill_thread(journal_t *journal) int journal_write_metadata_buffer(transaction_t *transaction, struct journal_head *jh_in, struct journal_head **jh_out, - int blocknr) + unsigned long blocknr) { int need_copy_out = 0; int done_copy_out = 0; @@ -578,7 +578,7 @@ int journal_next_log_block(journal_t *journal, unsigned long *retp) * this is a no-op. If needed, we can use j_blk_offset - everything is * ready. */ -int journal_bmap(journal_t *journal, unsigned long blocknr, +int journal_bmap(journal_t *journal, unsigned long blocknr, unsigned long *retp) { int err = 0; @@ -696,13 +696,13 @@ fail: * @bdev: Block device on which to create the journal * @fs_dev: Device which hold journalled filesystem for this journal. * @start: Block nr Start of journal. - * @len: Lenght of the journal in blocks. + * @len: Length of the journal in blocks. * @blocksize: blocksize of journalling device * @returns: a newly created journal_t * - * + * * journal_init_dev creates a journal which maps a fixed contiguous * range of blocks on an arbitrary block device. - * + * */ journal_t * journal_init_dev(struct block_device *bdev, struct block_device *fs_dev, @@ -715,18 +715,8 @@ journal_t * journal_init_dev(struct block_device *bdev, if (!journal) return NULL; - journal->j_dev = bdev; - journal->j_fs_dev = fs_dev; - journal->j_blk_offset = start; - journal->j_maxlen = len; - journal->j_blocksize = blocksize; - - bh = __getblk(journal->j_dev, start, journal->j_blocksize); - J_ASSERT(bh != NULL); - journal->j_sb_buffer = bh; - journal->j_superblock = (journal_superblock_t *)bh->b_data; - /* journal descriptor can store up to n blocks -bzzz */ + journal->j_blocksize = blocksize; n = journal->j_blocksize / sizeof(journal_block_tag_t); journal->j_wbufsize = n; journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); @@ -736,14 +726,23 @@ journal_t * journal_init_dev(struct block_device *bdev, kfree(journal); journal = NULL; } + journal->j_dev = bdev; + journal->j_fs_dev = fs_dev; + journal->j_blk_offset = start; + journal->j_maxlen = len; + + bh = __getblk(journal->j_dev, start, journal->j_blocksize); + J_ASSERT(bh != NULL); + journal->j_sb_buffer = bh; + journal->j_superblock = (journal_superblock_t *)bh->b_data; return journal; } - -/** + +/** * journal_t * journal_init_inode () - creates a journal which maps to a inode. * @inode: An inode to create the journal in - * + * * journal_init_inode creates a journal which maps an on-disk inode as * the journal. The inode must exist already, must support bmap() and * must have all data blocks preallocated. @@ -763,7 +762,7 @@ journal_t * journal_init_inode (struct inode *inode) journal->j_inode = inode; jbd_debug(1, "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n", - journal, inode->i_sb->s_id, inode->i_ino, + journal, inode->i_sb->s_id, inode->i_ino, (long long) inode->i_size, inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize); @@ -798,10 +797,10 @@ journal_t * journal_init_inode (struct inode *inode) return journal; } -/* +/* * If the journal init or create aborts, we need to mark the journal * superblock as being NULL to prevent the journal destroy from writing - * back a bogus superblock. + * back a bogus superblock. */ static void journal_fail_superblock (journal_t *journal) { @@ -820,7 +819,7 @@ static void journal_fail_superblock (journal_t *journal) static int journal_reset(journal_t *journal) { journal_superblock_t *sb = journal->j_superblock; - unsigned int first, last; + unsigned long first, last; first = be32_to_cpu(sb->s_first); last = be32_to_cpu(sb->s_maxlen); @@ -844,13 +843,13 @@ static int journal_reset(journal_t *journal) return 0; } -/** +/** * int journal_create() - Initialise the new journal file * @journal: Journal to create. This structure must have been initialised - * + * * Given a journal_t structure which tells us which disk blocks we can * use, create a new journal superblock and initialise all of the - * journal fields from scratch. + * journal fields from scratch. **/ int journal_create(journal_t *journal) { @@ -915,7 +914,7 @@ int journal_create(journal_t *journal) return journal_reset(journal); } -/** +/** * void journal_update_superblock() - Update journal sb on disk. * @journal: The journal to update. * @wait: Set to '0' if you don't want to wait for IO completion. @@ -939,7 +938,7 @@ void journal_update_superblock(journal_t *journal, int wait) journal->j_transaction_sequence) { jbd_debug(1,"JBD: Skipping superblock update on recovered sb " "(start %ld, seq %d, errno %d)\n", - journal->j_tail, journal->j_tail_sequence, + journal->j_tail, journal->j_tail_sequence, journal->j_errno); goto out; } @@ -1062,7 +1061,7 @@ static int load_superblock(journal_t *journal) /** * int journal_load() - Read journal from disk. * @journal: Journal to act on. - * + * * Given a journal_t structure which tells us which disk blocks contain * a journal, read the journal from disk to initialise the in-memory * structures. @@ -1094,7 +1093,7 @@ int journal_load(journal_t *journal) /* * Create a slab for this blocksize */ - err = journal_create_jbd_slab(cpu_to_be32(sb->s_blocksize)); + err = journal_create_jbd_slab(be32_to_cpu(sb->s_blocksize)); if (err) return err; @@ -1172,9 +1171,9 @@ void journal_destroy(journal_t *journal) * @compat: bitmask of compatible features * @ro: bitmask of features that force read-only mount * @incompat: bitmask of incompatible features - * + * * Check whether the journal uses all of a given set of - * features. Return true (non-zero) if it does. + * features. Return true (non-zero) if it does. **/ int journal_check_used_features (journal_t *journal, unsigned long compat, @@ -1203,7 +1202,7 @@ int journal_check_used_features (journal_t *journal, unsigned long compat, * @compat: bitmask of compatible features * @ro: bitmask of features that force read-only mount * @incompat: bitmask of incompatible features - * + * * Check whether the journaling code supports the use of * all of a given set of features on this journal. Return true * (non-zero) if it can. */ @@ -1241,7 +1240,7 @@ int journal_check_available_features (journal_t *journal, unsigned long compat, * @incompat: bitmask of incompatible features * * Mark a given journal feature as present on the - * superblock. Returns true if the requested features could be set. + * superblock. Returns true if the requested features could be set. * */ @@ -1327,7 +1326,7 @@ static int journal_convert_superblock_v1(journal_t *journal, /** * int journal_flush () - Flush journal * @journal: Journal to act on. - * + * * Flush all data for a given journal to disk and empty the journal. * Filesystems can use this when remounting readonly to ensure that * recovery does not need to happen on remount. @@ -1394,7 +1393,7 @@ int journal_flush(journal_t *journal) * int journal_wipe() - Wipe journal contents * @journal: Journal to act on. * @write: flag (see below) - * + * * Wipe out all of the contents of a journal, safely. This will produce * a warning if the journal contains any valid recovery information. * Must be called between journal_init_*() and journal_load(). @@ -1449,7 +1448,7 @@ static const char *journal_dev_name(journal_t *journal, char *buffer) /* * Journal abort has very specific semantics, which we describe - * for journal abort. + * for journal abort. * * Two internal function, which provide abort to te jbd layer * itself are here. @@ -1504,7 +1503,7 @@ static void __journal_abort_soft (journal_t *journal, int errno) * Perform a complete, immediate shutdown of the ENTIRE * journal (not of a single transaction). This operation cannot be * undone without closing and reopening the journal. - * + * * The journal_abort function is intended to support higher level error * recovery mechanisms such as the ext2/ext3 remount-readonly error * mode. @@ -1538,7 +1537,7 @@ static void __journal_abort_soft (journal_t *journal, int errno) * supply an errno; a null errno implies that absolutely no further * writes are done to the journal (unless there are any already in * progress). - * + * */ void journal_abort(journal_t *journal, int errno) @@ -1546,7 +1545,7 @@ void journal_abort(journal_t *journal, int errno) __journal_abort_soft(journal, errno); } -/** +/** * int journal_errno () - returns the journal's error state. * @journal: journal to examine. * @@ -1570,7 +1569,7 @@ int journal_errno(journal_t *journal) return err; } -/** +/** * int journal_clear_err () - clears the journal's error state * @journal: journal to act on. * @@ -1590,7 +1589,7 @@ int journal_clear_err(journal_t *journal) return err; } -/** +/** * void journal_ack_err() - Ack journal err. * @journal: journal to act on. * @@ -1612,7 +1611,7 @@ int journal_blocks_per_page(struct inode *inode) /* * Simple support for retrying memory allocations. Introduced to help to - * debug different VM deadlock avoidance strategies. + * debug different VM deadlock avoidance strategies. */ void * __jbd_kmalloc (const char *where, size_t size, gfp_t flags, int retry) { @@ -2047,13 +2046,7 @@ static int __init journal_init(void) { int ret; -/* Static check for data structure consistency. There's no code - * invoked --- we'll just get a linker failure if things aren't right. - */ - extern void journal_bad_superblock_size(void); - if (sizeof(struct journal_superblock_s) != 1024) - journal_bad_superblock_size(); - + BUILD_BUG_ON(sizeof(struct journal_superblock_s) != 1024); ret = journal_init_caches(); if (ret != 0) diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c index de5bafb4e853..11563fe2a52b 100644 --- a/fs/jbd/recovery.c +++ b/fs/jbd/recovery.c @@ -1,6 +1,6 @@ /* * linux/fs/recovery.c - * + * * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 * * Copyright 1999-2000 Red Hat Software --- All Rights Reserved @@ -10,7 +10,7 @@ * option, any later version, incorporated herein by reference. * * Journal recovery routines for the generic filesystem journaling code; - * part of the ext2fs journaling system. + * part of the ext2fs journaling system. */ #ifndef __KERNEL__ @@ -25,9 +25,9 @@ /* * Maintain information about the progress of the recovery job, so that - * the different passes can carry information between them. + * the different passes can carry information between them. */ -struct recovery_info +struct recovery_info { tid_t start_transaction; tid_t end_transaction; @@ -46,7 +46,7 @@ static int scan_revoke_records(journal_t *, struct buffer_head *, #ifdef __KERNEL__ /* Release readahead buffers after use */ -void journal_brelse_array(struct buffer_head *b[], int n) +static void journal_brelse_array(struct buffer_head *b[], int n) { while (--n >= 0) brelse (b[n]); @@ -116,7 +116,7 @@ static int do_readahead(journal_t *journal, unsigned int start) err = 0; failed: - if (nbufs) + if (nbufs) journal_brelse_array(bufs, nbufs); return err; } @@ -128,7 +128,7 @@ failed: * Read a block from the journal */ -static int jread(struct buffer_head **bhp, journal_t *journal, +static int jread(struct buffer_head **bhp, journal_t *journal, unsigned int offset) { int err; @@ -212,14 +212,14 @@ do { \ /** * journal_recover - recovers a on-disk journal * @journal: the journal to recover - * + * * The primary function for recovering the log contents when mounting a - * journaled device. + * journaled device. * * Recovery is done in three passes. In the first pass, we look for the * end of the log. In the second, we assemble the list of revoke * blocks. In the third and final pass, we replay any un-revoked blocks - * in the log. + * in the log. */ int journal_recover(journal_t *journal) { @@ -231,10 +231,10 @@ int journal_recover(journal_t *journal) memset(&info, 0, sizeof(info)); sb = journal->j_superblock; - /* + /* * The journal superblock's s_start field (the current log head) * is always zero if, and only if, the journal was cleanly - * unmounted. + * unmounted. */ if (!sb->s_start) { @@ -253,7 +253,7 @@ int journal_recover(journal_t *journal) jbd_debug(0, "JBD: recovery, exit status %d, " "recovered transactions %u to %u\n", err, info.start_transaction, info.end_transaction); - jbd_debug(0, "JBD: Replayed %d and revoked %d/%d blocks\n", + jbd_debug(0, "JBD: Replayed %d and revoked %d/%d blocks\n", info.nr_replays, info.nr_revoke_hits, info.nr_revokes); /* Restart the log at the next transaction ID, thus invalidating @@ -268,15 +268,15 @@ int journal_recover(journal_t *journal) /** * journal_skip_recovery - Start journal and wipe exiting records * @journal: journal to startup - * + * * Locate any valid recovery information from the journal and set up the * journal structures in memory to ignore it (presumably because the - * caller has evidence that it is out of date). + * caller has evidence that it is out of date). * This function does'nt appear to be exorted.. * * We perform one pass over the journal to allow us to tell the user how * much recovery information is being erased, and to let us initialise - * the journal transaction sequence numbers to the next unused ID. + * the journal transaction sequence numbers to the next unused ID. */ int journal_skip_recovery(journal_t *journal) { @@ -297,7 +297,7 @@ int journal_skip_recovery(journal_t *journal) #ifdef CONFIG_JBD_DEBUG int dropped = info.end_transaction - be32_to_cpu(sb->s_sequence); #endif - jbd_debug(0, + jbd_debug(0, "JBD: ignoring %d transaction%s from the journal.\n", dropped, (dropped == 1) ? "" : "s"); journal->j_transaction_sequence = ++info.end_transaction; @@ -314,7 +314,7 @@ static int do_one_pass(journal_t *journal, unsigned long next_log_block; int err, success = 0; journal_superblock_t * sb; - journal_header_t * tmp; + journal_header_t * tmp; struct buffer_head * bh; unsigned int sequence; int blocktype; @@ -324,10 +324,10 @@ static int do_one_pass(journal_t *journal, MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t)) / sizeof(journal_block_tag_t)); - /* + /* * First thing is to establish what we expect to find in the log * (in terms of transaction IDs), and where (in terms of log - * block offsets): query the superblock. + * block offsets): query the superblock. */ sb = journal->j_superblock; @@ -344,7 +344,7 @@ static int do_one_pass(journal_t *journal, * Now we walk through the log, transaction by transaction, * making sure that each transaction has a commit block in the * expected place. Each complete transaction gets replayed back - * into the main filesystem. + * into the main filesystem. */ while (1) { @@ -379,8 +379,8 @@ static int do_one_pass(journal_t *journal, next_log_block++; wrap(journal, next_log_block); - /* What kind of buffer is it? - * + /* What kind of buffer is it? + * * If it is a descriptor block, check that it has the * expected sequence number. Otherwise, we're all done * here. */ @@ -394,7 +394,7 @@ static int do_one_pass(journal_t *journal, blocktype = be32_to_cpu(tmp->h_blocktype); sequence = be32_to_cpu(tmp->h_sequence); - jbd_debug(3, "Found magic %d, sequence %d\n", + jbd_debug(3, "Found magic %d, sequence %d\n", blocktype, sequence); if (sequence != next_commit_ID) { @@ -438,7 +438,7 @@ static int do_one_pass(journal_t *journal, /* Recover what we can, but * report failure at the end. */ success = err; - printk (KERN_ERR + printk (KERN_ERR "JBD: IO error %d recovering " "block %ld in log\n", err, io_block); @@ -452,7 +452,7 @@ static int do_one_pass(journal_t *journal, * revoked, then we're all done * here. */ if (journal_test_revoke - (journal, blocknr, + (journal, blocknr, next_commit_ID)) { brelse(obh); ++info->nr_revoke_hits; @@ -465,7 +465,7 @@ static int do_one_pass(journal_t *journal, blocknr, journal->j_blocksize); if (nbh == NULL) { - printk(KERN_ERR + printk(KERN_ERR "JBD: Out of memory " "during recovery.\n"); err = -ENOMEM; @@ -537,7 +537,7 @@ static int do_one_pass(journal_t *journal, } done: - /* + /* * We broke out of the log scan loop: either we came to the * known end of the log or we found an unexpected block in the * log. If the latter happened, then we know that the "current" @@ -567,7 +567,7 @@ static int do_one_pass(journal_t *journal, /* Scan a revoke record, marking all blocks mentioned as revoked. */ -static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, +static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, tid_t sequence, struct recovery_info *info) { journal_revoke_header_t *header; diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c index a56144183462..c532429d8d9b 100644 --- a/fs/jbd/revoke.c +++ b/fs/jbd/revoke.c @@ -1,6 +1,6 @@ /* * linux/fs/revoke.c - * + * * Written by Stephen C. Tweedie <sct@redhat.com>, 2000 * * Copyright 2000 Red Hat corp --- All Rights Reserved @@ -15,10 +15,10 @@ * Revoke is the mechanism used to prevent old log records for deleted * metadata from being replayed on top of newer data using the same * blocks. The revoke mechanism is used in two separate places: - * + * * + Commit: during commit we write the entire list of the current * transaction's revoked blocks to the journal - * + * * + Recovery: during recovery we record the transaction ID of all * revoked blocks. If there are multiple revoke records in the log * for a single block, only the last one counts, and if there is a log @@ -29,7 +29,7 @@ * single transaction: * * Block is revoked and then journaled: - * The desired end result is the journaling of the new block, so we + * The desired end result is the journaling of the new block, so we * cancel the revoke before the transaction commits. * * Block is journaled and then revoked: @@ -41,7 +41,7 @@ * transaction must have happened after the block was journaled and so * the revoke must take precedence. * - * Block is revoked and then written as data: + * Block is revoked and then written as data: * The data write is allowed to succeed, but the revoke is _not_ * cancelled. We still need to prevent old log records from * overwriting the new data. We don't even need to clear the revoke @@ -54,7 +54,7 @@ * buffer has not been revoked, and cancel_revoke * need do nothing. * RevokeValid set, Revoked set: - * buffer has been revoked. + * buffer has been revoked. */ #ifndef __KERNEL__ @@ -77,7 +77,7 @@ static kmem_cache_t *revoke_table_cache; journal replay, this involves recording the transaction ID of the last transaction to revoke this block. */ -struct jbd_revoke_record_s +struct jbd_revoke_record_s { struct list_head hash; tid_t sequence; /* Used for recovery only */ @@ -90,8 +90,8 @@ struct jbd_revoke_table_s { /* It is conceivable that we might want a larger hash table * for recovery. Must be a power of two. */ - int hash_size; - int hash_shift; + int hash_size; + int hash_shift; struct list_head *hash_table; }; @@ -301,22 +301,22 @@ void journal_destroy_revoke(journal_t *journal) #ifdef __KERNEL__ -/* +/* * journal_revoke: revoke a given buffer_head from the journal. This * prevents the block from being replayed during recovery if we take a * crash after this current transaction commits. Any subsequent * metadata writes of the buffer in this transaction cancel the - * revoke. + * revoke. * * Note that this call may block --- it is up to the caller to make * sure that there are no further calls to journal_write_metadata * before the revoke is complete. In ext3, this implies calling the * revoke before clearing the block bitmap when we are deleting - * metadata. + * metadata. * * Revoke performs a journal_forget on any buffer_head passed in as a * parameter, but does _not_ forget the buffer_head if the bh was only - * found implicitly. + * found implicitly. * * bh_in may not be a journalled buffer - it may have come off * the hash tables without an attached journal_head. @@ -325,7 +325,7 @@ void journal_destroy_revoke(journal_t *journal) * by one. */ -int journal_revoke(handle_t *handle, unsigned long blocknr, +int journal_revoke(handle_t *handle, unsigned long blocknr, struct buffer_head *bh_in) { struct buffer_head *bh = NULL; @@ -487,7 +487,7 @@ void journal_switch_revoke_table(journal_t *journal) else journal->j_revoke = journal->j_revoke_table[0]; - for (i = 0; i < journal->j_revoke->hash_size; i++) + for (i = 0; i < journal->j_revoke->hash_size; i++) INIT_LIST_HEAD(&journal->j_revoke->hash_table[i]); } @@ -498,7 +498,7 @@ void journal_switch_revoke_table(journal_t *journal) * Called with the journal lock held. */ -void journal_write_revoke_records(journal_t *journal, +void journal_write_revoke_records(journal_t *journal, transaction_t *transaction) { struct journal_head *descriptor; @@ -507,7 +507,7 @@ void journal_write_revoke_records(journal_t *journal, struct list_head *hash_list; int i, offset, count; - descriptor = NULL; + descriptor = NULL; offset = 0; count = 0; @@ -519,10 +519,10 @@ void journal_write_revoke_records(journal_t *journal, hash_list = &revoke->hash_table[i]; while (!list_empty(hash_list)) { - record = (struct jbd_revoke_record_s *) + record = (struct jbd_revoke_record_s *) hash_list->next; write_one_revoke_record(journal, transaction, - &descriptor, &offset, + &descriptor, &offset, record); count++; list_del(&record->hash); @@ -534,14 +534,14 @@ void journal_write_revoke_records(journal_t *journal, jbd_debug(1, "Wrote %d revoke records\n", count); } -/* +/* * Write out one revoke record. We need to create a new descriptor - * block if the old one is full or if we have not already created one. + * block if the old one is full or if we have not already created one. */ -static void write_one_revoke_record(journal_t *journal, +static void write_one_revoke_record(journal_t *journal, transaction_t *transaction, - struct journal_head **descriptorp, + struct journal_head **descriptorp, int *offsetp, struct jbd_revoke_record_s *record) { @@ -584,21 +584,21 @@ static void write_one_revoke_record(journal_t *journal, *descriptorp = descriptor; } - * ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) = + * ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) = cpu_to_be32(record->blocknr); offset += 4; *offsetp = offset; } -/* +/* * Flush a revoke descriptor out to the journal. If we are aborting, * this is a noop; otherwise we are generating a buffer which needs to * be waited for during commit, so it has to go onto the appropriate * journal buffer list. */ -static void flush_descriptor(journal_t *journal, - struct journal_head *descriptor, +static void flush_descriptor(journal_t *journal, + struct journal_head *descriptor, int offset) { journal_revoke_header_t *header; @@ -618,7 +618,7 @@ static void flush_descriptor(journal_t *journal, } #endif -/* +/* * Revoke support for recovery. * * Recovery needs to be able to: @@ -629,7 +629,7 @@ static void flush_descriptor(journal_t *journal, * check whether a given block in a given transaction should be replayed * (ie. has not been revoked by a revoke record in that or a subsequent * transaction) - * + * * empty the revoke table after recovery. */ @@ -637,11 +637,11 @@ static void flush_descriptor(journal_t *journal, * First, setting revoke records. We create a new revoke record for * every block ever revoked in the log as we scan it for recovery, and * we update the existing records if we find multiple revokes for a - * single block. + * single block. */ -int journal_set_revoke(journal_t *journal, - unsigned long blocknr, +int journal_set_revoke(journal_t *journal, + unsigned long blocknr, tid_t sequence) { struct jbd_revoke_record_s *record; @@ -653,18 +653,18 @@ int journal_set_revoke(journal_t *journal, if (tid_gt(sequence, record->sequence)) record->sequence = sequence; return 0; - } + } return insert_revoke_hash(journal, blocknr, sequence); } -/* +/* * Test revoke records. For a given block referenced in the log, has * that block been revoked? A revoke record with a given transaction * sequence number revokes all blocks in that transaction and earlier * ones, but later transactions still need replayed. */ -int journal_test_revoke(journal_t *journal, +int journal_test_revoke(journal_t *journal, unsigned long blocknr, tid_t sequence) { diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index f5169a96260e..e1b3c8af4d17 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -1,6 +1,6 @@ /* * linux/fs/transaction.c - * + * * Written by Stephen C. Tweedie <sct@redhat.com>, 1998 * * Copyright 1998 Red Hat corp --- All Rights Reserved @@ -10,7 +10,7 @@ * option, any later version, incorporated herein by reference. * * Generic filesystem transaction handling code; part of the ext2fs - * journaling system. + * journaling system. * * This file manages transactions (compound commits managed by the * journaling code) and handles (individual atomic operations by the @@ -74,7 +74,7 @@ get_transaction(journal_t *journal, transaction_t *transaction) * start_this_handle: Given a handle, deal with any locking or stalling * needed to make sure that there is enough journal space for the handle * to begin. Attach the handle to a transaction and set up the - * transaction's buffer credits. + * transaction's buffer credits. */ static int start_this_handle(journal_t *journal, handle_t *handle) @@ -117,7 +117,7 @@ repeat_locked: if (is_journal_aborted(journal) || (journal->j_errno != 0 && !(journal->j_flags & JFS_ACK_ERR))) { spin_unlock(&journal->j_state_lock); - ret = -EROFS; + ret = -EROFS; goto out; } @@ -182,7 +182,7 @@ repeat_locked: goto repeat; } - /* + /* * The commit code assumes that it can get enough log space * without forcing a checkpoint. This is *critical* for * correctness: a checkpoint of a buffer which is also @@ -191,7 +191,7 @@ repeat_locked: * * We must therefore ensure the necessary space in the journal * *before* starting to dirty potentially checkpointed buffers - * in the new transaction. + * in the new transaction. * * The worst part is, any transaction currently committing can * reduce the free space arbitrarily. Be careful to account for @@ -246,13 +246,13 @@ static handle_t *new_handle(int nblocks) } /** - * handle_t *journal_start() - Obtain a new handle. + * handle_t *journal_start() - Obtain a new handle. * @journal: Journal to start transaction on. * @nblocks: number of block buffer we might modify * * We make sure that the transaction can guarantee at least nblocks of * modified buffers in the log. We block until the log can guarantee - * that much space. + * that much space. * * This function is visible to journal users (like ext3fs), so is not * called with the journal already locked. @@ -292,11 +292,11 @@ handle_t *journal_start(journal_t *journal, int nblocks) * int journal_extend() - extend buffer credits. * @handle: handle to 'extend' * @nblocks: nr blocks to try to extend by. - * + * * Some transactions, such as large extends and truncates, can be done * atomically all at once or in several stages. The operation requests * a credit for a number of buffer modications in advance, but can - * extend its credit if it needs more. + * extend its credit if it needs more. * * journal_extend tries to give the running handle more buffer credits. * It does not guarantee that allocation - this is a best-effort only. @@ -363,7 +363,7 @@ out: * int journal_restart() - restart a handle . * @handle: handle to restart * @nblocks: nr credits requested - * + * * Restart a handle for a multi-transaction filesystem * operation. * @@ -462,7 +462,7 @@ void journal_lock_updates(journal_t *journal) /** * void journal_unlock_updates (journal_t* journal) - release barrier * @journal: Journal to release the barrier on. - * + * * Release a transaction barrier obtained with journal_lock_updates(). * * Should be called without the journal lock held. @@ -547,8 +547,8 @@ repeat: jbd_lock_bh_state(bh); /* We now hold the buffer lock so it is safe to query the buffer - * state. Is the buffer dirty? - * + * state. Is the buffer dirty? + * * If so, there are two possibilities. The buffer may be * non-journaled, and undergoing a quite legitimate writeback. * Otherwise, it is journaled, and we don't expect dirty buffers @@ -566,7 +566,7 @@ repeat: */ if (jh->b_transaction) { J_ASSERT_JH(jh, - jh->b_transaction == transaction || + jh->b_transaction == transaction || jh->b_transaction == journal->j_committing_transaction); if (jh->b_next_transaction) @@ -580,7 +580,7 @@ repeat: */ JBUFFER_TRACE(jh, "Unexpected dirty buffer"); jbd_unexpected_dirty_buffer(jh); - } + } unlock_buffer(bh); @@ -653,7 +653,7 @@ repeat: * buffer had better remain locked during the kmalloc, * but that should be true --- we hold the journal lock * still and the buffer is already on the BUF_JOURNAL - * list so won't be flushed. + * list so won't be flushed. * * Subtle point, though: if this is a get_undo_access, * then we will be relying on the frozen_data to contain @@ -765,8 +765,8 @@ int journal_get_write_access(handle_t *handle, struct buffer_head *bh) * manually rather than reading off disk), then we need to keep the * buffer_head locked until it has been completely filled with new * data. In this case, we should be able to make the assertion that - * the bh is not already part of an existing transaction. - * + * the bh is not already part of an existing transaction. + * * The buffer should already be locked by the caller by this point. * There is no lock ranking violation: it was a newly created, * unlocked buffer beforehand. */ @@ -778,7 +778,7 @@ int journal_get_write_access(handle_t *handle, struct buffer_head *bh) * * Call this if you create a new bh. */ -int journal_get_create_access(handle_t *handle, struct buffer_head *bh) +int journal_get_create_access(handle_t *handle, struct buffer_head *bh) { transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; @@ -847,13 +847,13 @@ out: * do not reuse freed space until the deallocation has been committed, * since if we overwrote that space we would make the delete * un-rewindable in case of a crash. - * + * * To deal with that, journal_get_undo_access requests write access to a * buffer for parts of non-rewindable operations such as delete * operations on the bitmaps. The journaling code must keep a copy of * the buffer's contents prior to the undo_access call until such time * as we know that the buffer has definitely been committed to disk. - * + * * We never need to know which transaction the committed data is part * of, buffers touched here are guaranteed to be dirtied later and so * will be committed to a new transaction in due course, at which point @@ -911,13 +911,13 @@ out: return err; } -/** +/** * int journal_dirty_data() - mark a buffer as containing dirty data which * needs to be flushed before we can commit the - * current transaction. + * current transaction. * @handle: transaction * @bh: bufferhead to mark - * + * * The buffer is placed on the transaction's data list and is marked as * belonging to the transaction. * @@ -946,15 +946,15 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh) /* * What if the buffer is already part of a running transaction? - * + * * There are two cases: * 1) It is part of the current running transaction. Refile it, * just in case we have allocated it as metadata, deallocated - * it, then reallocated it as data. + * it, then reallocated it as data. * 2) It is part of the previous, still-committing transaction. * If all we want to do is to guarantee that the buffer will be * written to disk before this new transaction commits, then - * being sure that the *previous* transaction has this same + * being sure that the *previous* transaction has this same * property is sufficient for us! Just leave it on its old * transaction. * @@ -1076,18 +1076,18 @@ no_journal: return 0; } -/** +/** * int journal_dirty_metadata() - mark a buffer as containing dirty metadata * @handle: transaction to add buffer to. - * @bh: buffer to mark - * + * @bh: buffer to mark + * * mark dirty metadata which needs to be journaled as part of the current * transaction. * * The buffer is placed on the transaction's metadata list and is marked - * as belonging to the transaction. + * as belonging to the transaction. * - * Returns error number or 0 on success. + * Returns error number or 0 on success. * * Special care needs to be taken if the buffer already belongs to the * current committing transaction (in which case we should have frozen @@ -1135,11 +1135,11 @@ int journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) set_buffer_jbddirty(bh); - /* + /* * Metadata already on the current transaction list doesn't * need to be filed. Metadata on another transaction's list must * be committing, and will be refiled once the commit completes: - * leave it alone for now. + * leave it alone for now. */ if (jh->b_transaction != transaction) { JBUFFER_TRACE(jh, "already on other transaction"); @@ -1165,7 +1165,7 @@ out: return 0; } -/* +/* * journal_release_buffer: undo a get_write_access without any buffer * updates, if the update decided in the end that it didn't need access. * @@ -1176,20 +1176,20 @@ journal_release_buffer(handle_t *handle, struct buffer_head *bh) BUFFER_TRACE(bh, "entry"); } -/** +/** * void journal_forget() - bforget() for potentially-journaled buffers. * @handle: transaction handle * @bh: bh to 'forget' * * We can only do the bforget if there are no commits pending against the * buffer. If the buffer is dirty in the current running transaction we - * can safely unlink it. + * can safely unlink it. * * bh may not be a journalled buffer at all - it may be a non-JBD * buffer which came off the hashtable. Check for this. * * Decrements bh->b_count by one. - * + * * Allow this call even if the handle has aborted --- it may be part of * the caller's cleanup after an abort. */ @@ -1237,7 +1237,7 @@ int journal_forget (handle_t *handle, struct buffer_head *bh) drop_reserve = 1; - /* + /* * We are no longer going to journal this buffer. * However, the commit of this transaction is still * important to the buffer: the delete that we are now @@ -1246,7 +1246,7 @@ int journal_forget (handle_t *handle, struct buffer_head *bh) * * So, if we have a checkpoint on the buffer, we should * now refile the buffer on our BJ_Forget list so that - * we know to remove the checkpoint after we commit. + * we know to remove the checkpoint after we commit. */ if (jh->b_cp_transaction) { @@ -1264,7 +1264,7 @@ int journal_forget (handle_t *handle, struct buffer_head *bh) } } } else if (jh->b_transaction) { - J_ASSERT_JH(jh, (jh->b_transaction == + J_ASSERT_JH(jh, (jh->b_transaction == journal->j_committing_transaction)); /* However, if the buffer is still owned by a prior * (committing) transaction, we can't drop it yet... */ @@ -1294,7 +1294,7 @@ drop: /** * int journal_stop() - complete a transaction * @handle: tranaction to complete. - * + * * All done for a particular handle. * * There is not much action needed here. We just return any remaining @@ -1303,7 +1303,7 @@ drop: * filesystem is marked for synchronous update. * * journal_stop itself will not usually return an error, but it may - * do so in unusual circumstances. In particular, expect it to + * do so in unusual circumstances. In particular, expect it to * return -EIO if a journal_abort has been executed since the * transaction began. */ @@ -1373,7 +1373,7 @@ int journal_stop(handle_t *handle) if (handle->h_sync || transaction->t_outstanding_credits > journal->j_max_transaction_buffers || - time_after_eq(jiffies, transaction->t_expires)) { + time_after_eq(jiffies, transaction->t_expires)) { /* Do this even for aborted journals: an abort still * completes the commit thread, it just doesn't write * anything to disk. */ @@ -1388,7 +1388,7 @@ int journal_stop(handle_t *handle) /* * Special case: JFS_SYNC synchronous updates require us - * to wait for the commit to complete. + * to wait for the commit to complete. */ if (handle->h_sync && !(current->flags & PF_MEMALLOC)) err = log_wait_commit(journal, tid); @@ -1439,7 +1439,7 @@ int journal_force_commit(journal_t *journal) * jbd_lock_bh_state(jh2bh(jh)) is held. */ -static inline void +static inline void __blist_add_buffer(struct journal_head **list, struct journal_head *jh) { if (!*list) { @@ -1454,7 +1454,7 @@ __blist_add_buffer(struct journal_head **list, struct journal_head *jh) } } -/* +/* * Remove a buffer from a transaction list, given the transaction's list * head pointer. * @@ -1475,7 +1475,7 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh) jh->b_tnext->b_tprev = jh->b_tprev; } -/* +/* * Remove a buffer from the appropriate transaction list. * * Note that this function can *change* the value of @@ -1595,17 +1595,17 @@ out: } -/** +/** * int journal_try_to_free_buffers() - try to free page buffers. * @journal: journal for operation * @page: to try and free * @unused_gfp_mask: unused * - * + * * For all the buffers on this page, * if they are fully written out ordered data, move them onto BUF_CLEAN * so try_to_free_buffers() can reap them. - * + * * This function returns non-zero if we wish try_to_free_buffers() * to be called. We do this if the page is releasable by try_to_free_buffers(). * We also do it if the page has locked or dirty buffers and the caller wants @@ -1629,7 +1629,7 @@ out: * cannot happen because we never reallocate freed data as metadata * while the data is part of a transaction. Yes? */ -int journal_try_to_free_buffers(journal_t *journal, +int journal_try_to_free_buffers(journal_t *journal, struct page *page, gfp_t unused_gfp_mask) { struct buffer_head *head; @@ -1697,7 +1697,7 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) } /* - * journal_invalidatepage + * journal_invalidatepage * * This code is tricky. It has a number of cases to deal with. * @@ -1705,15 +1705,15 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) * * i_size must be updated on disk before we start calling invalidatepage on the * data. - * + * * This is done in ext3 by defining an ext3_setattr method which * updates i_size before truncate gets going. By maintaining this * invariant, we can be sure that it is safe to throw away any buffers * attached to the current transaction: once the transaction commits, * we know that the data will not be needed. - * + * * Note however that we can *not* throw away data belonging to the - * previous, committing transaction! + * previous, committing transaction! * * Any disk blocks which *are* part of the previous, committing * transaction (and which therefore cannot be discarded immediately) are @@ -1732,7 +1732,7 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) * don't make guarantees about the order in which data hits disk --- in * particular we don't guarantee that new dirty data is flushed before * transaction commit --- so it is always safe just to discard data - * immediately in that mode. --sct + * immediately in that mode. --sct */ /* @@ -1876,9 +1876,9 @@ zap_buffer_unlocked: return may_free; } -/** +/** * void journal_invalidatepage() - * @journal: journal to use for flush... + * @journal: journal to use for flush... * @page: page to flush * @offset: length of page to invalidate. * @@ -1886,7 +1886,7 @@ zap_buffer_unlocked: * */ void journal_invalidatepage(journal_t *journal, - struct page *page, + struct page *page, unsigned long offset) { struct buffer_head *head, *bh, *next; @@ -1908,7 +1908,7 @@ void journal_invalidatepage(journal_t *journal, next = bh->b_this_page; if (offset <= curr_off) { - /* This block is wholly outside the truncation point */ + /* This block is wholly outside the truncation point */ lock_buffer(bh); may_free &= journal_unmap_buffer(journal, bh); unlock_buffer(bh); @@ -1924,8 +1924,8 @@ void journal_invalidatepage(journal_t *journal, } } -/* - * File a buffer on the given transaction list. +/* + * File a buffer on the given transaction list. */ void __journal_file_buffer(struct journal_head *jh, transaction_t *transaction, int jlist) @@ -1948,7 +1948,7 @@ void __journal_file_buffer(struct journal_head *jh, * with __jbd_unexpected_dirty_buffer()'s handling of dirty * state. */ - if (jlist == BJ_Metadata || jlist == BJ_Reserved || + if (jlist == BJ_Metadata || jlist == BJ_Reserved || jlist == BJ_Shadow || jlist == BJ_Forget) { if (test_clear_buffer_dirty(bh) || test_clear_buffer_jbddirty(bh)) @@ -2008,7 +2008,7 @@ void journal_file_buffer(struct journal_head *jh, jbd_unlock_bh_state(jh2bh(jh)); } -/* +/* * Remove a buffer from its current buffer list in preparation for * dropping it from its current transaction entirely. If the buffer has * already started to be used by a subsequent transaction, refile the @@ -2060,7 +2060,7 @@ void __journal_refile_buffer(struct journal_head *jh) * to the caller to remove the journal_head if necessary. For the * unlocked journal_refile_buffer call, the caller isn't going to be * doing anything else to the buffer so we need to do the cleanup - * ourselves to avoid a jh leak. + * ourselves to avoid a jh leak. * * *** The journal_head may be freed by this call! *** */ diff --git a/fs/jffs/inode-v23.c b/fs/jffs/inode-v23.c index 93068697a9bf..f5cf9c93e243 100644 --- a/fs/jffs/inode-v23.c +++ b/fs/jffs/inode-v23.c @@ -364,12 +364,11 @@ jffs_new_inode(const struct inode * dir, struct jffs_raw_inode *raw_inode, inode->i_ctime.tv_nsec = 0; inode->i_mtime.tv_nsec = 0; inode->i_atime.tv_nsec = 0; - inode->i_blksize = PAGE_SIZE; inode->i_blocks = (inode->i_size + 511) >> 9; f = jffs_find_file(c, raw_inode->ino); - inode->u.generic_ip = (void *)f; + inode->i_private = (void *)f; insert_inode_hash(inode); return inode; @@ -442,7 +441,7 @@ jffs_rename(struct inode *old_dir, struct dentry *old_dentry, }); result = -ENOTDIR; - if (!(old_dir_f = (struct jffs_file *)old_dir->u.generic_ip)) { + if (!(old_dir_f = old_dir->i_private)) { D(printk("jffs_rename(): Old dir invalid.\n")); goto jffs_rename_end; } @@ -456,7 +455,7 @@ jffs_rename(struct inode *old_dir, struct dentry *old_dentry, /* Find the new directory. */ result = -ENOTDIR; - if (!(new_dir_f = (struct jffs_file *)new_dir->u.generic_ip)) { + if (!(new_dir_f = new_dir->i_private)) { D(printk("jffs_rename(): New dir invalid.\n")); goto jffs_rename_end; } @@ -593,7 +592,7 @@ jffs_readdir(struct file *filp, void *dirent, filldir_t filldir) } else { ddino = ((struct jffs_file *) - inode->u.generic_ip)->pino; + inode->i_private)->pino; } D3(printk("jffs_readdir(): \"..\" %u\n", ddino)); if (filldir(dirent, "..", 2, filp->f_pos, ddino, DT_DIR) < 0) { @@ -604,7 +603,7 @@ jffs_readdir(struct file *filp, void *dirent, filldir_t filldir) } filp->f_pos++; } - f = ((struct jffs_file *)inode->u.generic_ip)->children; + f = ((struct jffs_file *)inode->i_private)->children; j = 2; while(f && (f->deleted || j++ < filp->f_pos )) { @@ -652,7 +651,7 @@ jffs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) lock_kernel(); D3({ - char *s = (char *)kmalloc(len + 1, GFP_KERNEL); + char *s = kmalloc(len + 1, GFP_KERNEL); memcpy(s, name, len); s[len] = '\0'; printk("jffs_lookup(): dir: 0x%p, name: \"%s\"\n", dir, s); @@ -668,7 +667,7 @@ jffs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) } r = -EACCES; - if (!(d = (struct jffs_file *)dir->u.generic_ip)) { + if (!(d = (struct jffs_file *)dir->i_private)) { D(printk("jffs_lookup(): No such inode! (%lu)\n", dir->i_ino)); goto jffs_lookup_end; @@ -739,7 +738,7 @@ jffs_do_readpage_nolock(struct file *file, struct page *page) unsigned long read_len; int result; struct inode *inode = (struct inode*)page->mapping->host; - struct jffs_file *f = (struct jffs_file *)inode->u.generic_ip; + struct jffs_file *f = (struct jffs_file *)inode->i_private; struct jffs_control *c = (struct jffs_control *)inode->i_sb->s_fs_info; int r; loff_t offset; @@ -828,7 +827,7 @@ jffs_mkdir(struct inode *dir, struct dentry *dentry, int mode) }); lock_kernel(); - dir_f = (struct jffs_file *)dir->u.generic_ip; + dir_f = dir->i_private; ASSERT(if (!dir_f) { printk(KERN_ERR "jffs_mkdir(): No reference to a " @@ -972,7 +971,7 @@ jffs_remove(struct inode *dir, struct dentry *dentry, int type) kfree(_name); }); - dir_f = (struct jffs_file *) dir->u.generic_ip; + dir_f = dir->i_private; c = dir_f->c; result = -ENOENT; @@ -1082,7 +1081,7 @@ jffs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) if (!old_valid_dev(rdev)) return -EINVAL; lock_kernel(); - dir_f = (struct jffs_file *)dir->u.generic_ip; + dir_f = dir->i_private; c = dir_f->c; D3(printk (KERN_NOTICE "mknod(): down biglock\n")); @@ -1173,8 +1172,8 @@ jffs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) lock_kernel(); D1({ int len = dentry->d_name.len; - char *_name = (char *)kmalloc(len + 1, GFP_KERNEL); - char *_symname = (char *)kmalloc(symname_len + 1, GFP_KERNEL); + char *_name = kmalloc(len + 1, GFP_KERNEL); + char *_symname = kmalloc(symname_len + 1, GFP_KERNEL); memcpy(_name, dentry->d_name.name, len); _name[len] = '\0'; memcpy(_symname, symname, symname_len); @@ -1186,7 +1185,7 @@ jffs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) kfree(_symname); }); - dir_f = (struct jffs_file *)dir->u.generic_ip; + dir_f = dir->i_private; ASSERT(if (!dir_f) { printk(KERN_ERR "jffs_symlink(): No reference to a " "jffs_file struct in inode.\n"); @@ -1282,14 +1281,14 @@ jffs_create(struct inode *dir, struct dentry *dentry, int mode, lock_kernel(); D1({ int len = dentry->d_name.len; - char *s = (char *)kmalloc(len + 1, GFP_KERNEL); + char *s = kmalloc(len + 1, GFP_KERNEL); memcpy(s, dentry->d_name.name, len); s[len] = '\0'; printk("jffs_create(): dir: 0x%p, name: \"%s\"\n", dir, s); kfree(s); }); - dir_f = (struct jffs_file *)dir->u.generic_ip; + dir_f = dir->i_private; ASSERT(if (!dir_f) { printk(KERN_ERR "jffs_create(): No reference to a " "jffs_file struct in inode.\n"); @@ -1403,9 +1402,9 @@ jffs_file_write(struct file *filp, const char *buf, size_t count, goto out_isem; } - if (!(f = (struct jffs_file *)inode->u.generic_ip)) { - D(printk("jffs_file_write(): inode->u.generic_ip = 0x%p\n", - inode->u.generic_ip)); + if (!(f = inode->i_private)) { + D(printk("jffs_file_write(): inode->i_private = 0x%p\n", + inode->i_private)); goto out_isem; } @@ -1693,7 +1692,7 @@ jffs_read_inode(struct inode *inode) mutex_unlock(&c->fmc->biglock); return; } - inode->u.generic_ip = (void *)f; + inode->i_private = f; inode->i_mode = f->mode; inode->i_nlink = f->nlink; inode->i_uid = f->uid; @@ -1706,7 +1705,6 @@ jffs_read_inode(struct inode *inode) inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = 0; - inode->i_blksize = PAGE_SIZE; inode->i_blocks = (inode->i_size + 511) >> 9; if (S_ISREG(inode->i_mode)) { inode->i_op = &jffs_file_inode_operations; @@ -1748,7 +1746,7 @@ jffs_delete_inode(struct inode *inode) lock_kernel(); inode->i_size = 0; inode->i_blocks = 0; - inode->u.generic_ip = NULL; + inode->i_private = NULL; clear_inode(inode); if (inode->i_nlink == 0) { c = (struct jffs_control *) inode->i_sb->s_fs_info; diff --git a/fs/jffs/intrep.c b/fs/jffs/intrep.c index 9000f1effedf..4a543e114970 100644 --- a/fs/jffs/intrep.c +++ b/fs/jffs/intrep.c @@ -488,13 +488,11 @@ jffs_create_file(struct jffs_control *c, { struct jffs_file *f; - if (!(f = (struct jffs_file *)kmalloc(sizeof(struct jffs_file), - GFP_KERNEL))) { + if (!(f = kzalloc(sizeof(*f), GFP_KERNEL))) { D(printk("jffs_create_file(): Failed!\n")); return NULL; } no_jffs_file++; - memset(f, 0, sizeof(struct jffs_file)); f->ino = raw_inode->ino; f->pino = raw_inode->pino; f->nlink = raw_inode->nlink; @@ -516,7 +514,7 @@ jffs_create_control(struct super_block *sb) D2(printk("jffs_create_control()\n")); - if (!(c = (struct jffs_control *)kmalloc(s, GFP_KERNEL))) { + if (!(c = kmalloc(s, GFP_KERNEL))) { goto fail_control; } DJM(no_jffs_control++); @@ -524,7 +522,7 @@ jffs_create_control(struct super_block *sb) c->gc_task = NULL; c->hash_len = JFFS_HASH_SIZE; s = sizeof(struct list_head) * c->hash_len; - if (!(c->hash = (struct list_head *)kmalloc(s, GFP_KERNEL))) { + if (!(c->hash = kmalloc(s, GFP_KERNEL))) { goto fail_hash; } DJM(no_hash++); @@ -593,8 +591,7 @@ jffs_add_virtual_root(struct jffs_control *c) D2(printk("jffs_add_virtual_root(): " "Creating a virtual root directory.\n")); - if (!(root = (struct jffs_file *)kmalloc(sizeof(struct jffs_file), - GFP_KERNEL))) { + if (!(root = kmalloc(sizeof(struct jffs_file), GFP_KERNEL))) { return -ENOMEM; } no_jffs_file++; diff --git a/fs/jffs/jffs_fm.c b/fs/jffs/jffs_fm.c index 7d8ca1aeace2..29b68d939bd9 100644 --- a/fs/jffs/jffs_fm.c +++ b/fs/jffs/jffs_fm.c @@ -94,8 +94,7 @@ jffs_build_begin(struct jffs_control *c, int unit) struct mtd_info *mtd; D3(printk("jffs_build_begin()\n")); - fmc = (struct jffs_fmcontrol *)kmalloc(sizeof(struct jffs_fmcontrol), - GFP_KERNEL); + fmc = kmalloc(sizeof(*fmc), GFP_KERNEL); if (!fmc) { D(printk("jffs_build_begin(): Allocation of " "struct jffs_fmcontrol failed!\n")); @@ -486,8 +485,7 @@ jffs_add_node(struct jffs_node *node) D3(printk("jffs_add_node(): ino = %u\n", node->ino)); - ref = (struct jffs_node_ref *)kmalloc(sizeof(struct jffs_node_ref), - GFP_KERNEL); + ref = kmalloc(sizeof(*ref), GFP_KERNEL); if (!ref) return -ENOMEM; diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 4780f82825d6..72d9909d95ff 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -263,7 +263,6 @@ void jffs2_read_inode (struct inode *inode) inode->i_nlink = f->inocache->nlink; - inode->i_blksize = PAGE_SIZE; inode->i_blocks = (inode->i_size + 511) >> 9; switch (inode->i_mode & S_IFMT) { @@ -449,7 +448,6 @@ struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_i inode->i_atime = inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; ri->atime = ri->mtime = ri->ctime = cpu_to_je32(I_SEC(inode->i_mtime)); - inode->i_blksize = PAGE_SIZE; inode->i_blocks = 0; inode->i_size = 0; diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 68e3953419b4..6de374513c01 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -119,10 +119,9 @@ static int jffs2_get_sb_mtd(struct file_system_type *fs_type, struct jffs2_sb_info *c; int ret; - c = kmalloc(sizeof(*c), GFP_KERNEL); + c = kzalloc(sizeof(*c), GFP_KERNEL); if (!c) return -ENOMEM; - memset(c, 0, sizeof(*c)); c->mtd = mtd; sb = sget(fs_type, jffs2_sb_compare, jffs2_sb_set, c); diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c index 4d52593a5fc6..4c74f0944f7e 100644 --- a/fs/jfs/jfs_extent.c +++ b/fs/jfs/jfs_extent.c @@ -468,7 +468,7 @@ int extRecord(struct inode *ip, xad_t * xp) int extFill(struct inode *ip, xad_t * xp) { int rc, nbperpage = JFS_SBI(ip->i_sb)->nbperpage; - s64 blkno = offsetXAD(xp) >> ip->i_blksize; + s64 blkno = offsetXAD(xp) >> ip->i_blkbits; // assert(ISSPARSE(ip)); diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index ccbe60aff83d..369d7f39c040 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -3115,7 +3115,6 @@ static int copy_from_dinode(struct dinode * dip, struct inode *ip) ip->i_mtime.tv_nsec = le32_to_cpu(dip->di_mtime.tv_nsec); ip->i_ctime.tv_sec = le32_to_cpu(dip->di_ctime.tv_sec); ip->i_ctime.tv_nsec = le32_to_cpu(dip->di_ctime.tv_nsec); - ip->i_blksize = ip->i_sb->s_blocksize; ip->i_blocks = LBLK2PBLK(ip->i_sb, le64_to_cpu(dip->di_nblocks)); ip->i_generation = le32_to_cpu(dip->di_gen); diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c index 495df402916d..bffaca9ae3a2 100644 --- a/fs/jfs/jfs_inode.c +++ b/fs/jfs/jfs_inode.c @@ -115,7 +115,6 @@ struct inode *ialloc(struct inode *parent, umode_t mode) } jfs_inode->mode2 |= mode; - inode->i_blksize = sb->s_blocksize; inode->i_blocks = 0; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; jfs_inode->otime = inode->i_ctime.tv_sec; diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index e1e0a6e6ebdf..f5afc129d6b1 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -257,7 +257,7 @@ static sector_t metapage_get_blocks(struct inode *inode, sector_t lblock, int rc = 0; int xflag; s64 xaddr; - sector_t file_blocks = (inode->i_size + inode->i_blksize - 1) >> + sector_t file_blocks = (inode->i_size + inode->i_sb->s_blocksize - 1) >> inode->i_blkbits; if (lblock >= file_blocks) diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index efbb586bed4b..3856efc399c1 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -282,7 +282,7 @@ int txInit(void) TxLockVHWM = (nTxLock * 8) / 10; size = sizeof(struct tblock) * nTxBlock; - TxBlock = (struct tblock *) vmalloc(size); + TxBlock = vmalloc(size); if (TxBlock == NULL) return -ENOMEM; @@ -307,7 +307,7 @@ int txInit(void) * tlock id = 0 is reserved. */ size = sizeof(struct tlock) * nTxLock; - TxLock = (struct tlock *) vmalloc(size); + TxLock = vmalloc(size); if (TxLock == NULL) { vfree(TxBlock); return -ENOMEM; diff --git a/fs/libfs.c b/fs/libfs.c index ac02ea602c3d..3793aaa14577 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -317,17 +317,9 @@ int simple_rename(struct inode *old_dir, struct dentry *old_dentry, int simple_readpage(struct file *file, struct page *page) { - void *kaddr; - - if (PageUptodate(page)) - goto out; - - kaddr = kmap_atomic(page, KM_USER0); - memset(kaddr, 0, PAGE_CACHE_SIZE); - kunmap_atomic(kaddr, KM_USER0); + clear_highpage(page); flush_dcache_page(page); SetPageUptodate(page); -out: unlock_page(page); return 0; } @@ -383,7 +375,6 @@ int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files return -ENOMEM; inode->i_mode = S_IFDIR | 0755; inode->i_uid = inode->i_gid = 0; - inode->i_blksize = PAGE_CACHE_SIZE; inode->i_blocks = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_op = &simple_dir_inode_operations; @@ -405,7 +396,6 @@ int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files goto out; inode->i_mode = S_IFREG | files->mode; inode->i_uid = inode->i_gid = 0; - inode->i_blksize = PAGE_CACHE_SIZE; inode->i_blocks = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_fop = files->ops; @@ -547,7 +537,7 @@ int simple_attr_open(struct inode *inode, struct file *file, attr->get = get; attr->set = set; - attr->data = inode->u.generic_ip; + attr->data = inode->i_private; attr->fmt = fmt; mutex_init(&attr->mutex); diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 52774feab93f..f95cc3f3c42d 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -160,7 +160,7 @@ static void nlmclnt_prepare_reclaim(struct nlm_host *host) */ list_splice_init(&host->h_granted, &host->h_reclaim); - dprintk("NLM: reclaiming locks for host %s", host->h_name); + dprintk("NLM: reclaiming locks for host %s\n", host->h_name); } static void nlmclnt_finish_reclaim(struct nlm_host *host) diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 89ba0df14c22..271e2165fff6 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -100,7 +100,7 @@ static struct nlm_lockowner *nlm_find_lockowner(struct nlm_host *host, fl_owner_ res = __nlm_find_lockowner(host, owner); if (res == NULL) { spin_unlock(&host->h_lock); - new = (struct nlm_lockowner *)kmalloc(sizeof(*new), GFP_KERNEL); + new = kmalloc(sizeof(*new), GFP_KERNEL); spin_lock(&host->h_lock); res = __nlm_find_lockowner(host, owner); if (res == NULL && new != NULL) { @@ -151,11 +151,13 @@ static void nlmclnt_release_lockargs(struct nlm_rqst *req) int nlmclnt_proc(struct inode *inode, int cmd, struct file_lock *fl) { + struct rpc_clnt *client = NFS_CLIENT(inode); + struct sockaddr_in addr; struct nlm_host *host; struct nlm_rqst *call; sigset_t oldset; unsigned long flags; - int status, proto, vers; + int status, vers; vers = (NFS_PROTO(inode)->version == 3) ? 4 : 1; if (NFS_PROTO(inode)->version > 3) { @@ -163,10 +165,8 @@ nlmclnt_proc(struct inode *inode, int cmd, struct file_lock *fl) return -ENOLCK; } - /* Retrieve transport protocol from NFS client */ - proto = NFS_CLIENT(inode)->cl_xprt->prot; - - host = nlmclnt_lookup_host(NFS_ADDR(inode), proto, vers); + rpc_peeraddr(client, (struct sockaddr *) &addr, sizeof(addr)); + host = nlmclnt_lookup_host(&addr, client->cl_xprt->prot, vers); if (host == NULL) return -ENOLCK; diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 38b0e8a1aec0..a0d0b58ce7a4 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -26,7 +26,6 @@ #define NLM_HOST_REBIND (60 * HZ) #define NLM_HOST_EXPIRE ((nrhosts > NLM_HOST_MAX)? 300 * HZ : 120 * HZ) #define NLM_HOST_COLLECT ((nrhosts > NLM_HOST_MAX)? 120 * HZ : 60 * HZ) -#define NLM_HOST_ADDR(sv) (&(sv)->s_nlmclnt->cl_xprt->addr) static struct nlm_host * nlm_hosts[NLM_HOST_NRHASH]; static unsigned long next_gc; @@ -100,9 +99,9 @@ nlm_lookup_host(int server, struct sockaddr_in *sin, /* Ooops, no host found, create it */ dprintk("lockd: creating host entry\n"); - if (!(host = (struct nlm_host *) kmalloc(sizeof(*host), GFP_KERNEL))) + host = kzalloc(sizeof(*host), GFP_KERNEL); + if (!host) goto nohost; - memset(host, 0, sizeof(*host)); addr = sin->sin_addr.s_addr; sprintf(host->h_name, "%u.%u.%u.%u", NIPQUAD(addr)); @@ -167,7 +166,6 @@ struct rpc_clnt * nlm_bind_host(struct nlm_host *host) { struct rpc_clnt *clnt; - struct rpc_xprt *xprt; dprintk("lockd: nlm_bind_host(%08x)\n", (unsigned)ntohl(host->h_addr.sin_addr.s_addr)); @@ -179,7 +177,6 @@ nlm_bind_host(struct nlm_host *host) * RPC rebind is required */ if ((clnt = host->h_rpcclnt) != NULL) { - xprt = clnt->cl_xprt; if (time_after_eq(jiffies, host->h_nextrebind)) { rpc_force_rebind(clnt); host->h_nextrebind = jiffies + NLM_HOST_REBIND; @@ -187,31 +184,37 @@ nlm_bind_host(struct nlm_host *host) host->h_nextrebind - jiffies); } } else { - xprt = xprt_create_proto(host->h_proto, &host->h_addr, NULL); - if (IS_ERR(xprt)) - goto forgetit; - - xprt_set_timeout(&xprt->timeout, 5, nlmsvc_timeout); - xprt->resvport = 1; /* NLM requires a reserved port */ - - /* Existing NLM servers accept AUTH_UNIX only */ - clnt = rpc_new_client(xprt, host->h_name, &nlm_program, - host->h_version, RPC_AUTH_UNIX); - if (IS_ERR(clnt)) - goto forgetit; - clnt->cl_autobind = 1; /* turn on pmap queries */ - clnt->cl_softrtry = 1; /* All queries are soft */ - - host->h_rpcclnt = clnt; + unsigned long increment = nlmsvc_timeout * HZ; + struct rpc_timeout timeparms = { + .to_initval = increment, + .to_increment = increment, + .to_maxval = increment * 6UL, + .to_retries = 5U, + }; + struct rpc_create_args args = { + .protocol = host->h_proto, + .address = (struct sockaddr *)&host->h_addr, + .addrsize = sizeof(host->h_addr), + .timeout = &timeparms, + .servername = host->h_name, + .program = &nlm_program, + .version = host->h_version, + .authflavor = RPC_AUTH_UNIX, + .flags = (RPC_CLNT_CREATE_HARDRTRY | + RPC_CLNT_CREATE_AUTOBIND), + }; + + clnt = rpc_create(&args); + if (!IS_ERR(clnt)) + host->h_rpcclnt = clnt; + else { + printk("lockd: couldn't create RPC handle for %s\n", host->h_name); + clnt = NULL; + } } mutex_unlock(&host->h_mutex); return clnt; - -forgetit: - printk("lockd: couldn't create RPC handle for %s\n", host->h_name); - mutex_unlock(&host->h_mutex); - return NULL; } /* diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 3fc683f46b3e..5954dcb497e4 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -109,30 +109,23 @@ nsm_unmonitor(struct nlm_host *host) static struct rpc_clnt * nsm_create(void) { - struct rpc_xprt *xprt; - struct rpc_clnt *clnt; - struct sockaddr_in sin; - - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK); - sin.sin_port = 0; - - xprt = xprt_create_proto(IPPROTO_UDP, &sin, NULL); - if (IS_ERR(xprt)) - return (struct rpc_clnt *)xprt; - xprt->resvport = 1; /* NSM requires a reserved port */ - - clnt = rpc_create_client(xprt, "localhost", - &nsm_program, SM_VERSION, - RPC_AUTH_NULL); - if (IS_ERR(clnt)) - goto out_err; - clnt->cl_softrtry = 1; - clnt->cl_oneshot = 1; - return clnt; - -out_err: - return clnt; + struct sockaddr_in sin = { + .sin_family = AF_INET, + .sin_addr.s_addr = htonl(INADDR_LOOPBACK), + .sin_port = 0, + }; + struct rpc_create_args args = { + .protocol = IPPROTO_UDP, + .address = (struct sockaddr *)&sin, + .addrsize = sizeof(sin), + .servername = "localhost", + .program = &nsm_program, + .version = SM_VERSION, + .authflavor = RPC_AUTH_NULL, + .flags = (RPC_CLNT_CREATE_ONESHOT), + }; + + return rpc_create(&args); } /* diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 01b4db9e5466..a92dd98f8401 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -100,11 +100,10 @@ nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result, nlm_debug_print_fh("creating file for", f); nfserr = nlm_lck_denied_nolocks; - file = (struct nlm_file *) kmalloc(sizeof(*file), GFP_KERNEL); + file = kzalloc(sizeof(*file), GFP_KERNEL); if (!file) goto out_unlock; - memset(file, 0, sizeof(*file)); memcpy(&file->f_handle, f, sizeof(struct nfs_fh)); file->f_hash = hash; init_MUTEX(&file->f_sema); diff --git a/fs/mbcache.c b/fs/mbcache.c index e4fde1ab22cd..0ff71256e65b 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -160,6 +160,7 @@ __mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask) static void __mb_cache_entry_release_unlock(struct mb_cache_entry *ce) + __releases(mb_cache_spinlock) { /* Wake up all processes queuing for this cache entry. */ if (ce->e_queued) diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c index 4a6abc49418e..df6b1075b549 100644 --- a/fs/minix/bitmap.c +++ b/fs/minix/bitmap.c @@ -254,7 +254,7 @@ struct inode * minix_new_inode(const struct inode * dir, int * error) inode->i_gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current->fsgid; inode->i_ino = j; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; - inode->i_blocks = inode->i_blksize = 0; + inode->i_blocks = 0; memset(&minix_i(inode)->u, 0, sizeof(minix_i(inode)->u)); insert_inode_hash(inode); mark_inode_dirty(inode); diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 330ff9fc7cf0..c11a4b9fb863 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -90,8 +90,7 @@ static int init_inodecache(void) static void destroy_inodecache(void) { - if (kmem_cache_destroy(minix_inode_cachep)) - printk(KERN_INFO "minix_inode_cache: not all structures were freed\n"); + kmem_cache_destroy(minix_inode_cachep); } static struct super_operations minix_sops = { @@ -145,11 +144,10 @@ static int minix_fill_super(struct super_block *s, void *data, int silent) struct inode *root_inode; struct minix_sb_info *sbi; - sbi = kmalloc(sizeof(struct minix_sb_info), GFP_KERNEL); + sbi = kzalloc(sizeof(struct minix_sb_info), GFP_KERNEL); if (!sbi) return -ENOMEM; s->s_fs_info = sbi; - memset(sbi, 0, sizeof(struct minix_sb_info)); /* N.B. These should be compile-time tests. Unfortunately that is impossible. */ @@ -207,10 +205,9 @@ static int minix_fill_super(struct super_block *s, void *data, int silent) if (sbi->s_imap_blocks == 0 || sbi->s_zmap_blocks == 0) goto out_illegal_sb; i = (sbi->s_imap_blocks + sbi->s_zmap_blocks) * sizeof(bh); - map = kmalloc(i, GFP_KERNEL); + map = kzalloc(i, GFP_KERNEL); if (!map) goto out_no_map; - memset(map, 0, i); sbi->s_imap = &map[0]; sbi->s_zmap = &map[sbi->s_imap_blocks]; @@ -399,7 +396,7 @@ static void V1_minix_read_inode(struct inode * inode) inode->i_mtime.tv_nsec = 0; inode->i_atime.tv_nsec = 0; inode->i_ctime.tv_nsec = 0; - inode->i_blocks = inode->i_blksize = 0; + inode->i_blocks = 0; for (i = 0; i < 9; i++) minix_inode->u.i1_data[i] = raw_inode->i_zone[i]; minix_set_inode(inode, old_decode_dev(raw_inode->i_zone[0])); @@ -432,7 +429,7 @@ static void V2_minix_read_inode(struct inode * inode) inode->i_mtime.tv_nsec = 0; inode->i_atime.tv_nsec = 0; inode->i_ctime.tv_nsec = 0; - inode->i_blocks = inode->i_blksize = 0; + inode->i_blocks = 0; for (i = 0; i < 10; i++) minix_inode->u.i2_data[i] = raw_inode->i_zone[i]; minix_set_inode(inode, old_decode_dev(raw_inode->i_zone[0])); diff --git a/fs/msdos/namei.c b/fs/msdos/namei.c index 9e44158a7540..d220165d4918 100644 --- a/fs/msdos/namei.c +++ b/fs/msdos/namei.c @@ -280,7 +280,7 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd) { struct super_block *sb = dir->i_sb; - struct inode *inode; + struct inode *inode = NULL; struct fat_slot_info sinfo; struct timespec ts; unsigned char msdos_name[MSDOS_NAME]; @@ -316,6 +316,8 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, int mode, d_instantiate(dentry, inode); out: unlock_kernel(); + if (!err) + err = fat_flush_inodes(sb, dir, inode); return err; } @@ -348,6 +350,8 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry) fat_detach(inode); out: unlock_kernel(); + if (!err) + err = fat_flush_inodes(inode->i_sb, dir, inode); return err; } @@ -401,6 +405,7 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode) d_instantiate(dentry, inode); unlock_kernel(); + fat_flush_inodes(sb, dir, inode); return 0; out_free: @@ -430,6 +435,8 @@ static int msdos_unlink(struct inode *dir, struct dentry *dentry) fat_detach(inode); out: unlock_kernel(); + if (!err) + err = fat_flush_inodes(inode->i_sb, dir, inode); return err; } @@ -635,6 +642,8 @@ static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry, new_dir, new_msdos_name, new_dentry, is_hid); out: unlock_kernel(); + if (!err) + err = fat_flush_inodes(old_dir->i_sb, old_dir, new_dir); return err; } diff --git a/fs/namei.c b/fs/namei.c index 432d6bc6fab0..2892e68d3a86 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -372,6 +372,30 @@ void release_open_intent(struct nameidata *nd) fput(nd->intent.open.file); } +static inline struct dentry * +do_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + int status = dentry->d_op->d_revalidate(dentry, nd); + if (unlikely(status <= 0)) { + /* + * The dentry failed validation. + * If d_revalidate returned 0 attempt to invalidate + * the dentry otherwise d_revalidate is asking us + * to return a fail status. + */ + if (!status) { + if (!d_invalidate(dentry)) { + dput(dentry); + dentry = NULL; + } + } else { + dput(dentry); + dentry = ERR_PTR(status); + } + } + return dentry; +} + /* * Internal lookup() using the new generic dcache. * SMP-safe @@ -386,12 +410,9 @@ static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, if (!dentry) dentry = d_lookup(parent, name); - if (dentry && dentry->d_op && dentry->d_op->d_revalidate) { - if (!dentry->d_op->d_revalidate(dentry, nd) && !d_invalidate(dentry)) { - dput(dentry); - dentry = NULL; - } - } + if (dentry && dentry->d_op && dentry->d_op->d_revalidate) + dentry = do_revalidate(dentry, nd); + return dentry; } @@ -484,10 +505,9 @@ static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, s */ mutex_unlock(&dir->i_mutex); if (result->d_op && result->d_op->d_revalidate) { - if (!result->d_op->d_revalidate(result, nd) && !d_invalidate(result)) { - dput(result); + result = do_revalidate(result, nd); + if (!result) result = ERR_PTR(-ENOENT); - } } return result; } @@ -498,18 +518,20 @@ static int __emul_lookup_dentry(const char *, struct nameidata *); static __always_inline int walk_init_root(const char *name, struct nameidata *nd) { - read_lock(¤t->fs->lock); - if (current->fs->altroot && !(nd->flags & LOOKUP_NOALT)) { - nd->mnt = mntget(current->fs->altrootmnt); - nd->dentry = dget(current->fs->altroot); - read_unlock(¤t->fs->lock); + struct fs_struct *fs = current->fs; + + read_lock(&fs->lock); + if (fs->altroot && !(nd->flags & LOOKUP_NOALT)) { + nd->mnt = mntget(fs->altrootmnt); + nd->dentry = dget(fs->altroot); + read_unlock(&fs->lock); if (__emul_lookup_dentry(name,nd)) return 0; - read_lock(¤t->fs->lock); + read_lock(&fs->lock); } - nd->mnt = mntget(current->fs->rootmnt); - nd->dentry = dget(current->fs->root); - read_unlock(¤t->fs->lock); + nd->mnt = mntget(fs->rootmnt); + nd->dentry = dget(fs->root); + read_unlock(&fs->lock); return 1; } @@ -704,17 +726,19 @@ int follow_down(struct vfsmount **mnt, struct dentry **dentry) static __always_inline void follow_dotdot(struct nameidata *nd) { + struct fs_struct *fs = current->fs; + while(1) { struct vfsmount *parent; struct dentry *old = nd->dentry; - read_lock(¤t->fs->lock); - if (nd->dentry == current->fs->root && - nd->mnt == current->fs->rootmnt) { - read_unlock(¤t->fs->lock); + read_lock(&fs->lock); + if (nd->dentry == fs->root && + nd->mnt == fs->rootmnt) { + read_unlock(&fs->lock); break; } - read_unlock(¤t->fs->lock); + read_unlock(&fs->lock); spin_lock(&dcache_lock); if (nd->dentry != nd->mnt->mnt_root) { nd->dentry = dget(nd->dentry->d_parent); @@ -767,12 +791,12 @@ need_lookup: goto done; need_revalidate: - if (dentry->d_op->d_revalidate(dentry, nd)) - goto done; - if (d_invalidate(dentry)) - goto done; - dput(dentry); - goto need_lookup; + dentry = do_revalidate(dentry, nd); + if (!dentry) + goto need_lookup; + if (IS_ERR(dentry)) + goto fail; + goto done; fail: return PTR_ERR(dentry); @@ -1022,15 +1046,17 @@ static int __emul_lookup_dentry(const char *name, struct nameidata *nd) struct vfsmount *old_mnt = nd->mnt; struct qstr last = nd->last; int last_type = nd->last_type; + struct fs_struct *fs = current->fs; + /* - * NAME was not found in alternate root or it's a directory. Try to find - * it in the normal root: + * NAME was not found in alternate root or it's a directory. + * Try to find it in the normal root: */ nd->last_type = LAST_ROOT; - read_lock(¤t->fs->lock); - nd->mnt = mntget(current->fs->rootmnt); - nd->dentry = dget(current->fs->root); - read_unlock(¤t->fs->lock); + read_lock(&fs->lock); + nd->mnt = mntget(fs->rootmnt); + nd->dentry = dget(fs->root); + read_unlock(&fs->lock); if (path_walk(name, nd) == 0) { if (nd->dentry->d_inode) { dput(old_dentry); @@ -1054,6 +1080,7 @@ void set_fs_altroot(void) struct vfsmount *mnt = NULL, *oldmnt; struct dentry *dentry = NULL, *olddentry; int err; + struct fs_struct *fs = current->fs; if (!emul) goto set_it; @@ -1063,12 +1090,12 @@ void set_fs_altroot(void) dentry = nd.dentry; } set_it: - write_lock(¤t->fs->lock); - oldmnt = current->fs->altrootmnt; - olddentry = current->fs->altroot; - current->fs->altrootmnt = mnt; - current->fs->altroot = dentry; - write_unlock(¤t->fs->lock); + write_lock(&fs->lock); + oldmnt = fs->altrootmnt; + olddentry = fs->altroot; + fs->altrootmnt = mnt; + fs->altroot = dentry; + write_unlock(&fs->lock); if (olddentry) { dput(olddentry); mntput(oldmnt); @@ -1082,29 +1109,30 @@ static int fastcall do_path_lookup(int dfd, const char *name, int retval = 0; int fput_needed; struct file *file; + struct fs_struct *fs = current->fs; nd->last_type = LAST_ROOT; /* if there are only slashes... */ nd->flags = flags; nd->depth = 0; if (*name=='/') { - read_lock(¤t->fs->lock); - if (current->fs->altroot && !(nd->flags & LOOKUP_NOALT)) { - nd->mnt = mntget(current->fs->altrootmnt); - nd->dentry = dget(current->fs->altroot); - read_unlock(¤t->fs->lock); + read_lock(&fs->lock); + if (fs->altroot && !(nd->flags & LOOKUP_NOALT)) { + nd->mnt = mntget(fs->altrootmnt); + nd->dentry = dget(fs->altroot); + read_unlock(&fs->lock); if (__emul_lookup_dentry(name,nd)) goto out; /* found in altroot */ - read_lock(¤t->fs->lock); + read_lock(&fs->lock); } - nd->mnt = mntget(current->fs->rootmnt); - nd->dentry = dget(current->fs->root); - read_unlock(¤t->fs->lock); + nd->mnt = mntget(fs->rootmnt); + nd->dentry = dget(fs->root); + read_unlock(&fs->lock); } else if (dfd == AT_FDCWD) { - read_lock(¤t->fs->lock); - nd->mnt = mntget(current->fs->pwdmnt); - nd->dentry = dget(current->fs->pwd); - read_unlock(¤t->fs->lock); + read_lock(&fs->lock); + nd->mnt = mntget(fs->pwdmnt); + nd->dentry = dget(fs->pwd); + read_unlock(&fs->lock); } else { struct dentry *dentry; @@ -2370,7 +2398,8 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, dput(new_dentry); } if (!error) - d_move(old_dentry,new_dentry); + if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) + d_move(old_dentry,new_dentry); return error; } @@ -2393,8 +2422,7 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, else error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); if (!error) { - /* The following d_move() should become unconditional */ - if (!(old_dir->i_sb->s_type->fs_flags & FS_ODD_RENAME)) + if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) d_move(old_dentry, new_dentry); } if (target) diff --git a/fs/namespace.c b/fs/namespace.c index fa7ed6a9fc2d..6ede3a539ed8 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -13,10 +13,12 @@ #include <linux/sched.h> #include <linux/smp_lock.h> #include <linux/init.h> +#include <linux/kernel.h> #include <linux/quotaops.h> #include <linux/acct.h> #include <linux/capability.h> #include <linux/module.h> +#include <linux/sysfs.h> #include <linux/seq_file.h> #include <linux/namespace.h> #include <linux/namei.h> @@ -28,15 +30,6 @@ extern int __init init_rootfs(void); -#ifdef CONFIG_SYSFS -extern int __init sysfs_init(void); -#else -static inline int sysfs_init(void) -{ - return 0; -} -#endif - /* spinlock for vfsmount related operations, inplace of dcache_lock */ __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock); @@ -1821,6 +1814,7 @@ void __init mnt_init(unsigned long mempages) struct list_head *d; unsigned int nr_hash; int i; + int err; init_rwsem(&namespace_sem); @@ -1861,8 +1855,14 @@ void __init mnt_init(unsigned long mempages) d++; i--; } while (i); - sysfs_init(); - subsystem_register(&fs_subsys); + err = sysfs_init(); + if (err) + printk(KERN_WARNING "%s: sysfs_init error: %d\n", + __FUNCTION__, err); + err = subsystem_register(&fs_subsys); + if (err) + printk(KERN_WARNING "%s: subsystem_register error: %d\n", + __FUNCTION__, err); init_rootfs(); init_mount_tree(); } diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 1ddf77b0b825..42e3bef270c9 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -81,8 +81,7 @@ static int init_inodecache(void) static void destroy_inodecache(void) { - if (kmem_cache_destroy(ncp_inode_cachep)) - printk(KERN_INFO "ncp_inode_cache: not all structures were freed\n"); + kmem_cache_destroy(ncp_inode_cachep); } static int ncp_remount(struct super_block *sb, int *flags, char* data) @@ -224,7 +223,6 @@ static void ncp_set_attr(struct inode *inode, struct ncp_entry_info *nwinfo) inode->i_nlink = 1; inode->i_uid = server->m.uid; inode->i_gid = server->m.gid; - inode->i_blksize = NCP_BLOCK_SIZE; ncp_update_dates(inode, &nwinfo->i); ncp_update_inode(inode, nwinfo); @@ -411,11 +409,10 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) #endif struct ncp_entry_info finfo; - server = kmalloc(sizeof(struct ncp_server), GFP_KERNEL); + server = kzalloc(sizeof(struct ncp_server), GFP_KERNEL); if (!server) return -ENOMEM; sb->s_fs_info = server; - memset(server, 0, sizeof(struct ncp_server)); error = -EFAULT; if (raw_data == NULL) diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c index ca92c2406635..e3d26c1bd105 100644 --- a/fs/ncpfs/symlink.c +++ b/fs/ncpfs/symlink.c @@ -48,7 +48,7 @@ static int ncp_symlink_readpage(struct file *file, struct page *page) char *buf = kmap(page); error = -ENOMEM; - rawlink=(char *)kmalloc(NCP_MAX_SYMLINK_SIZE, GFP_KERNEL); + rawlink = kmalloc(NCP_MAX_SYMLINK_SIZE, GFP_KERNEL); if (!rawlink) goto fail; @@ -126,7 +126,7 @@ int ncp_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { /* EPERM is returned by VFS if symlink procedure does not exist */ return -EPERM; - rawlink=(char *)kmalloc(NCP_MAX_SYMLINK_SIZE, GFP_KERNEL); + rawlink = kmalloc(NCP_MAX_SYMLINK_SIZE, GFP_KERNEL); if (!rawlink) return -ENOMEM; diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 0b572a0c1967..f4580b44eef4 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -4,9 +4,9 @@ obj-$(CONFIG_NFS_FS) += nfs.o -nfs-y := dir.o file.o inode.o super.o nfs2xdr.o pagelist.o \ - proc.o read.o symlink.o unlink.o write.o \ - namespace.o +nfs-y := client.o dir.o file.o getroot.o inode.o super.o nfs2xdr.o \ + pagelist.o proc.o read.o symlink.o unlink.o \ + write.o namespace.o nfs-$(CONFIG_ROOT_NFS) += nfsroot.o mount_clnt.o nfs-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o nfs-$(CONFIG_NFS_V3_ACL) += nfs3acl.o diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index fe0a6b8ac149..a3ee11364db0 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -19,6 +19,7 @@ #include "nfs4_fs.h" #include "callback.h" +#include "internal.h" #define NFSDBG_FACILITY NFSDBG_CALLBACK @@ -36,6 +37,21 @@ static struct svc_program nfs4_callback_program; unsigned int nfs_callback_set_tcpport; unsigned short nfs_callback_tcpport; +static const int nfs_set_port_min = 0; +static const int nfs_set_port_max = 65535; + +static int param_set_port(const char *val, struct kernel_param *kp) +{ + char *endp; + int num = simple_strtol(val, &endp, 0); + if (endp == val || *endp || num < nfs_set_port_min || num > nfs_set_port_max) + return -EINVAL; + *((int *)kp->arg) = num; + return 0; +} + +module_param_call(callback_tcpport, param_set_port, param_get_int, + &nfs_callback_set_tcpport, 0644); /* * This is the callback kernel thread. @@ -134,10 +150,8 @@ out_err: /* * Kill the server process if it is not already up. */ -int nfs_callback_down(void) +void nfs_callback_down(void) { - int ret = 0; - lock_kernel(); mutex_lock(&nfs_callback_mutex); nfs_callback_info.users--; @@ -149,20 +163,19 @@ int nfs_callback_down(void) } while (wait_for_completion_timeout(&nfs_callback_info.stopped, 5*HZ) == 0); mutex_unlock(&nfs_callback_mutex); unlock_kernel(); - return ret; } static int nfs_callback_authenticate(struct svc_rqst *rqstp) { - struct in_addr *addr = &rqstp->rq_addr.sin_addr; - struct nfs4_client *clp; + struct sockaddr_in *addr = &rqstp->rq_addr; + struct nfs_client *clp; /* Don't talk to strangers */ - clp = nfs4_find_client(addr); + clp = nfs_find_client(addr, 4); if (clp == NULL) return SVC_DROP; - dprintk("%s: %u.%u.%u.%u NFSv4 callback!\n", __FUNCTION__, NIPQUAD(addr)); - nfs4_put_client(clp); + dprintk("%s: %u.%u.%u.%u NFSv4 callback!\n", __FUNCTION__, NIPQUAD(addr->sin_addr)); + nfs_put_client(clp); switch (rqstp->rq_authop->flavour) { case RPC_AUTH_NULL: if (rqstp->rq_proc != CB_NULL) diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index b252e7fe53a5..5676163d26e8 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -62,8 +62,13 @@ struct cb_recallargs { extern unsigned nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res); extern unsigned nfs4_callback_recall(struct cb_recallargs *args, void *dummy); +#ifdef CONFIG_NFS_V4 extern int nfs_callback_up(void); -extern int nfs_callback_down(void); +extern void nfs_callback_down(void); +#else +#define nfs_callback_up() (0) +#define nfs_callback_down() do {} while(0) +#endif extern unsigned int nfs_callback_set_tcpport; extern unsigned short nfs_callback_tcpport; diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 7719483ecdfc..97cf8f71451f 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -10,19 +10,20 @@ #include "nfs4_fs.h" #include "callback.h" #include "delegation.h" +#include "internal.h" #define NFSDBG_FACILITY NFSDBG_CALLBACK unsigned nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res) { - struct nfs4_client *clp; + struct nfs_client *clp; struct nfs_delegation *delegation; struct nfs_inode *nfsi; struct inode *inode; res->bitmap[0] = res->bitmap[1] = 0; res->status = htonl(NFS4ERR_BADHANDLE); - clp = nfs4_find_client(&args->addr->sin_addr); + clp = nfs_find_client(args->addr, 4); if (clp == NULL) goto out; inode = nfs_delegation_find_inode(clp, &args->fh); @@ -48,7 +49,7 @@ out_iput: up_read(&nfsi->rwsem); iput(inode); out_putclient: - nfs4_put_client(clp); + nfs_put_client(clp); out: dprintk("%s: exit with status = %d\n", __FUNCTION__, ntohl(res->status)); return res->status; @@ -56,12 +57,12 @@ out: unsigned nfs4_callback_recall(struct cb_recallargs *args, void *dummy) { - struct nfs4_client *clp; + struct nfs_client *clp; struct inode *inode; unsigned res; res = htonl(NFS4ERR_BADHANDLE); - clp = nfs4_find_client(&args->addr->sin_addr); + clp = nfs_find_client(args->addr, 4); if (clp == NULL) goto out; inode = nfs_delegation_find_inode(clp, &args->fh); @@ -80,7 +81,7 @@ unsigned nfs4_callback_recall(struct cb_recallargs *args, void *dummy) } iput(inode); out_putclient: - nfs4_put_client(clp); + nfs_put_client(clp); out: dprintk("%s: exit with status = %d\n", __FUNCTION__, ntohl(res)); return res; diff --git a/fs/nfs/client.c b/fs/nfs/client.c new file mode 100644 index 000000000000..ec1938d4b814 --- /dev/null +++ b/fs/nfs/client.c @@ -0,0 +1,1448 @@ +/* client.c: NFS client sharing and management code + * + * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/init.h> + +#include <linux/time.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/stat.h> +#include <linux/errno.h> +#include <linux/unistd.h> +#include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/stats.h> +#include <linux/sunrpc/metrics.h> +#include <linux/nfs_fs.h> +#include <linux/nfs_mount.h> +#include <linux/nfs4_mount.h> +#include <linux/lockd/bind.h> +#include <linux/smp_lock.h> +#include <linux/seq_file.h> +#include <linux/mount.h> +#include <linux/nfs_idmap.h> +#include <linux/vfs.h> +#include <linux/inet.h> +#include <linux/nfs_xdr.h> + +#include <asm/system.h> + +#include "nfs4_fs.h" +#include "callback.h" +#include "delegation.h" +#include "iostat.h" +#include "internal.h" + +#define NFSDBG_FACILITY NFSDBG_CLIENT + +static DEFINE_SPINLOCK(nfs_client_lock); +static LIST_HEAD(nfs_client_list); +static LIST_HEAD(nfs_volume_list); +static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq); + +/* + * RPC cruft for NFS + */ +static struct rpc_version *nfs_version[5] = { + [2] = &nfs_version2, +#ifdef CONFIG_NFS_V3 + [3] = &nfs_version3, +#endif +#ifdef CONFIG_NFS_V4 + [4] = &nfs_version4, +#endif +}; + +struct rpc_program nfs_program = { + .name = "nfs", + .number = NFS_PROGRAM, + .nrvers = ARRAY_SIZE(nfs_version), + .version = nfs_version, + .stats = &nfs_rpcstat, + .pipe_dir_name = "/nfs", +}; + +struct rpc_stat nfs_rpcstat = { + .program = &nfs_program +}; + + +#ifdef CONFIG_NFS_V3_ACL +static struct rpc_stat nfsacl_rpcstat = { &nfsacl_program }; +static struct rpc_version * nfsacl_version[] = { + [3] = &nfsacl_version3, +}; + +struct rpc_program nfsacl_program = { + .name = "nfsacl", + .number = NFS_ACL_PROGRAM, + .nrvers = ARRAY_SIZE(nfsacl_version), + .version = nfsacl_version, + .stats = &nfsacl_rpcstat, +}; +#endif /* CONFIG_NFS_V3_ACL */ + +/* + * Allocate a shared client record + * + * Since these are allocated/deallocated very rarely, we don't + * bother putting them in a slab cache... + */ +static struct nfs_client *nfs_alloc_client(const char *hostname, + const struct sockaddr_in *addr, + int nfsversion) +{ + struct nfs_client *clp; + int error; + + if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL) + goto error_0; + + error = rpciod_up(); + if (error < 0) { + dprintk("%s: couldn't start rpciod! Error = %d\n", + __FUNCTION__, error); + goto error_1; + } + __set_bit(NFS_CS_RPCIOD, &clp->cl_res_state); + + if (nfsversion == 4) { + if (nfs_callback_up() < 0) + goto error_2; + __set_bit(NFS_CS_CALLBACK, &clp->cl_res_state); + } + + atomic_set(&clp->cl_count, 1); + clp->cl_cons_state = NFS_CS_INITING; + + clp->cl_nfsversion = nfsversion; + memcpy(&clp->cl_addr, addr, sizeof(clp->cl_addr)); + + if (hostname) { + clp->cl_hostname = kstrdup(hostname, GFP_KERNEL); + if (!clp->cl_hostname) + goto error_3; + } + + INIT_LIST_HEAD(&clp->cl_superblocks); + clp->cl_rpcclient = ERR_PTR(-EINVAL); + +#ifdef CONFIG_NFS_V4 + init_rwsem(&clp->cl_sem); + INIT_LIST_HEAD(&clp->cl_delegations); + INIT_LIST_HEAD(&clp->cl_state_owners); + INIT_LIST_HEAD(&clp->cl_unused); + spin_lock_init(&clp->cl_lock); + INIT_WORK(&clp->cl_renewd, nfs4_renew_state, clp); + rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client"); + clp->cl_boot_time = CURRENT_TIME; + clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED; +#endif + + return clp; + +error_3: + if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state)) + nfs_callback_down(); +error_2: + rpciod_down(); + __clear_bit(NFS_CS_RPCIOD, &clp->cl_res_state); +error_1: + kfree(clp); +error_0: + return NULL; +} + +static void nfs4_shutdown_client(struct nfs_client *clp) +{ +#ifdef CONFIG_NFS_V4 + if (__test_and_clear_bit(NFS_CS_RENEWD, &clp->cl_res_state)) + nfs4_kill_renewd(clp); + while (!list_empty(&clp->cl_unused)) { + struct nfs4_state_owner *sp; + + sp = list_entry(clp->cl_unused.next, + struct nfs4_state_owner, + so_list); + list_del(&sp->so_list); + kfree(sp); + } + BUG_ON(!list_empty(&clp->cl_state_owners)); + if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state)) + nfs_idmap_delete(clp); +#endif +} + +/* + * Destroy a shared client record + */ +static void nfs_free_client(struct nfs_client *clp) +{ + dprintk("--> nfs_free_client(%d)\n", clp->cl_nfsversion); + + nfs4_shutdown_client(clp); + + /* -EIO all pending I/O */ + if (!IS_ERR(clp->cl_rpcclient)) + rpc_shutdown_client(clp->cl_rpcclient); + + if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state)) + nfs_callback_down(); + + if (__test_and_clear_bit(NFS_CS_RPCIOD, &clp->cl_res_state)) + rpciod_down(); + + kfree(clp->cl_hostname); + kfree(clp); + + dprintk("<-- nfs_free_client()\n"); +} + +/* + * Release a reference to a shared client record + */ +void nfs_put_client(struct nfs_client *clp) +{ + if (!clp) + return; + + dprintk("--> nfs_put_client({%d})\n", atomic_read(&clp->cl_count)); + + if (atomic_dec_and_lock(&clp->cl_count, &nfs_client_lock)) { + list_del(&clp->cl_share_link); + spin_unlock(&nfs_client_lock); + + BUG_ON(!list_empty(&clp->cl_superblocks)); + + nfs_free_client(clp); + } +} + +/* + * Find a client by address + * - caller must hold nfs_client_lock + */ +static struct nfs_client *__nfs_find_client(const struct sockaddr_in *addr, int nfsversion) +{ + struct nfs_client *clp; + + list_for_each_entry(clp, &nfs_client_list, cl_share_link) { + /* Different NFS versions cannot share the same nfs_client */ + if (clp->cl_nfsversion != nfsversion) + continue; + + if (memcmp(&clp->cl_addr.sin_addr, &addr->sin_addr, + sizeof(clp->cl_addr.sin_addr)) != 0) + continue; + + if (clp->cl_addr.sin_port == addr->sin_port) + goto found; + } + + return NULL; + +found: + atomic_inc(&clp->cl_count); + return clp; +} + +/* + * Find a client by IP address and protocol version + * - returns NULL if no such client + */ +struct nfs_client *nfs_find_client(const struct sockaddr_in *addr, int nfsversion) +{ + struct nfs_client *clp; + + spin_lock(&nfs_client_lock); + clp = __nfs_find_client(addr, nfsversion); + spin_unlock(&nfs_client_lock); + + BUG_ON(clp && clp->cl_cons_state == 0); + + return clp; +} + +/* + * Look up a client by IP address and protocol version + * - creates a new record if one doesn't yet exist + */ +static struct nfs_client *nfs_get_client(const char *hostname, + const struct sockaddr_in *addr, + int nfsversion) +{ + struct nfs_client *clp, *new = NULL; + int error; + + dprintk("--> nfs_get_client(%s,"NIPQUAD_FMT":%d,%d)\n", + hostname ?: "", NIPQUAD(addr->sin_addr), + addr->sin_port, nfsversion); + + /* see if the client already exists */ + do { + spin_lock(&nfs_client_lock); + + clp = __nfs_find_client(addr, nfsversion); + if (clp) + goto found_client; + if (new) + goto install_client; + + spin_unlock(&nfs_client_lock); + + new = nfs_alloc_client(hostname, addr, nfsversion); + } while (new); + + return ERR_PTR(-ENOMEM); + + /* install a new client and return with it unready */ +install_client: + clp = new; + list_add(&clp->cl_share_link, &nfs_client_list); + spin_unlock(&nfs_client_lock); + dprintk("--> nfs_get_client() = %p [new]\n", clp); + return clp; + + /* found an existing client + * - make sure it's ready before returning + */ +found_client: + spin_unlock(&nfs_client_lock); + + if (new) + nfs_free_client(new); + + if (clp->cl_cons_state == NFS_CS_INITING) { + DECLARE_WAITQUEUE(myself, current); + + add_wait_queue(&nfs_client_active_wq, &myself); + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + if (signal_pending(current) || + clp->cl_cons_state > NFS_CS_READY) + break; + schedule(); + } + + remove_wait_queue(&nfs_client_active_wq, &myself); + + if (signal_pending(current)) { + nfs_put_client(clp); + return ERR_PTR(-ERESTARTSYS); + } + } + + if (clp->cl_cons_state < NFS_CS_READY) { + error = clp->cl_cons_state; + nfs_put_client(clp); + return ERR_PTR(error); + } + + BUG_ON(clp->cl_cons_state != NFS_CS_READY); + + dprintk("--> nfs_get_client() = %p [share]\n", clp); + return clp; +} + +/* + * Mark a server as ready or failed + */ +static void nfs_mark_client_ready(struct nfs_client *clp, int state) +{ + clp->cl_cons_state = state; + wake_up_all(&nfs_client_active_wq); +} + +/* + * Initialise the timeout values for a connection + */ +static void nfs_init_timeout_values(struct rpc_timeout *to, int proto, + unsigned int timeo, unsigned int retrans) +{ + to->to_initval = timeo * HZ / 10; + to->to_retries = retrans; + if (!to->to_retries) + to->to_retries = 2; + + switch (proto) { + case IPPROTO_TCP: + if (!to->to_initval) + to->to_initval = 60 * HZ; + if (to->to_initval > NFS_MAX_TCP_TIMEOUT) + to->to_initval = NFS_MAX_TCP_TIMEOUT; + to->to_increment = to->to_initval; + to->to_maxval = to->to_initval + (to->to_increment * to->to_retries); + to->to_exponential = 0; + break; + case IPPROTO_UDP: + default: + if (!to->to_initval) + to->to_initval = 11 * HZ / 10; + if (to->to_initval > NFS_MAX_UDP_TIMEOUT) + to->to_initval = NFS_MAX_UDP_TIMEOUT; + to->to_maxval = NFS_MAX_UDP_TIMEOUT; + to->to_exponential = 1; + break; + } +} + +/* + * Create an RPC client handle + */ +static int nfs_create_rpc_client(struct nfs_client *clp, int proto, + unsigned int timeo, + unsigned int retrans, + rpc_authflavor_t flavor) +{ + struct rpc_timeout timeparms; + struct rpc_clnt *clnt = NULL; + struct rpc_create_args args = { + .protocol = proto, + .address = (struct sockaddr *)&clp->cl_addr, + .addrsize = sizeof(clp->cl_addr), + .timeout = &timeparms, + .servername = clp->cl_hostname, + .program = &nfs_program, + .version = clp->rpc_ops->version, + .authflavor = flavor, + }; + + if (!IS_ERR(clp->cl_rpcclient)) + return 0; + + nfs_init_timeout_values(&timeparms, proto, timeo, retrans); + clp->retrans_timeo = timeparms.to_initval; + clp->retrans_count = timeparms.to_retries; + + clnt = rpc_create(&args); + if (IS_ERR(clnt)) { + dprintk("%s: cannot create RPC client. Error = %ld\n", + __FUNCTION__, PTR_ERR(clnt)); + return PTR_ERR(clnt); + } + + clp->cl_rpcclient = clnt; + return 0; +} + +/* + * Version 2 or 3 client destruction + */ +static void nfs_destroy_server(struct nfs_server *server) +{ + if (!IS_ERR(server->client_acl)) + rpc_shutdown_client(server->client_acl); + + if (!(server->flags & NFS_MOUNT_NONLM)) + lockd_down(); /* release rpc.lockd */ +} + +/* + * Version 2 or 3 lockd setup + */ +static int nfs_start_lockd(struct nfs_server *server) +{ + int error = 0; + + if (server->nfs_client->cl_nfsversion > 3) + goto out; + if (server->flags & NFS_MOUNT_NONLM) + goto out; + error = lockd_up(); + if (error < 0) + server->flags |= NFS_MOUNT_NONLM; + else + server->destroy = nfs_destroy_server; +out: + return error; +} + +/* + * Initialise an NFSv3 ACL client connection + */ +#ifdef CONFIG_NFS_V3_ACL +static void nfs_init_server_aclclient(struct nfs_server *server) +{ + if (server->nfs_client->cl_nfsversion != 3) + goto out_noacl; + if (server->flags & NFS_MOUNT_NOACL) + goto out_noacl; + + server->client_acl = rpc_bind_new_program(server->client, &nfsacl_program, 3); + if (IS_ERR(server->client_acl)) + goto out_noacl; + + /* No errors! Assume that Sun nfsacls are supported */ + server->caps |= NFS_CAP_ACLS; + return; + +out_noacl: + server->caps &= ~NFS_CAP_ACLS; +} +#else +static inline void nfs_init_server_aclclient(struct nfs_server *server) +{ + server->flags &= ~NFS_MOUNT_NOACL; + server->caps &= ~NFS_CAP_ACLS; +} +#endif + +/* + * Create a general RPC client + */ +static int nfs_init_server_rpcclient(struct nfs_server *server, rpc_authflavor_t pseudoflavour) +{ + struct nfs_client *clp = server->nfs_client; + + server->client = rpc_clone_client(clp->cl_rpcclient); + if (IS_ERR(server->client)) { + dprintk("%s: couldn't create rpc_client!\n", __FUNCTION__); + return PTR_ERR(server->client); + } + + if (pseudoflavour != clp->cl_rpcclient->cl_auth->au_flavor) { + struct rpc_auth *auth; + + auth = rpcauth_create(pseudoflavour, server->client); + if (IS_ERR(auth)) { + dprintk("%s: couldn't create credcache!\n", __FUNCTION__); + return PTR_ERR(auth); + } + } + server->client->cl_softrtry = 0; + if (server->flags & NFS_MOUNT_SOFT) + server->client->cl_softrtry = 1; + + server->client->cl_intr = 0; + if (server->flags & NFS4_MOUNT_INTR) + server->client->cl_intr = 1; + + return 0; +} + +/* + * Initialise an NFS2 or NFS3 client + */ +static int nfs_init_client(struct nfs_client *clp, const struct nfs_mount_data *data) +{ + int proto = (data->flags & NFS_MOUNT_TCP) ? IPPROTO_TCP : IPPROTO_UDP; + int error; + + if (clp->cl_cons_state == NFS_CS_READY) { + /* the client is already initialised */ + dprintk("<-- nfs_init_client() = 0 [already %p]\n", clp); + return 0; + } + + /* Check NFS protocol revision and initialize RPC op vector */ + clp->rpc_ops = &nfs_v2_clientops; +#ifdef CONFIG_NFS_V3 + if (clp->cl_nfsversion == 3) + clp->rpc_ops = &nfs_v3_clientops; +#endif + /* + * Create a client RPC handle for doing FSSTAT with UNIX auth only + * - RFC 2623, sec 2.3.2 + */ + error = nfs_create_rpc_client(clp, proto, data->timeo, data->retrans, + RPC_AUTH_UNIX); + if (error < 0) + goto error; + nfs_mark_client_ready(clp, NFS_CS_READY); + return 0; + +error: + nfs_mark_client_ready(clp, error); + dprintk("<-- nfs_init_client() = xerror %d\n", error); + return error; +} + +/* + * Create a version 2 or 3 client + */ +static int nfs_init_server(struct nfs_server *server, const struct nfs_mount_data *data) +{ + struct nfs_client *clp; + int error, nfsvers = 2; + + dprintk("--> nfs_init_server()\n"); + +#ifdef CONFIG_NFS_V3 + if (data->flags & NFS_MOUNT_VER3) + nfsvers = 3; +#endif + + /* Allocate or find a client reference we can use */ + clp = nfs_get_client(data->hostname, &data->addr, nfsvers); + if (IS_ERR(clp)) { + dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp)); + return PTR_ERR(clp); + } + + error = nfs_init_client(clp, data); + if (error < 0) + goto error; + + server->nfs_client = clp; + + /* Initialise the client representation from the mount data */ + server->flags = data->flags & NFS_MOUNT_FLAGMASK; + + if (data->rsize) + server->rsize = nfs_block_size(data->rsize, NULL); + if (data->wsize) + server->wsize = nfs_block_size(data->wsize, NULL); + + server->acregmin = data->acregmin * HZ; + server->acregmax = data->acregmax * HZ; + server->acdirmin = data->acdirmin * HZ; + server->acdirmax = data->acdirmax * HZ; + + /* Start lockd here, before we might error out */ + error = nfs_start_lockd(server); + if (error < 0) + goto error; + + error = nfs_init_server_rpcclient(server, data->pseudoflavor); + if (error < 0) + goto error; + + server->namelen = data->namlen; + /* Create a client RPC handle for the NFSv3 ACL management interface */ + nfs_init_server_aclclient(server); + if (clp->cl_nfsversion == 3) { + if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN) + server->namelen = NFS3_MAXNAMLEN; + server->caps |= NFS_CAP_READDIRPLUS; + } else { + if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN) + server->namelen = NFS2_MAXNAMLEN; + } + + dprintk("<-- nfs_init_server() = 0 [new %p]\n", clp); + return 0; + +error: + server->nfs_client = NULL; + nfs_put_client(clp); + dprintk("<-- nfs_init_server() = xerror %d\n", error); + return error; +} + +/* + * Load up the server record from information gained in an fsinfo record + */ +static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *fsinfo) +{ + unsigned long max_rpc_payload; + + /* Work out a lot of parameters */ + if (server->rsize == 0) + server->rsize = nfs_block_size(fsinfo->rtpref, NULL); + if (server->wsize == 0) + server->wsize = nfs_block_size(fsinfo->wtpref, NULL); + + if (fsinfo->rtmax >= 512 && server->rsize > fsinfo->rtmax) + server->rsize = nfs_block_size(fsinfo->rtmax, NULL); + if (fsinfo->wtmax >= 512 && server->wsize > fsinfo->wtmax) + server->wsize = nfs_block_size(fsinfo->wtmax, NULL); + + max_rpc_payload = nfs_block_size(rpc_max_payload(server->client), NULL); + if (server->rsize > max_rpc_payload) + server->rsize = max_rpc_payload; + if (server->rsize > NFS_MAX_FILE_IO_SIZE) + server->rsize = NFS_MAX_FILE_IO_SIZE; + server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD; + + if (server->wsize > max_rpc_payload) + server->wsize = max_rpc_payload; + if (server->wsize > NFS_MAX_FILE_IO_SIZE) + server->wsize = NFS_MAX_FILE_IO_SIZE; + server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL); + + server->dtsize = nfs_block_size(fsinfo->dtpref, NULL); + if (server->dtsize > PAGE_CACHE_SIZE) + server->dtsize = PAGE_CACHE_SIZE; + if (server->dtsize > server->rsize) + server->dtsize = server->rsize; + + if (server->flags & NFS_MOUNT_NOAC) { + server->acregmin = server->acregmax = 0; + server->acdirmin = server->acdirmax = 0; + } + + server->maxfilesize = fsinfo->maxfilesize; + + /* We're airborne Set socket buffersize */ + rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100); +} + +/* + * Probe filesystem information, including the FSID on v2/v3 + */ +static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fattr *fattr) +{ + struct nfs_fsinfo fsinfo; + struct nfs_client *clp = server->nfs_client; + int error; + + dprintk("--> nfs_probe_fsinfo()\n"); + + if (clp->rpc_ops->set_capabilities != NULL) { + error = clp->rpc_ops->set_capabilities(server, mntfh); + if (error < 0) + goto out_error; + } + + fsinfo.fattr = fattr; + nfs_fattr_init(fattr); + error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo); + if (error < 0) + goto out_error; + + nfs_server_set_fsinfo(server, &fsinfo); + + /* Get some general file system info */ + if (server->namelen == 0) { + struct nfs_pathconf pathinfo; + + pathinfo.fattr = fattr; + nfs_fattr_init(fattr); + + if (clp->rpc_ops->pathconf(server, mntfh, &pathinfo) >= 0) + server->namelen = pathinfo.max_namelen; + } + + dprintk("<-- nfs_probe_fsinfo() = 0\n"); + return 0; + +out_error: + dprintk("nfs_probe_fsinfo: error = %d\n", -error); + return error; +} + +/* + * Copy useful information when duplicating a server record + */ +static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *source) +{ + target->flags = source->flags; + target->acregmin = source->acregmin; + target->acregmax = source->acregmax; + target->acdirmin = source->acdirmin; + target->acdirmax = source->acdirmax; + target->caps = source->caps; +} + +/* + * Allocate and initialise a server record + */ +static struct nfs_server *nfs_alloc_server(void) +{ + struct nfs_server *server; + + server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL); + if (!server) + return NULL; + + server->client = server->client_acl = ERR_PTR(-EINVAL); + + /* Zero out the NFS state stuff */ + INIT_LIST_HEAD(&server->client_link); + INIT_LIST_HEAD(&server->master_link); + + server->io_stats = nfs_alloc_iostats(); + if (!server->io_stats) { + kfree(server); + return NULL; + } + + return server; +} + +/* + * Free up a server record + */ +void nfs_free_server(struct nfs_server *server) +{ + dprintk("--> nfs_free_server()\n"); + + spin_lock(&nfs_client_lock); + list_del(&server->client_link); + list_del(&server->master_link); + spin_unlock(&nfs_client_lock); + + if (server->destroy != NULL) + server->destroy(server); + if (!IS_ERR(server->client)) + rpc_shutdown_client(server->client); + + nfs_put_client(server->nfs_client); + + nfs_free_iostats(server->io_stats); + kfree(server); + nfs_release_automount_timer(); + dprintk("<-- nfs_free_server()\n"); +} + +/* + * Create a version 2 or 3 volume record + * - keyed on server and FSID + */ +struct nfs_server *nfs_create_server(const struct nfs_mount_data *data, + struct nfs_fh *mntfh) +{ + struct nfs_server *server; + struct nfs_fattr fattr; + int error; + + server = nfs_alloc_server(); + if (!server) + return ERR_PTR(-ENOMEM); + + /* Get a client representation */ + error = nfs_init_server(server, data); + if (error < 0) + goto error; + + BUG_ON(!server->nfs_client); + BUG_ON(!server->nfs_client->rpc_ops); + BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); + + /* Probe the root fh to retrieve its FSID */ + error = nfs_probe_fsinfo(server, mntfh, &fattr); + if (error < 0) + goto error; + if (!(fattr.valid & NFS_ATTR_FATTR)) { + error = server->nfs_client->rpc_ops->getattr(server, mntfh, &fattr); + if (error < 0) { + dprintk("nfs_create_server: getattr error = %d\n", -error); + goto error; + } + } + memcpy(&server->fsid, &fattr.fsid, sizeof(server->fsid)); + + dprintk("Server FSID: %llx:%llx\n", + (unsigned long long) server->fsid.major, + (unsigned long long) server->fsid.minor); + + BUG_ON(!server->nfs_client); + BUG_ON(!server->nfs_client->rpc_ops); + BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); + + spin_lock(&nfs_client_lock); + list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks); + list_add_tail(&server->master_link, &nfs_volume_list); + spin_unlock(&nfs_client_lock); + + server->mount_time = jiffies; + return server; + +error: + nfs_free_server(server); + return ERR_PTR(error); +} + +#ifdef CONFIG_NFS_V4 +/* + * Initialise an NFS4 client record + */ +static int nfs4_init_client(struct nfs_client *clp, + int proto, int timeo, int retrans, + rpc_authflavor_t authflavour) +{ + int error; + + if (clp->cl_cons_state == NFS_CS_READY) { + /* the client is initialised already */ + dprintk("<-- nfs4_init_client() = 0 [already %p]\n", clp); + return 0; + } + + /* Check NFS protocol revision and initialize RPC op vector */ + clp->rpc_ops = &nfs_v4_clientops; + + error = nfs_create_rpc_client(clp, proto, timeo, retrans, authflavour); + if (error < 0) + goto error; + + error = nfs_idmap_new(clp); + if (error < 0) { + dprintk("%s: failed to create idmapper. Error = %d\n", + __FUNCTION__, error); + goto error; + } + __set_bit(NFS_CS_IDMAP, &clp->cl_res_state); + + nfs_mark_client_ready(clp, NFS_CS_READY); + return 0; + +error: + nfs_mark_client_ready(clp, error); + dprintk("<-- nfs4_init_client() = xerror %d\n", error); + return error; +} + +/* + * Set up an NFS4 client + */ +static int nfs4_set_client(struct nfs_server *server, + const char *hostname, const struct sockaddr_in *addr, + rpc_authflavor_t authflavour, + int proto, int timeo, int retrans) +{ + struct nfs_client *clp; + int error; + + dprintk("--> nfs4_set_client()\n"); + + /* Allocate or find a client reference we can use */ + clp = nfs_get_client(hostname, addr, 4); + if (IS_ERR(clp)) { + error = PTR_ERR(clp); + goto error; + } + error = nfs4_init_client(clp, proto, timeo, retrans, authflavour); + if (error < 0) + goto error_put; + + server->nfs_client = clp; + dprintk("<-- nfs4_set_client() = 0 [new %p]\n", clp); + return 0; + +error_put: + nfs_put_client(clp); +error: + dprintk("<-- nfs4_set_client() = xerror %d\n", error); + return error; +} + +/* + * Create a version 4 volume record + */ +static int nfs4_init_server(struct nfs_server *server, + const struct nfs4_mount_data *data, rpc_authflavor_t authflavour) +{ + int error; + + dprintk("--> nfs4_init_server()\n"); + + /* Initialise the client representation from the mount data */ + server->flags = data->flags & NFS_MOUNT_FLAGMASK; + server->caps |= NFS_CAP_ATOMIC_OPEN; + + if (data->rsize) + server->rsize = nfs_block_size(data->rsize, NULL); + if (data->wsize) + server->wsize = nfs_block_size(data->wsize, NULL); + + server->acregmin = data->acregmin * HZ; + server->acregmax = data->acregmax * HZ; + server->acdirmin = data->acdirmin * HZ; + server->acdirmax = data->acdirmax * HZ; + + error = nfs_init_server_rpcclient(server, authflavour); + + /* Done */ + dprintk("<-- nfs4_init_server() = %d\n", error); + return error; +} + +/* + * Create a version 4 volume record + * - keyed on server and FSID + */ +struct nfs_server *nfs4_create_server(const struct nfs4_mount_data *data, + const char *hostname, + const struct sockaddr_in *addr, + const char *mntpath, + const char *ip_addr, + rpc_authflavor_t authflavour, + struct nfs_fh *mntfh) +{ + struct nfs_fattr fattr; + struct nfs_server *server; + int error; + + dprintk("--> nfs4_create_server()\n"); + + server = nfs_alloc_server(); + if (!server) + return ERR_PTR(-ENOMEM); + + /* Get a client record */ + error = nfs4_set_client(server, hostname, addr, authflavour, + data->proto, data->timeo, data->retrans); + if (error < 0) + goto error; + + /* set up the general RPC client */ + error = nfs4_init_server(server, data, authflavour); + if (error < 0) + goto error; + + BUG_ON(!server->nfs_client); + BUG_ON(!server->nfs_client->rpc_ops); + BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); + + /* Probe the root fh to retrieve its FSID */ + error = nfs4_path_walk(server, mntfh, mntpath); + if (error < 0) + goto error; + + dprintk("Server FSID: %llx:%llx\n", + (unsigned long long) server->fsid.major, + (unsigned long long) server->fsid.minor); + dprintk("Mount FH: %d\n", mntfh->size); + + error = nfs_probe_fsinfo(server, mntfh, &fattr); + if (error < 0) + goto error; + + BUG_ON(!server->nfs_client); + BUG_ON(!server->nfs_client->rpc_ops); + BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); + + spin_lock(&nfs_client_lock); + list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks); + list_add_tail(&server->master_link, &nfs_volume_list); + spin_unlock(&nfs_client_lock); + + server->mount_time = jiffies; + dprintk("<-- nfs4_create_server() = %p\n", server); + return server; + +error: + nfs_free_server(server); + dprintk("<-- nfs4_create_server() = error %d\n", error); + return ERR_PTR(error); +} + +/* + * Create an NFS4 referral server record + */ +struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, + struct nfs_fh *fh) +{ + struct nfs_client *parent_client; + struct nfs_server *server, *parent_server; + struct nfs_fattr fattr; + int error; + + dprintk("--> nfs4_create_referral_server()\n"); + + server = nfs_alloc_server(); + if (!server) + return ERR_PTR(-ENOMEM); + + parent_server = NFS_SB(data->sb); + parent_client = parent_server->nfs_client; + + /* Get a client representation. + * Note: NFSv4 always uses TCP, */ + error = nfs4_set_client(server, data->hostname, data->addr, + data->authflavor, + parent_server->client->cl_xprt->prot, + parent_client->retrans_timeo, + parent_client->retrans_count); + if (error < 0) + goto error; + + /* Initialise the client representation from the parent server */ + nfs_server_copy_userdata(server, parent_server); + server->caps |= NFS_CAP_ATOMIC_OPEN; + + error = nfs_init_server_rpcclient(server, data->authflavor); + if (error < 0) + goto error; + + BUG_ON(!server->nfs_client); + BUG_ON(!server->nfs_client->rpc_ops); + BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); + + /* probe the filesystem info for this server filesystem */ + error = nfs_probe_fsinfo(server, fh, &fattr); + if (error < 0) + goto error; + + dprintk("Referral FSID: %llx:%llx\n", + (unsigned long long) server->fsid.major, + (unsigned long long) server->fsid.minor); + + spin_lock(&nfs_client_lock); + list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks); + list_add_tail(&server->master_link, &nfs_volume_list); + spin_unlock(&nfs_client_lock); + + server->mount_time = jiffies; + + dprintk("<-- nfs_create_referral_server() = %p\n", server); + return server; + +error: + nfs_free_server(server); + dprintk("<-- nfs4_create_referral_server() = error %d\n", error); + return ERR_PTR(error); +} + +#endif /* CONFIG_NFS_V4 */ + +/* + * Clone an NFS2, NFS3 or NFS4 server record + */ +struct nfs_server *nfs_clone_server(struct nfs_server *source, + struct nfs_fh *fh, + struct nfs_fattr *fattr) +{ + struct nfs_server *server; + struct nfs_fattr fattr_fsinfo; + int error; + + dprintk("--> nfs_clone_server(,%llx:%llx,)\n", + (unsigned long long) fattr->fsid.major, + (unsigned long long) fattr->fsid.minor); + + server = nfs_alloc_server(); + if (!server) + return ERR_PTR(-ENOMEM); + + /* Copy data from the source */ + server->nfs_client = source->nfs_client; + atomic_inc(&server->nfs_client->cl_count); + nfs_server_copy_userdata(server, source); + + server->fsid = fattr->fsid; + + error = nfs_init_server_rpcclient(server, source->client->cl_auth->au_flavor); + if (error < 0) + goto out_free_server; + if (!IS_ERR(source->client_acl)) + nfs_init_server_aclclient(server); + + /* probe the filesystem info for this server filesystem */ + error = nfs_probe_fsinfo(server, fh, &fattr_fsinfo); + if (error < 0) + goto out_free_server; + + dprintk("Cloned FSID: %llx:%llx\n", + (unsigned long long) server->fsid.major, + (unsigned long long) server->fsid.minor); + + error = nfs_start_lockd(server); + if (error < 0) + goto out_free_server; + + spin_lock(&nfs_client_lock); + list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks); + list_add_tail(&server->master_link, &nfs_volume_list); + spin_unlock(&nfs_client_lock); + + server->mount_time = jiffies; + + dprintk("<-- nfs_clone_server() = %p\n", server); + return server; + +out_free_server: + nfs_free_server(server); + dprintk("<-- nfs_clone_server() = error %d\n", error); + return ERR_PTR(error); +} + +#ifdef CONFIG_PROC_FS +static struct proc_dir_entry *proc_fs_nfs; + +static int nfs_server_list_open(struct inode *inode, struct file *file); +static void *nfs_server_list_start(struct seq_file *p, loff_t *pos); +static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos); +static void nfs_server_list_stop(struct seq_file *p, void *v); +static int nfs_server_list_show(struct seq_file *m, void *v); + +static struct seq_operations nfs_server_list_ops = { + .start = nfs_server_list_start, + .next = nfs_server_list_next, + .stop = nfs_server_list_stop, + .show = nfs_server_list_show, +}; + +static struct file_operations nfs_server_list_fops = { + .open = nfs_server_list_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int nfs_volume_list_open(struct inode *inode, struct file *file); +static void *nfs_volume_list_start(struct seq_file *p, loff_t *pos); +static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos); +static void nfs_volume_list_stop(struct seq_file *p, void *v); +static int nfs_volume_list_show(struct seq_file *m, void *v); + +static struct seq_operations nfs_volume_list_ops = { + .start = nfs_volume_list_start, + .next = nfs_volume_list_next, + .stop = nfs_volume_list_stop, + .show = nfs_volume_list_show, +}; + +static struct file_operations nfs_volume_list_fops = { + .open = nfs_volume_list_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +/* + * open "/proc/fs/nfsfs/servers" which provides a summary of servers with which + * we're dealing + */ +static int nfs_server_list_open(struct inode *inode, struct file *file) +{ + struct seq_file *m; + int ret; + + ret = seq_open(file, &nfs_server_list_ops); + if (ret < 0) + return ret; + + m = file->private_data; + m->private = PDE(inode)->data; + + return 0; +} + +/* + * set up the iterator to start reading from the server list and return the first item + */ +static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos) +{ + struct list_head *_p; + loff_t pos = *_pos; + + /* lock the list against modification */ + spin_lock(&nfs_client_lock); + + /* allow for the header line */ + if (!pos) + return SEQ_START_TOKEN; + pos--; + + /* find the n'th element in the list */ + list_for_each(_p, &nfs_client_list) + if (!pos--) + break; + + return _p != &nfs_client_list ? _p : NULL; +} + +/* + * move to next server + */ +static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos) +{ + struct list_head *_p; + + (*pos)++; + + _p = v; + _p = (v == SEQ_START_TOKEN) ? nfs_client_list.next : _p->next; + + return _p != &nfs_client_list ? _p : NULL; +} + +/* + * clean up after reading from the transports list + */ +static void nfs_server_list_stop(struct seq_file *p, void *v) +{ + spin_unlock(&nfs_client_lock); +} + +/* + * display a header line followed by a load of call lines + */ +static int nfs_server_list_show(struct seq_file *m, void *v) +{ + struct nfs_client *clp; + + /* display header on line 1 */ + if (v == SEQ_START_TOKEN) { + seq_puts(m, "NV SERVER PORT USE HOSTNAME\n"); + return 0; + } + + /* display one transport per line on subsequent lines */ + clp = list_entry(v, struct nfs_client, cl_share_link); + + seq_printf(m, "v%d %02x%02x%02x%02x %4hx %3d %s\n", + clp->cl_nfsversion, + NIPQUAD(clp->cl_addr.sin_addr), + ntohs(clp->cl_addr.sin_port), + atomic_read(&clp->cl_count), + clp->cl_hostname); + + return 0; +} + +/* + * open "/proc/fs/nfsfs/volumes" which provides a summary of extant volumes + */ +static int nfs_volume_list_open(struct inode *inode, struct file *file) +{ + struct seq_file *m; + int ret; + + ret = seq_open(file, &nfs_volume_list_ops); + if (ret < 0) + return ret; + + m = file->private_data; + m->private = PDE(inode)->data; + + return 0; +} + +/* + * set up the iterator to start reading from the volume list and return the first item + */ +static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos) +{ + struct list_head *_p; + loff_t pos = *_pos; + + /* lock the list against modification */ + spin_lock(&nfs_client_lock); + + /* allow for the header line */ + if (!pos) + return SEQ_START_TOKEN; + pos--; + + /* find the n'th element in the list */ + list_for_each(_p, &nfs_volume_list) + if (!pos--) + break; + + return _p != &nfs_volume_list ? _p : NULL; +} + +/* + * move to next volume + */ +static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos) +{ + struct list_head *_p; + + (*pos)++; + + _p = v; + _p = (v == SEQ_START_TOKEN) ? nfs_volume_list.next : _p->next; + + return _p != &nfs_volume_list ? _p : NULL; +} + +/* + * clean up after reading from the transports list + */ +static void nfs_volume_list_stop(struct seq_file *p, void *v) +{ + spin_unlock(&nfs_client_lock); +} + +/* + * display a header line followed by a load of call lines + */ +static int nfs_volume_list_show(struct seq_file *m, void *v) +{ + struct nfs_server *server; + struct nfs_client *clp; + char dev[8], fsid[17]; + + /* display header on line 1 */ + if (v == SEQ_START_TOKEN) { + seq_puts(m, "NV SERVER PORT DEV FSID\n"); + return 0; + } + /* display one transport per line on subsequent lines */ + server = list_entry(v, struct nfs_server, master_link); + clp = server->nfs_client; + + snprintf(dev, 8, "%u:%u", + MAJOR(server->s_dev), MINOR(server->s_dev)); + + snprintf(fsid, 17, "%llx:%llx", + (unsigned long long) server->fsid.major, + (unsigned long long) server->fsid.minor); + + seq_printf(m, "v%d %02x%02x%02x%02x %4hx %-7s %-17s\n", + clp->cl_nfsversion, + NIPQUAD(clp->cl_addr.sin_addr), + ntohs(clp->cl_addr.sin_port), + dev, + fsid); + + return 0; +} + +/* + * initialise the /proc/fs/nfsfs/ directory + */ +int __init nfs_fs_proc_init(void) +{ + struct proc_dir_entry *p; + + proc_fs_nfs = proc_mkdir("nfsfs", proc_root_fs); + if (!proc_fs_nfs) + goto error_0; + + proc_fs_nfs->owner = THIS_MODULE; + + /* a file of servers with which we're dealing */ + p = create_proc_entry("servers", S_IFREG|S_IRUGO, proc_fs_nfs); + if (!p) + goto error_1; + + p->proc_fops = &nfs_server_list_fops; + p->owner = THIS_MODULE; + + /* a file of volumes that we have mounted */ + p = create_proc_entry("volumes", S_IFREG|S_IRUGO, proc_fs_nfs); + if (!p) + goto error_2; + + p->proc_fops = &nfs_volume_list_fops; + p->owner = THIS_MODULE; + return 0; + +error_2: + remove_proc_entry("servers", proc_fs_nfs); +error_1: + remove_proc_entry("nfsfs", proc_root_fs); +error_0: + return -ENOMEM; +} + +/* + * clean up the /proc/fs/nfsfs/ directory + */ +void nfs_fs_proc_exit(void) +{ + remove_proc_entry("volumes", proc_fs_nfs); + remove_proc_entry("servers", proc_fs_nfs); + remove_proc_entry("nfsfs", proc_root_fs); +} + +#endif /* CONFIG_PROC_FS */ diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 9540a316c05e..841c99a9b11c 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -18,11 +18,7 @@ #include "nfs4_fs.h" #include "delegation.h" - -static struct nfs_delegation *nfs_alloc_delegation(void) -{ - return (struct nfs_delegation *)kmalloc(sizeof(struct nfs_delegation), GFP_KERNEL); -} +#include "internal.h" static void nfs_free_delegation(struct nfs_delegation *delegation) { @@ -52,7 +48,7 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_ case -NFS4ERR_EXPIRED: /* kill_proc(fl->fl_pid, SIGLOST, 1); */ case -NFS4ERR_STALE_CLIENTID: - nfs4_schedule_state_recovery(NFS_SERVER(inode)->nfs4_state); + nfs4_schedule_state_recovery(NFS_SERVER(inode)->nfs_client); goto out_err; } } @@ -114,7 +110,7 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, st */ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res) { - struct nfs4_client *clp = NFS_SERVER(inode)->nfs4_state; + struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; struct nfs_inode *nfsi = NFS_I(inode); struct nfs_delegation *delegation; int status = 0; @@ -123,7 +119,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct if ((nfsi->cache_validity & (NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_ATTR))) __nfs_revalidate_inode(NFS_SERVER(inode), inode); - delegation = nfs_alloc_delegation(); + delegation = kmalloc(sizeof(*delegation), GFP_KERNEL); if (delegation == NULL) return -ENOMEM; memcpy(delegation->stateid.data, res->delegation.data, @@ -145,7 +141,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct sizeof(delegation->stateid)) != 0 || delegation->type != nfsi->delegation->type) { printk("%s: server %u.%u.%u.%u, handed out a duplicate delegation!\n", - __FUNCTION__, NIPQUAD(clp->cl_addr)); + __FUNCTION__, NIPQUAD(clp->cl_addr.sin_addr)); status = -EIO; } } @@ -176,7 +172,7 @@ static void nfs_msync_inode(struct inode *inode) */ int __nfs_inode_return_delegation(struct inode *inode) { - struct nfs4_client *clp = NFS_SERVER(inode)->nfs4_state; + struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; struct nfs_inode *nfsi = NFS_I(inode); struct nfs_delegation *delegation; int res = 0; @@ -208,7 +204,7 @@ int __nfs_inode_return_delegation(struct inode *inode) */ void nfs_return_all_delegations(struct super_block *sb) { - struct nfs4_client *clp = NFS_SB(sb)->nfs4_state; + struct nfs_client *clp = NFS_SB(sb)->nfs_client; struct nfs_delegation *delegation; struct inode *inode; @@ -232,7 +228,7 @@ restart: int nfs_do_expire_all_delegations(void *ptr) { - struct nfs4_client *clp = ptr; + struct nfs_client *clp = ptr; struct nfs_delegation *delegation; struct inode *inode; @@ -254,11 +250,11 @@ restart: } out: spin_unlock(&clp->cl_lock); - nfs4_put_client(clp); + nfs_put_client(clp); module_put_and_exit(0); } -void nfs_expire_all_delegations(struct nfs4_client *clp) +void nfs_expire_all_delegations(struct nfs_client *clp) { struct task_struct *task; @@ -266,17 +262,17 @@ void nfs_expire_all_delegations(struct nfs4_client *clp) atomic_inc(&clp->cl_count); task = kthread_run(nfs_do_expire_all_delegations, clp, "%u.%u.%u.%u-delegreturn", - NIPQUAD(clp->cl_addr)); + NIPQUAD(clp->cl_addr.sin_addr)); if (!IS_ERR(task)) return; - nfs4_put_client(clp); + nfs_put_client(clp); module_put(THIS_MODULE); } /* * Return all delegations following an NFS4ERR_CB_PATH_DOWN error. */ -void nfs_handle_cb_pathdown(struct nfs4_client *clp) +void nfs_handle_cb_pathdown(struct nfs_client *clp) { struct nfs_delegation *delegation; struct inode *inode; @@ -299,7 +295,7 @@ restart: struct recall_threadargs { struct inode *inode; - struct nfs4_client *clp; + struct nfs_client *clp; const nfs4_stateid *stateid; struct completion started; @@ -310,7 +306,7 @@ static int recall_thread(void *data) { struct recall_threadargs *args = (struct recall_threadargs *)data; struct inode *inode = igrab(args->inode); - struct nfs4_client *clp = NFS_SERVER(inode)->nfs4_state; + struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; struct nfs_inode *nfsi = NFS_I(inode); struct nfs_delegation *delegation; @@ -371,7 +367,7 @@ out_module_put: /* * Retrieve the inode associated with a delegation */ -struct inode *nfs_delegation_find_inode(struct nfs4_client *clp, const struct nfs_fh *fhandle) +struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle) { struct nfs_delegation *delegation; struct inode *res = NULL; @@ -389,7 +385,7 @@ struct inode *nfs_delegation_find_inode(struct nfs4_client *clp, const struct nf /* * Mark all delegations as needing to be reclaimed */ -void nfs_delegation_mark_reclaim(struct nfs4_client *clp) +void nfs_delegation_mark_reclaim(struct nfs_client *clp) { struct nfs_delegation *delegation; spin_lock(&clp->cl_lock); @@ -401,7 +397,7 @@ void nfs_delegation_mark_reclaim(struct nfs4_client *clp) /* * Reap all unclaimed delegations after reboot recovery is done */ -void nfs_delegation_reap_unclaimed(struct nfs4_client *clp) +void nfs_delegation_reap_unclaimed(struct nfs_client *clp) { struct nfs_delegation *delegation, *n; LIST_HEAD(head); @@ -423,7 +419,7 @@ void nfs_delegation_reap_unclaimed(struct nfs4_client *clp) int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode) { - struct nfs4_client *clp = NFS_SERVER(inode)->nfs4_state; + struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; struct nfs_inode *nfsi = NFS_I(inode); struct nfs_delegation *delegation; int res = 0; diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index 3858694652fa..2cfd4b24c7fe 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -29,13 +29,13 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, st int __nfs_inode_return_delegation(struct inode *inode); int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid); -struct inode *nfs_delegation_find_inode(struct nfs4_client *clp, const struct nfs_fh *fhandle); +struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle); void nfs_return_all_delegations(struct super_block *sb); -void nfs_expire_all_delegations(struct nfs4_client *clp); -void nfs_handle_cb_pathdown(struct nfs4_client *clp); +void nfs_expire_all_delegations(struct nfs_client *clp); +void nfs_handle_cb_pathdown(struct nfs_client *clp); -void nfs_delegation_mark_reclaim(struct nfs4_client *clp); -void nfs_delegation_reap_unclaimed(struct nfs4_client *clp); +void nfs_delegation_mark_reclaim(struct nfs_client *clp); +void nfs_delegation_reap_unclaimed(struct nfs_client *clp); /* NFSv4 delegation-related procedures */ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index e7ffb4deb3e5..7432f1a43f3d 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -30,7 +30,9 @@ #include <linux/nfs_mount.h> #include <linux/pagemap.h> #include <linux/smp_lock.h> +#include <linux/pagevec.h> #include <linux/namei.h> +#include <linux/mount.h> #include "nfs4_fs.h" #include "delegation.h" @@ -870,14 +872,14 @@ int nfs_is_exclusive_create(struct inode *dir, struct nameidata *nd) return (nd->intent.open.flags & O_EXCL) != 0; } -static inline int nfs_reval_fsid(struct inode *dir, - struct nfs_fh *fh, struct nfs_fattr *fattr) +static inline int nfs_reval_fsid(struct vfsmount *mnt, struct inode *dir, + struct nfs_fh *fh, struct nfs_fattr *fattr) { struct nfs_server *server = NFS_SERVER(dir); if (!nfs_fsid_equal(&server->fsid, &fattr->fsid)) /* Revalidate fsid on root dir */ - return __nfs_revalidate_inode(server, dir->i_sb->s_root->d_inode); + return __nfs_revalidate_inode(server, mnt->mnt_root->d_inode); return 0; } @@ -902,9 +904,15 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru lock_kernel(); - /* If we're doing an exclusive create, optimize away the lookup */ - if (nfs_is_exclusive_create(dir, nd)) - goto no_entry; + /* + * If we're doing an exclusive create, optimize away the lookup + * but don't hash the dentry. + */ + if (nfs_is_exclusive_create(dir, nd)) { + d_instantiate(dentry, NULL); + res = NULL; + goto out_unlock; + } error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr); if (error == -ENOENT) @@ -913,7 +921,7 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru res = ERR_PTR(error); goto out_unlock; } - error = nfs_reval_fsid(dir, &fhandle, &fattr); + error = nfs_reval_fsid(nd->mnt, dir, &fhandle, &fattr); if (error < 0) { res = ERR_PTR(error); goto out_unlock; @@ -922,8 +930,9 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru res = (struct dentry *)inode; if (IS_ERR(res)) goto out_unlock; + no_entry: - res = d_add_unique(dentry, inode); + res = d_materialise_unique(dentry, inode); if (res != NULL) dentry = res; nfs_renew_times(dentry); @@ -1117,11 +1126,13 @@ static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc) dput(dentry); return NULL; } - alias = d_add_unique(dentry, inode); + + alias = d_materialise_unique(dentry, inode); if (alias != NULL) { dput(dentry); dentry = alias; } + nfs_renew_times(dentry); nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); return dentry; @@ -1143,23 +1154,22 @@ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, struct inode *dir = dentry->d_parent->d_inode; error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); if (error) - goto out_err; + return error; } if (!(fattr->valid & NFS_ATTR_FATTR)) { struct nfs_server *server = NFS_SB(dentry->d_sb); - error = server->rpc_ops->getattr(server, fhandle, fattr); + error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr); if (error < 0) - goto out_err; + return error; } inode = nfs_fhget(dentry->d_sb, fhandle, fattr); error = PTR_ERR(inode); if (IS_ERR(inode)) - goto out_err; + return error; d_instantiate(dentry, inode); + if (d_unhashed(dentry)) + d_rehash(dentry); return 0; -out_err: - d_drop(dentry); - return error; } /* @@ -1440,48 +1450,82 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry) return error; } -static int -nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) +/* + * To create a symbolic link, most file systems instantiate a new inode, + * add a page to it containing the path, then write it out to the disk + * using prepare_write/commit_write. + * + * Unfortunately the NFS client can't create the in-core inode first + * because it needs a file handle to create an in-core inode (see + * fs/nfs/inode.c:nfs_fhget). We only have a file handle *after* the + * symlink request has completed on the server. + * + * So instead we allocate a raw page, copy the symname into it, then do + * the SYMLINK request with the page as the buffer. If it succeeds, we + * now have a new file handle and can instantiate an in-core NFS inode + * and move the raw page into its mapping. + */ +static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { + struct pagevec lru_pvec; + struct page *page; + char *kaddr; struct iattr attr; - struct nfs_fattr sym_attr; - struct nfs_fh sym_fh; - struct qstr qsymname; + unsigned int pathlen = strlen(symname); int error; dfprintk(VFS, "NFS: symlink(%s/%ld, %s, %s)\n", dir->i_sb->s_id, dir->i_ino, dentry->d_name.name, symname); -#ifdef NFS_PARANOIA -if (dentry->d_inode) -printk("nfs_proc_symlink: %s/%s not negative!\n", -dentry->d_parent->d_name.name, dentry->d_name.name); -#endif - /* - * Fill in the sattr for the call. - * Note: SunOS 4.1.2 crashes if the mode isn't initialized! - */ - attr.ia_valid = ATTR_MODE; - attr.ia_mode = S_IFLNK | S_IRWXUGO; + if (pathlen > PAGE_SIZE) + return -ENAMETOOLONG; - qsymname.name = symname; - qsymname.len = strlen(symname); + attr.ia_mode = S_IFLNK | S_IRWXUGO; + attr.ia_valid = ATTR_MODE; lock_kernel(); + + page = alloc_page(GFP_KERNEL); + if (!page) { + unlock_kernel(); + return -ENOMEM; + } + + kaddr = kmap_atomic(page, KM_USER0); + memcpy(kaddr, symname, pathlen); + if (pathlen < PAGE_SIZE) + memset(kaddr + pathlen, 0, PAGE_SIZE - pathlen); + kunmap_atomic(kaddr, KM_USER0); + nfs_begin_data_update(dir); - error = NFS_PROTO(dir)->symlink(dir, &dentry->d_name, &qsymname, - &attr, &sym_fh, &sym_attr); + error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr); nfs_end_data_update(dir); - if (!error) { - error = nfs_instantiate(dentry, &sym_fh, &sym_attr); - } else { - if (error == -EEXIST) - printk("nfs_proc_symlink: %s/%s already exists??\n", - dentry->d_parent->d_name.name, dentry->d_name.name); + if (error != 0) { + dfprintk(VFS, "NFS: symlink(%s/%ld, %s, %s) error %d\n", + dir->i_sb->s_id, dir->i_ino, + dentry->d_name.name, symname, error); d_drop(dentry); + __free_page(page); + unlock_kernel(); + return error; } + + /* + * No big deal if we can't add this page to the page cache here. + * READLINK will get the missing page from the server if needed. + */ + pagevec_init(&lru_pvec, 0); + if (!add_to_page_cache(page, dentry->d_inode->i_mapping, 0, + GFP_KERNEL)) { + if (!pagevec_add(&lru_pvec, page)) + __pagevec_lru_add(&lru_pvec); + SetPageUptodate(page); + unlock_page(page); + } else + __free_page(page); + unlock_kernel(); - return error; + return 0; } static int @@ -1625,8 +1669,7 @@ out: if (rehash) d_rehash(rehash); if (!error) { - if (!S_ISDIR(old_inode->i_mode)) - d_move(old_dentry, new_dentry); + d_move(old_dentry, new_dentry); nfs_renew_times(new_dentry); nfs_set_verifier(new_dentry, nfs_save_change_attribute(new_dir)); } @@ -1638,35 +1681,211 @@ out: return error; } +static DEFINE_SPINLOCK(nfs_access_lru_lock); +static LIST_HEAD(nfs_access_lru_list); +static atomic_long_t nfs_access_nr_entries; + +static void nfs_access_free_entry(struct nfs_access_entry *entry) +{ + put_rpccred(entry->cred); + kfree(entry); + smp_mb__before_atomic_dec(); + atomic_long_dec(&nfs_access_nr_entries); + smp_mb__after_atomic_dec(); +} + +int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask) +{ + LIST_HEAD(head); + struct nfs_inode *nfsi; + struct nfs_access_entry *cache; + + spin_lock(&nfs_access_lru_lock); +restart: + list_for_each_entry(nfsi, &nfs_access_lru_list, access_cache_inode_lru) { + struct inode *inode; + + if (nr_to_scan-- == 0) + break; + inode = igrab(&nfsi->vfs_inode); + if (inode == NULL) + continue; + spin_lock(&inode->i_lock); + if (list_empty(&nfsi->access_cache_entry_lru)) + goto remove_lru_entry; + cache = list_entry(nfsi->access_cache_entry_lru.next, + struct nfs_access_entry, lru); + list_move(&cache->lru, &head); + rb_erase(&cache->rb_node, &nfsi->access_cache); + if (!list_empty(&nfsi->access_cache_entry_lru)) + list_move_tail(&nfsi->access_cache_inode_lru, + &nfs_access_lru_list); + else { +remove_lru_entry: + list_del_init(&nfsi->access_cache_inode_lru); + clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags); + } + spin_unlock(&inode->i_lock); + iput(inode); + goto restart; + } + spin_unlock(&nfs_access_lru_lock); + while (!list_empty(&head)) { + cache = list_entry(head.next, struct nfs_access_entry, lru); + list_del(&cache->lru); + nfs_access_free_entry(cache); + } + return (atomic_long_read(&nfs_access_nr_entries) / 100) * sysctl_vfs_cache_pressure; +} + +static void __nfs_access_zap_cache(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct rb_root *root_node = &nfsi->access_cache; + struct rb_node *n, *dispose = NULL; + struct nfs_access_entry *entry; + + /* Unhook entries from the cache */ + while ((n = rb_first(root_node)) != NULL) { + entry = rb_entry(n, struct nfs_access_entry, rb_node); + rb_erase(n, root_node); + list_del(&entry->lru); + n->rb_left = dispose; + dispose = n; + } + nfsi->cache_validity &= ~NFS_INO_INVALID_ACCESS; + spin_unlock(&inode->i_lock); + + /* Now kill them all! */ + while (dispose != NULL) { + n = dispose; + dispose = n->rb_left; + nfs_access_free_entry(rb_entry(n, struct nfs_access_entry, rb_node)); + } +} + +void nfs_access_zap_cache(struct inode *inode) +{ + /* Remove from global LRU init */ + if (test_and_clear_bit(NFS_INO_ACL_LRU_SET, &NFS_FLAGS(inode))) { + spin_lock(&nfs_access_lru_lock); + list_del_init(&NFS_I(inode)->access_cache_inode_lru); + spin_unlock(&nfs_access_lru_lock); + } + + spin_lock(&inode->i_lock); + /* This will release the spinlock */ + __nfs_access_zap_cache(inode); +} + +static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, struct rpc_cred *cred) +{ + struct rb_node *n = NFS_I(inode)->access_cache.rb_node; + struct nfs_access_entry *entry; + + while (n != NULL) { + entry = rb_entry(n, struct nfs_access_entry, rb_node); + + if (cred < entry->cred) + n = n->rb_left; + else if (cred > entry->cred) + n = n->rb_right; + else + return entry; + } + return NULL; +} + int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res) { struct nfs_inode *nfsi = NFS_I(inode); - struct nfs_access_entry *cache = &nfsi->cache_access; + struct nfs_access_entry *cache; + int err = -ENOENT; - if (cache->cred != cred - || time_after(jiffies, cache->jiffies + NFS_ATTRTIMEO(inode)) - || (nfsi->cache_validity & NFS_INO_INVALID_ACCESS)) - return -ENOENT; - memcpy(res, cache, sizeof(*res)); - return 0; + spin_lock(&inode->i_lock); + if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS) + goto out_zap; + cache = nfs_access_search_rbtree(inode, cred); + if (cache == NULL) + goto out; + if (time_after(jiffies, cache->jiffies + NFS_ATTRTIMEO(inode))) + goto out_stale; + res->jiffies = cache->jiffies; + res->cred = cache->cred; + res->mask = cache->mask; + list_move_tail(&cache->lru, &nfsi->access_cache_entry_lru); + err = 0; +out: + spin_unlock(&inode->i_lock); + return err; +out_stale: + rb_erase(&cache->rb_node, &nfsi->access_cache); + list_del(&cache->lru); + spin_unlock(&inode->i_lock); + nfs_access_free_entry(cache); + return -ENOENT; +out_zap: + /* This will release the spinlock */ + __nfs_access_zap_cache(inode); + return -ENOENT; } -void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) +static void nfs_access_add_rbtree(struct inode *inode, struct nfs_access_entry *set) { struct nfs_inode *nfsi = NFS_I(inode); - struct nfs_access_entry *cache = &nfsi->cache_access; + struct rb_root *root_node = &nfsi->access_cache; + struct rb_node **p = &root_node->rb_node; + struct rb_node *parent = NULL; + struct nfs_access_entry *entry; - if (cache->cred != set->cred) { - if (cache->cred) - put_rpccred(cache->cred); - cache->cred = get_rpccred(set->cred); - } - /* FIXME: replace current access_cache BKL reliance with inode->i_lock */ spin_lock(&inode->i_lock); - nfsi->cache_validity &= ~NFS_INO_INVALID_ACCESS; + while (*p != NULL) { + parent = *p; + entry = rb_entry(parent, struct nfs_access_entry, rb_node); + + if (set->cred < entry->cred) + p = &parent->rb_left; + else if (set->cred > entry->cred) + p = &parent->rb_right; + else + goto found; + } + rb_link_node(&set->rb_node, parent, p); + rb_insert_color(&set->rb_node, root_node); + list_add_tail(&set->lru, &nfsi->access_cache_entry_lru); spin_unlock(&inode->i_lock); + return; +found: + rb_replace_node(parent, &set->rb_node, root_node); + list_add_tail(&set->lru, &nfsi->access_cache_entry_lru); + list_del(&entry->lru); + spin_unlock(&inode->i_lock); + nfs_access_free_entry(entry); +} + +void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) +{ + struct nfs_access_entry *cache = kmalloc(sizeof(*cache), GFP_KERNEL); + if (cache == NULL) + return; + RB_CLEAR_NODE(&cache->rb_node); cache->jiffies = set->jiffies; + cache->cred = get_rpccred(set->cred); cache->mask = set->mask; + + nfs_access_add_rbtree(inode, cache); + + /* Update accounting */ + smp_mb__before_atomic_inc(); + atomic_long_inc(&nfs_access_nr_entries); + smp_mb__after_atomic_inc(); + + /* Add inode to global LRU list */ + if (!test_and_set_bit(NFS_INO_ACL_LRU_SET, &NFS_FLAGS(inode))) { + spin_lock(&nfs_access_lru_lock); + list_add_tail(&NFS_I(inode)->access_cache_inode_lru, &nfs_access_lru_list); + spin_unlock(&nfs_access_lru_lock); + } } static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 76ca1cbc38f9..377839bed172 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -855,6 +855,5 @@ int __init nfs_init_directcache(void) */ void nfs_destroy_directcache(void) { - if (kmem_cache_destroy(nfs_direct_cachep)) - printk(KERN_INFO "nfs_direct_cache: not all structures were freed\n"); + kmem_cache_destroy(nfs_direct_cachep); } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 48e892880d5b..be997d649127 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -111,7 +111,7 @@ nfs_file_open(struct inode *inode, struct file *filp) nfs_inc_stats(inode, NFSIOS_VFSOPEN); lock_kernel(); - res = NFS_SERVER(inode)->rpc_ops->file_open(inode, filp); + res = NFS_PROTO(inode)->file_open(inode, filp); unlock_kernel(); return res; } @@ -157,7 +157,7 @@ force_reval: static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) { /* origin == SEEK_END => we must revalidate the cached file length */ - if (origin == 2) { + if (origin == SEEK_END) { struct inode *inode = filp->f_mapping->host; int retval = nfs_revalidate_file_size(inode, filp); if (retval < 0) diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c new file mode 100644 index 000000000000..76b08ae9ed82 --- /dev/null +++ b/fs/nfs/getroot.c @@ -0,0 +1,311 @@ +/* getroot.c: get the root dentry for an NFS mount + * + * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/init.h> + +#include <linux/time.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/stat.h> +#include <linux/errno.h> +#include <linux/unistd.h> +#include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/stats.h> +#include <linux/nfs_fs.h> +#include <linux/nfs_mount.h> +#include <linux/nfs4_mount.h> +#include <linux/lockd/bind.h> +#include <linux/smp_lock.h> +#include <linux/seq_file.h> +#include <linux/mount.h> +#include <linux/nfs_idmap.h> +#include <linux/vfs.h> +#include <linux/namei.h> +#include <linux/namespace.h> +#include <linux/security.h> + +#include <asm/system.h> +#include <asm/uaccess.h> + +#include "nfs4_fs.h" +#include "delegation.h" +#include "internal.h" + +#define NFSDBG_FACILITY NFSDBG_CLIENT +#define NFS_PARANOIA 1 + +/* + * get an NFS2/NFS3 root dentry from the root filehandle + */ +struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh) +{ + struct nfs_server *server = NFS_SB(sb); + struct nfs_fsinfo fsinfo; + struct nfs_fattr fattr; + struct dentry *mntroot; + struct inode *inode; + int error; + + /* create a dummy root dentry with dummy inode for this superblock */ + if (!sb->s_root) { + struct nfs_fh dummyfh; + struct dentry *root; + struct inode *iroot; + + memset(&dummyfh, 0, sizeof(dummyfh)); + memset(&fattr, 0, sizeof(fattr)); + nfs_fattr_init(&fattr); + fattr.valid = NFS_ATTR_FATTR; + fattr.type = NFDIR; + fattr.mode = S_IFDIR | S_IRUSR | S_IWUSR; + fattr.nlink = 2; + + iroot = nfs_fhget(sb, &dummyfh, &fattr); + if (IS_ERR(iroot)) + return ERR_PTR(PTR_ERR(iroot)); + + root = d_alloc_root(iroot); + if (!root) { + iput(iroot); + return ERR_PTR(-ENOMEM); + } + + sb->s_root = root; + } + + /* get the actual root for this mount */ + fsinfo.fattr = &fattr; + + error = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo); + if (error < 0) { + dprintk("nfs_get_root: getattr error = %d\n", -error); + return ERR_PTR(error); + } + + inode = nfs_fhget(sb, mntfh, fsinfo.fattr); + if (IS_ERR(inode)) { + dprintk("nfs_get_root: get root inode failed\n"); + return ERR_PTR(PTR_ERR(inode)); + } + + /* root dentries normally start off anonymous and get spliced in later + * if the dentry tree reaches them; however if the dentry already + * exists, we'll pick it up at this point and use it as the root + */ + mntroot = d_alloc_anon(inode); + if (!mntroot) { + iput(inode); + dprintk("nfs_get_root: get root dentry failed\n"); + return ERR_PTR(-ENOMEM); + } + + security_d_instantiate(mntroot, inode); + + if (!mntroot->d_op) + mntroot->d_op = server->nfs_client->rpc_ops->dentry_ops; + + return mntroot; +} + +#ifdef CONFIG_NFS_V4 + +/* + * Do a simple pathwalk from the root FH of the server to the nominated target + * of the mountpoint + * - give error on symlinks + * - give error on ".." occurring in the path + * - follow traversals + */ +int nfs4_path_walk(struct nfs_server *server, + struct nfs_fh *mntfh, + const char *path) +{ + struct nfs_fsinfo fsinfo; + struct nfs_fattr fattr; + struct nfs_fh lastfh; + struct qstr name; + int ret; + //int referral_count = 0; + + dprintk("--> nfs4_path_walk(,,%s)\n", path); + + fsinfo.fattr = &fattr; + nfs_fattr_init(&fattr); + + if (*path++ != '/') { + dprintk("nfs4_get_root: Path does not begin with a slash\n"); + return -EINVAL; + } + + /* Start by getting the root filehandle from the server */ + ret = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo); + if (ret < 0) { + dprintk("nfs4_get_root: getroot error = %d\n", -ret); + return ret; + } + + if (fattr.type != NFDIR) { + printk(KERN_ERR "nfs4_get_root:" + " getroot encountered non-directory\n"); + return -ENOTDIR; + } + + if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL) { + printk(KERN_ERR "nfs4_get_root:" + " getroot obtained referral\n"); + return -EREMOTE; + } + +next_component: + dprintk("Next: %s\n", path); + + /* extract the next bit of the path */ + if (!*path) + goto path_walk_complete; + + name.name = path; + while (*path && *path != '/') + path++; + name.len = path - (const char *) name.name; + +eat_dot_dir: + while (*path == '/') + path++; + + if (path[0] == '.' && (path[1] == '/' || !path[1])) { + path += 2; + goto eat_dot_dir; + } + + if (path[0] == '.' && path[1] == '.' && (path[2] == '/' || !path[2]) + ) { + printk(KERN_ERR "nfs4_get_root:" + " Mount path contains reference to \"..\"\n"); + return -EINVAL; + } + + /* lookup the next FH in the sequence */ + memcpy(&lastfh, mntfh, sizeof(lastfh)); + + dprintk("LookupFH: %*.*s [%s]\n", name.len, name.len, name.name, path); + + ret = server->nfs_client->rpc_ops->lookupfh(server, &lastfh, &name, + mntfh, &fattr); + if (ret < 0) { + dprintk("nfs4_get_root: getroot error = %d\n", -ret); + return ret; + } + + if (fattr.type != NFDIR) { + printk(KERN_ERR "nfs4_get_root:" + " lookupfh encountered non-directory\n"); + return -ENOTDIR; + } + + if (fattr.valid & NFS_ATTR_FATTR_V4_REFERRAL) { + printk(KERN_ERR "nfs4_get_root:" + " lookupfh obtained referral\n"); + return -EREMOTE; + } + + goto next_component; + +path_walk_complete: + memcpy(&server->fsid, &fattr.fsid, sizeof(server->fsid)); + dprintk("<-- nfs4_path_walk() = 0\n"); + return 0; +} + +/* + * get an NFS4 root dentry from the root filehandle + */ +struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh) +{ + struct nfs_server *server = NFS_SB(sb); + struct nfs_fattr fattr; + struct dentry *mntroot; + struct inode *inode; + int error; + + dprintk("--> nfs4_get_root()\n"); + + /* create a dummy root dentry with dummy inode for this superblock */ + if (!sb->s_root) { + struct nfs_fh dummyfh; + struct dentry *root; + struct inode *iroot; + + memset(&dummyfh, 0, sizeof(dummyfh)); + memset(&fattr, 0, sizeof(fattr)); + nfs_fattr_init(&fattr); + fattr.valid = NFS_ATTR_FATTR; + fattr.type = NFDIR; + fattr.mode = S_IFDIR | S_IRUSR | S_IWUSR; + fattr.nlink = 2; + + iroot = nfs_fhget(sb, &dummyfh, &fattr); + if (IS_ERR(iroot)) + return ERR_PTR(PTR_ERR(iroot)); + + root = d_alloc_root(iroot); + if (!root) { + iput(iroot); + return ERR_PTR(-ENOMEM); + } + + sb->s_root = root; + } + + /* get the info about the server and filesystem */ + error = nfs4_server_capabilities(server, mntfh); + if (error < 0) { + dprintk("nfs_get_root: getcaps error = %d\n", + -error); + return ERR_PTR(error); + } + + /* get the actual root for this mount */ + error = server->nfs_client->rpc_ops->getattr(server, mntfh, &fattr); + if (error < 0) { + dprintk("nfs_get_root: getattr error = %d\n", -error); + return ERR_PTR(error); + } + + inode = nfs_fhget(sb, mntfh, &fattr); + if (IS_ERR(inode)) { + dprintk("nfs_get_root: get root inode failed\n"); + return ERR_PTR(PTR_ERR(inode)); + } + + /* root dentries normally start off anonymous and get spliced in later + * if the dentry tree reaches them; however if the dentry already + * exists, we'll pick it up at this point and use it as the root + */ + mntroot = d_alloc_anon(inode); + if (!mntroot) { + iput(inode); + dprintk("nfs_get_root: get root dentry failed\n"); + return ERR_PTR(-ENOMEM); + } + + security_d_instantiate(mntroot, inode); + + if (!mntroot->d_op) + mntroot->d_op = server->nfs_client->rpc_ops->dentry_ops; + + dprintk("<-- nfs4_get_root()\n"); + return mntroot; +} + +#endif /* CONFIG_NFS_V4 */ diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index 07a5dd57646e..82ad7110a1c0 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -57,6 +57,20 @@ /* Default cache timeout is 10 minutes */ unsigned int nfs_idmap_cache_timeout = 600 * HZ; +static int param_set_idmap_timeout(const char *val, struct kernel_param *kp) +{ + char *endp; + int num = simple_strtol(val, &endp, 0); + int jif = num * HZ; + if (endp == val || *endp || num < 0 || jif < num) + return -EINVAL; + *((int *)kp->arg) = jif; + return 0; +} + +module_param_call(idmap_cache_timeout, param_set_idmap_timeout, param_get_int, + &nfs_idmap_cache_timeout, 0644); + struct idmap_hashent { unsigned long ih_expires; __u32 ih_id; @@ -70,7 +84,6 @@ struct idmap_hashtable { }; struct idmap { - char idmap_path[48]; struct dentry *idmap_dentry; wait_queue_head_t idmap_wq; struct idmap_msg idmap_im; @@ -94,24 +107,23 @@ static struct rpc_pipe_ops idmap_upcall_ops = { .destroy_msg = idmap_pipe_destroy_msg, }; -void -nfs_idmap_new(struct nfs4_client *clp) +int +nfs_idmap_new(struct nfs_client *clp) { struct idmap *idmap; + int error; - if (clp->cl_idmap != NULL) - return; - if ((idmap = kzalloc(sizeof(*idmap), GFP_KERNEL)) == NULL) - return; + BUG_ON(clp->cl_idmap != NULL); - snprintf(idmap->idmap_path, sizeof(idmap->idmap_path), - "%s/idmap", clp->cl_rpcclient->cl_pathname); + if ((idmap = kzalloc(sizeof(*idmap), GFP_KERNEL)) == NULL) + return -ENOMEM; - idmap->idmap_dentry = rpc_mkpipe(idmap->idmap_path, + idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_dentry, "idmap", idmap, &idmap_upcall_ops, 0); if (IS_ERR(idmap->idmap_dentry)) { + error = PTR_ERR(idmap->idmap_dentry); kfree(idmap); - return; + return error; } mutex_init(&idmap->idmap_lock); @@ -121,10 +133,11 @@ nfs_idmap_new(struct nfs4_client *clp) idmap->idmap_group_hash.h_type = IDMAP_TYPE_GROUP; clp->cl_idmap = idmap; + return 0; } void -nfs_idmap_delete(struct nfs4_client *clp) +nfs_idmap_delete(struct nfs_client *clp) { struct idmap *idmap = clp->cl_idmap; @@ -477,27 +490,27 @@ static unsigned int fnvhash32(const void *buf, size_t buflen) return (hash); } -int nfs_map_name_to_uid(struct nfs4_client *clp, const char *name, size_t namelen, __u32 *uid) +int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid) { struct idmap *idmap = clp->cl_idmap; return nfs_idmap_id(idmap, &idmap->idmap_user_hash, name, namelen, uid); } -int nfs_map_group_to_gid(struct nfs4_client *clp, const char *name, size_t namelen, __u32 *uid) +int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid) { struct idmap *idmap = clp->cl_idmap; return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid); } -int nfs_map_uid_to_name(struct nfs4_client *clp, __u32 uid, char *buf) +int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf) { struct idmap *idmap = clp->cl_idmap; return nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf); } -int nfs_map_gid_to_group(struct nfs4_client *clp, __u32 uid, char *buf) +int nfs_map_gid_to_group(struct nfs_client *clp, __u32 uid, char *buf) { struct idmap *idmap = clp->cl_idmap; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index d349fb2245da..bc9376ca86cd 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -76,19 +76,14 @@ int nfs_write_inode(struct inode *inode, int sync) void nfs_clear_inode(struct inode *inode) { - struct nfs_inode *nfsi = NFS_I(inode); - struct rpc_cred *cred; - /* * The following should never happen... */ BUG_ON(nfs_have_writebacks(inode)); - BUG_ON (!list_empty(&nfsi->open_files)); + BUG_ON(!list_empty(&NFS_I(inode)->open_files)); + BUG_ON(atomic_read(&NFS_I(inode)->data_updates) != 0); nfs_zap_acl_cache(inode); - cred = nfsi->cache_access.cred; - if (cred) - put_rpccred(cred); - BUG_ON(atomic_read(&nfsi->data_updates) != 0); + nfs_access_zap_cache(inode); } /** @@ -242,13 +237,13 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) /* Why so? Because we want revalidate for devices/FIFOs, and * that's precisely what we have in nfs_file_inode_operations. */ - inode->i_op = NFS_SB(sb)->rpc_ops->file_inode_ops; + inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->file_inode_ops; if (S_ISREG(inode->i_mode)) { inode->i_fop = &nfs_file_operations; inode->i_data.a_ops = &nfs_file_aops; inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info; } else if (S_ISDIR(inode->i_mode)) { - inode->i_op = NFS_SB(sb)->rpc_ops->dir_inode_ops; + inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops; inode->i_fop = &nfs_dir_operations; if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS) && fattr->size <= NFS_LIMIT_READDIRPLUS) @@ -282,15 +277,13 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) * report the blocks in 512byte units */ inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); - inode->i_blksize = inode->i_sb->s_blocksize; } else { inode->i_blocks = fattr->du.nfs2.blocks; - inode->i_blksize = fattr->du.nfs2.blocksize; } nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = jiffies; memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); - nfsi->cache_access.cred = NULL; + nfsi->access_cache = RB_ROOT; unlock_new_inode(inode); } else @@ -448,7 +441,7 @@ static struct nfs_open_context *alloc_nfs_open_context(struct vfsmount *mnt, str { struct nfs_open_context *ctx; - ctx = (struct nfs_open_context *)kmalloc(sizeof(*ctx), GFP_KERNEL); + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); if (ctx != NULL) { atomic_set(&ctx->count, 1); ctx->dentry = dget(dentry); @@ -722,13 +715,11 @@ void nfs_end_data_update(struct inode *inode) { struct nfs_inode *nfsi = NFS_I(inode); - if (!nfs_have_delegation(inode, FMODE_READ)) { - /* Directories and symlinks: invalidate page cache */ - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) { - spin_lock(&inode->i_lock); - nfsi->cache_validity |= NFS_INO_INVALID_DATA; - spin_unlock(&inode->i_lock); - } + /* Directories: invalidate page cache */ + if (S_ISDIR(inode->i_mode)) { + spin_lock(&inode->i_lock); + nfsi->cache_validity |= NFS_INO_INVALID_DATA; + spin_unlock(&inode->i_lock); } nfsi->cache_change_attribute = jiffies; atomic_dec(&nfsi->data_updates); @@ -847,6 +838,12 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) * * After an operation that has changed the inode metadata, mark the * attribute cache as being invalid, then try to update it. + * + * NB: if the server didn't return any post op attributes, this + * function will force the retrieval of attributes before the next + * NFS request. Thus it should be used only for operations that + * are expected to change one or more attributes, to avoid + * unnecessary NFS requests and trips through nfs_update_inode(). */ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr) { @@ -970,10 +967,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) * report the blocks in 512byte units */ inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); - inode->i_blksize = inode->i_sb->s_blocksize; } else { inode->i_blocks = fattr->du.nfs2.blocks; - inode->i_blksize = fattr->du.nfs2.blocksize; } if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 && @@ -1025,7 +1020,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) out_fileid: printk(KERN_ERR "NFS: server %s error: fileid changed\n" "fsid %s: expected fileid 0x%Lx, got 0x%Lx\n", - NFS_SERVER(inode)->hostname, inode->i_sb->s_id, + NFS_SERVER(inode)->nfs_client->cl_hostname, inode->i_sb->s_id, (long long)nfsi->fileid, (long long)fattr->fileid); goto out_err; } @@ -1109,6 +1104,8 @@ static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags) INIT_LIST_HEAD(&nfsi->dirty); INIT_LIST_HEAD(&nfsi->commit); INIT_LIST_HEAD(&nfsi->open_files); + INIT_LIST_HEAD(&nfsi->access_cache_entry_lru); + INIT_LIST_HEAD(&nfsi->access_cache_inode_lru); INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC); atomic_set(&nfsi->data_updates, 0); nfsi->ndirty = 0; @@ -1133,8 +1130,7 @@ static int __init nfs_init_inodecache(void) static void nfs_destroy_inodecache(void) { - if (kmem_cache_destroy(nfs_inode_cachep)) - printk(KERN_INFO "nfs_inode_cache: not all structures were freed\n"); + kmem_cache_destroy(nfs_inode_cachep); } /* @@ -1144,6 +1140,10 @@ static int __init init_nfs_fs(void) { int err; + err = nfs_fs_proc_init(); + if (err) + goto out5; + err = nfs_init_nfspagecache(); if (err) goto out4; @@ -1184,6 +1184,8 @@ out2: out3: nfs_destroy_nfspagecache(); out4: + nfs_fs_proc_exit(); +out5: return err; } @@ -1198,6 +1200,7 @@ static void __exit exit_nfs_fs(void) rpc_proc_unregister("nfs"); #endif unregister_nfs_fs(); + nfs_fs_proc_exit(); } /* Not quite true; I just maintain it */ diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index e4f4e5def0fc..bea0b016bd70 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -4,6 +4,18 @@ #include <linux/mount.h> +struct nfs_string; +struct nfs_mount_data; +struct nfs4_mount_data; + +/* Maximum number of readahead requests + * FIXME: this should really be a sysctl so that users may tune it to suit + * their needs. People that do NFS over a slow network, might for + * instance want to reduce it to something closer to 1 for improved + * interactive response. + */ +#define NFS_MAX_READAHEAD (RPC_DEF_SLOT_TABLE - 1) + struct nfs_clone_mount { const struct super_block *sb; const struct dentry *dentry; @@ -15,7 +27,40 @@ struct nfs_clone_mount { rpc_authflavor_t authflavor; }; -/* namespace-nfs4.c */ +/* client.c */ +extern struct rpc_program nfs_program; + +extern void nfs_put_client(struct nfs_client *); +extern struct nfs_client *nfs_find_client(const struct sockaddr_in *, int); +extern struct nfs_server *nfs_create_server(const struct nfs_mount_data *, + struct nfs_fh *); +extern struct nfs_server *nfs4_create_server(const struct nfs4_mount_data *, + const char *, + const struct sockaddr_in *, + const char *, + const char *, + rpc_authflavor_t, + struct nfs_fh *); +extern struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *, + struct nfs_fh *); +extern void nfs_free_server(struct nfs_server *server); +extern struct nfs_server *nfs_clone_server(struct nfs_server *, + struct nfs_fh *, + struct nfs_fattr *); +#ifdef CONFIG_PROC_FS +extern int __init nfs_fs_proc_init(void); +extern void nfs_fs_proc_exit(void); +#else +static inline int nfs_fs_proc_init(void) +{ + return 0; +} +static inline void nfs_fs_proc_exit(void) +{ +} +#endif + +/* nfs4namespace.c */ #ifdef CONFIG_NFS_V4 extern struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry); #else @@ -46,6 +91,7 @@ extern void nfs_destroy_directcache(void); #endif /* nfs2xdr.c */ +extern int nfs_stat_to_errno(int); extern struct rpc_procinfo nfs_procedures[]; extern u32 * nfs_decode_dirent(u32 *, struct nfs_entry *, int); @@ -54,8 +100,9 @@ extern struct rpc_procinfo nfs3_procedures[]; extern u32 *nfs3_decode_dirent(u32 *, struct nfs_entry *, int); /* nfs4xdr.c */ -extern int nfs_stat_to_errno(int); +#ifdef CONFIG_NFS_V4 extern u32 *nfs4_decode_dirent(u32 *p, struct nfs_entry *entry, int plus); +#endif /* nfs4proc.c */ #ifdef CONFIG_NFS_V4 @@ -66,6 +113,9 @@ extern int nfs4_proc_fs_locations(struct inode *dir, struct dentry *dentry, struct page *page); #endif +/* dir.c */ +extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask); + /* inode.c */ extern struct inode *nfs_alloc_inode(struct super_block *sb); extern void nfs_destroy_inode(struct inode *); @@ -76,10 +126,10 @@ extern void nfs4_clear_inode(struct inode *); #endif /* super.c */ -extern struct file_system_type nfs_referral_nfs4_fs_type; -extern struct file_system_type clone_nfs_fs_type; +extern struct file_system_type nfs_xdev_fs_type; #ifdef CONFIG_NFS_V4 -extern struct file_system_type clone_nfs4_fs_type; +extern struct file_system_type nfs4_xdev_fs_type; +extern struct file_system_type nfs4_referral_fs_type; #endif extern struct rpc_stat nfs_rpcstat; @@ -88,30 +138,30 @@ extern int __init register_nfs_fs(void); extern void __exit unregister_nfs_fs(void); /* namespace.c */ -extern char *nfs_path(const char *base, const struct dentry *dentry, +extern char *nfs_path(const char *base, + const struct dentry *droot, + const struct dentry *dentry, char *buffer, ssize_t buflen); -/* - * Determine the mount path as a string - */ -static inline char * -nfs4_path(const struct dentry *dentry, char *buffer, ssize_t buflen) -{ +/* getroot.c */ +extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *); #ifdef CONFIG_NFS_V4 - return nfs_path(NFS_SB(dentry->d_sb)->mnt_path, dentry, buffer, buflen); -#else - return NULL; +extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *); + +extern int nfs4_path_walk(struct nfs_server *server, + struct nfs_fh *mntfh, + const char *path); #endif -} /* * Determine the device name as a string */ static inline char *nfs_devname(const struct vfsmount *mnt_parent, - const struct dentry *dentry, - char *buffer, ssize_t buflen) + const struct dentry *dentry, + char *buffer, ssize_t buflen) { - return nfs_path(mnt_parent->mnt_devname, dentry, buffer, buflen); + return nfs_path(mnt_parent->mnt_devname, mnt_parent->mnt_root, + dentry, buffer, buflen); } /* @@ -167,20 +217,3 @@ void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize) if (sb->s_maxbytes > MAX_LFS_FILESIZE || sb->s_maxbytes <= 0) sb->s_maxbytes = MAX_LFS_FILESIZE; } - -/* - * Check if the string represents a "valid" IPv4 address - */ -static inline int valid_ipaddr4(const char *buf) -{ - int rc, count, in[4]; - - rc = sscanf(buf, "%d.%d.%d.%d", &in[0], &in[1], &in[2], &in[3]); - if (rc != 4) - return -EINVAL; - for (count = 0; count < 4; count++) { - if (in[count] > 255) - return -EINVAL; - } - return 0; -} diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 445abb4d4214..d507b021207f 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -14,7 +14,6 @@ #include <linux/net.h> #include <linux/in.h> #include <linux/sunrpc/clnt.h> -#include <linux/sunrpc/xprt.h> #include <linux/sunrpc/sched.h> #include <linux/nfs_fs.h> @@ -77,22 +76,19 @@ static struct rpc_clnt * mnt_create(char *hostname, struct sockaddr_in *srvaddr, int version, int protocol) { - struct rpc_xprt *xprt; - struct rpc_clnt *clnt; - - xprt = xprt_create_proto(protocol, srvaddr, NULL); - if (IS_ERR(xprt)) - return (struct rpc_clnt *)xprt; - - clnt = rpc_create_client(xprt, hostname, - &mnt_program, version, - RPC_AUTH_UNIX); - if (!IS_ERR(clnt)) { - clnt->cl_softrtry = 1; - clnt->cl_oneshot = 1; - clnt->cl_intr = 1; - } - return clnt; + struct rpc_create_args args = { + .protocol = protocol, + .address = (struct sockaddr *)srvaddr, + .addrsize = sizeof(*srvaddr), + .servername = hostname, + .program = &mnt_program, + .version = version, + .authflavor = RPC_AUTH_UNIX, + .flags = (RPC_CLNT_CREATE_ONESHOT | + RPC_CLNT_CREATE_INTR), + }; + + return rpc_create(&args); } /* diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 86b3169c8cac..60408646176b 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -2,6 +2,7 @@ * linux/fs/nfs/namespace.c * * Copyright (C) 2005 Trond Myklebust <Trond.Myklebust@netapp.com> + * - Modified by David Howells <dhowells@redhat.com> * * NFS namespace */ @@ -25,9 +26,15 @@ LIST_HEAD(nfs_automount_list); static DECLARE_WORK(nfs_automount_task, nfs_expire_automounts, &nfs_automount_list); int nfs_mountpoint_expiry_timeout = 500 * HZ; +static struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent, + const struct dentry *dentry, + struct nfs_fh *fh, + struct nfs_fattr *fattr); + /* * nfs_path - reconstruct the path given an arbitrary dentry * @base - arbitrary string to prepend to the path + * @droot - pointer to root dentry for mountpoint * @dentry - pointer to dentry * @buffer - result buffer * @buflen - length of buffer @@ -38,7 +45,9 @@ int nfs_mountpoint_expiry_timeout = 500 * HZ; * This is mainly for use in figuring out the path on the * server side when automounting on top of an existing partition. */ -char *nfs_path(const char *base, const struct dentry *dentry, +char *nfs_path(const char *base, + const struct dentry *droot, + const struct dentry *dentry, char *buffer, ssize_t buflen) { char *end = buffer+buflen; @@ -47,7 +56,7 @@ char *nfs_path(const char *base, const struct dentry *dentry, *--end = '\0'; buflen--; spin_lock(&dcache_lock); - while (!IS_ROOT(dentry)) { + while (!IS_ROOT(dentry) && dentry != droot) { namelen = dentry->d_name.len; buflen -= namelen + 1; if (buflen < 0) @@ -96,15 +105,18 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) struct nfs_fattr fattr; int err; + dprintk("--> nfs_follow_mountpoint()\n"); + BUG_ON(IS_ROOT(dentry)); dprintk("%s: enter\n", __FUNCTION__); dput(nd->dentry); nd->dentry = dget(dentry); - if (d_mountpoint(nd->dentry)) - goto out_follow; + /* Look it up again */ parent = dget_parent(nd->dentry); - err = server->rpc_ops->lookup(parent->d_inode, &nd->dentry->d_name, &fh, &fattr); + err = server->nfs_client->rpc_ops->lookup(parent->d_inode, + &nd->dentry->d_name, + &fh, &fattr); dput(parent); if (err != 0) goto out_err; @@ -132,6 +144,8 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); out: dprintk("%s: done, returned %d\n", __FUNCTION__, err); + + dprintk("<-- nfs_follow_mountpoint() = %d\n", err); return ERR_PTR(err); out_err: path_release(nd); @@ -172,22 +186,23 @@ void nfs_release_automount_timer(void) /* * Clone a mountpoint of the appropriate type */ -static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server, char *devname, +static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server, + const char *devname, struct nfs_clone_mount *mountdata) { #ifdef CONFIG_NFS_V4 struct vfsmount *mnt = NULL; - switch (server->rpc_ops->version) { + switch (server->nfs_client->cl_nfsversion) { case 2: case 3: - mnt = vfs_kern_mount(&clone_nfs_fs_type, 0, devname, mountdata); + mnt = vfs_kern_mount(&nfs_xdev_fs_type, 0, devname, mountdata); break; case 4: - mnt = vfs_kern_mount(&clone_nfs4_fs_type, 0, devname, mountdata); + mnt = vfs_kern_mount(&nfs4_xdev_fs_type, 0, devname, mountdata); } return mnt; #else - return vfs_kern_mount(&clone_nfs_fs_type, 0, devname, mountdata); + return vfs_kern_mount(&nfs_xdev_fs_type, 0, devname, mountdata); #endif } @@ -199,9 +214,10 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server, char *devn * @fattr - attributes for new root inode * */ -struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent, - const struct dentry *dentry, struct nfs_fh *fh, - struct nfs_fattr *fattr) +static struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent, + const struct dentry *dentry, + struct nfs_fh *fh, + struct nfs_fattr *fattr) { struct nfs_clone_mount mountdata = { .sb = mnt_parent->mnt_sb, @@ -213,6 +229,8 @@ struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent, char *page = (char *) __get_free_page(GFP_USER); char *devname; + dprintk("--> nfs_do_submount()\n"); + dprintk("%s: submounting on %s/%s\n", __FUNCTION__, dentry->d_parent->d_name.name, dentry->d_name.name); @@ -227,5 +245,7 @@ free_page: free_page((unsigned long)page); out: dprintk("%s: done\n", __FUNCTION__); + + dprintk("<-- nfs_do_submount() = %p\n", mnt); return mnt; } diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 67391eef6b93..b49501fc0a79 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -51,7 +51,7 @@ #define NFS_createargs_sz (NFS_diropargs_sz+NFS_sattr_sz) #define NFS_renameargs_sz (NFS_diropargs_sz+NFS_diropargs_sz) #define NFS_linkargs_sz (NFS_fhandle_sz+NFS_diropargs_sz) -#define NFS_symlinkargs_sz (NFS_diropargs_sz+NFS_path_sz+NFS_sattr_sz) +#define NFS_symlinkargs_sz (NFS_diropargs_sz+1+NFS_sattr_sz) #define NFS_readdirargs_sz (NFS_fhandle_sz+2) #define NFS_attrstat_sz (1+NFS_fattr_sz) @@ -351,11 +351,26 @@ nfs_xdr_linkargs(struct rpc_rqst *req, u32 *p, struct nfs_linkargs *args) static int nfs_xdr_symlinkargs(struct rpc_rqst *req, u32 *p, struct nfs_symlinkargs *args) { + struct xdr_buf *sndbuf = &req->rq_snd_buf; + size_t pad; + p = xdr_encode_fhandle(p, args->fromfh); p = xdr_encode_array(p, args->fromname, args->fromlen); - p = xdr_encode_array(p, args->topath, args->tolen); + *p++ = htonl(args->pathlen); + sndbuf->len = xdr_adjust_iovec(sndbuf->head, p); + + xdr_encode_pages(sndbuf, args->pages, 0, args->pathlen); + + /* + * xdr_encode_pages may have added a few bytes to ensure the + * pathname ends on a 4-byte boundary. Start encoding the + * attributes after the pad bytes. + */ + pad = sndbuf->tail->iov_len; + if (pad > 0) + p++; p = xdr_encode_sattr(p, args->sattr); - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); + sndbuf->len += xdr_adjust_iovec(sndbuf->tail, p) - pad; return 0; } diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 7143b1f82cea..3b234d4601e7 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -81,7 +81,7 @@ do_proc_get_root(struct rpc_clnt *client, struct nfs_fh *fhandle, } /* - * Bare-bones access to getattr: this is for nfs_read_super. + * Bare-bones access to getattr: this is for nfs_get_root/nfs_get_sb */ static int nfs3_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, @@ -90,8 +90,8 @@ nfs3_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, int status; status = do_proc_get_root(server->client, fhandle, info); - if (status && server->client_sys != server->client) - status = do_proc_get_root(server->client_sys, fhandle, info); + if (status && server->nfs_client->cl_rpcclient != server->client) + status = do_proc_get_root(server->nfs_client->cl_rpcclient, fhandle, info); return status; } @@ -449,7 +449,7 @@ nfs3_proc_unlink_setup(struct rpc_message *msg, struct dentry *dir, struct qstr struct nfs_fattr res; } *ptr; - ptr = (struct unlinkxdr *)kmalloc(sizeof(*ptr), GFP_KERNEL); + ptr = kmalloc(sizeof(*ptr), GFP_KERNEL); if (!ptr) return -ENOMEM; ptr->arg.fh = NFS_FH(dir->d_inode); @@ -544,23 +544,23 @@ nfs3_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) } static int -nfs3_proc_symlink(struct inode *dir, struct qstr *name, struct qstr *path, - struct iattr *sattr, struct nfs_fh *fhandle, - struct nfs_fattr *fattr) +nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, + unsigned int len, struct iattr *sattr) { - struct nfs_fattr dir_attr; + struct nfs_fh fhandle; + struct nfs_fattr fattr, dir_attr; struct nfs3_symlinkargs arg = { .fromfh = NFS_FH(dir), - .fromname = name->name, - .fromlen = name->len, - .topath = path->name, - .tolen = path->len, + .fromname = dentry->d_name.name, + .fromlen = dentry->d_name.len, + .pages = &page, + .pathlen = len, .sattr = sattr }; struct nfs3_diropres res = { .dir_attr = &dir_attr, - .fh = fhandle, - .fattr = fattr + .fh = &fhandle, + .fattr = &fattr }; struct rpc_message msg = { .rpc_proc = &nfs3_procedures[NFS3PROC_SYMLINK], @@ -569,13 +569,19 @@ nfs3_proc_symlink(struct inode *dir, struct qstr *name, struct qstr *path, }; int status; - if (path->len > NFS3_MAXPATHLEN) + if (len > NFS3_MAXPATHLEN) return -ENAMETOOLONG; - dprintk("NFS call symlink %s -> %s\n", name->name, path->name); + + dprintk("NFS call symlink %s\n", dentry->d_name.name); + nfs_fattr_init(&dir_attr); - nfs_fattr_init(fattr); + nfs_fattr_init(&fattr); status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); nfs_post_op_update_inode(dir, &dir_attr); + if (status != 0) + goto out; + status = nfs_instantiate(dentry, &fhandle, &fattr); +out: dprintk("NFS reply symlink: %d\n", status); return status; } @@ -785,7 +791,7 @@ nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, dprintk("NFS call fsinfo\n"); nfs_fattr_init(info->fattr); - status = rpc_call_sync(server->client_sys, &msg, 0); + status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0); dprintk("NFS reply fsinfo: %d\n", status); return status; } @@ -886,7 +892,7 @@ nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl) return nlmclnt_proc(filp->f_dentry->d_inode, cmd, fl); } -struct nfs_rpc_ops nfs_v3_clientops = { +const struct nfs_rpc_ops nfs_v3_clientops = { .version = 3, /* protocol version */ .dentry_ops = &nfs_dentry_operations, .dir_inode_ops = &nfs3_dir_inode_operations, diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 0250269e9753..16556fa4effb 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -56,7 +56,7 @@ #define NFS3_writeargs_sz (NFS3_fh_sz+5) #define NFS3_createargs_sz (NFS3_diropargs_sz+NFS3_sattr_sz) #define NFS3_mkdirargs_sz (NFS3_diropargs_sz+NFS3_sattr_sz) -#define NFS3_symlinkargs_sz (NFS3_diropargs_sz+NFS3_path_sz+NFS3_sattr_sz) +#define NFS3_symlinkargs_sz (NFS3_diropargs_sz+1+NFS3_sattr_sz) #define NFS3_mknodargs_sz (NFS3_diropargs_sz+2+NFS3_sattr_sz) #define NFS3_renameargs_sz (NFS3_diropargs_sz+NFS3_diropargs_sz) #define NFS3_linkargs_sz (NFS3_fh_sz+NFS3_diropargs_sz) @@ -398,8 +398,11 @@ nfs3_xdr_symlinkargs(struct rpc_rqst *req, u32 *p, struct nfs3_symlinkargs *args p = xdr_encode_fhandle(p, args->fromfh); p = xdr_encode_array(p, args->fromname, args->fromlen); p = xdr_encode_sattr(p, args->sattr); - p = xdr_encode_array(p, args->topath, args->tolen); + *p++ = htonl(args->pathlen); req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); + + /* Copy the page */ + xdr_encode_pages(&req->rq_snd_buf, args->pages, 0, args->pathlen); return 0; } diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 9a102860df37..61095fe4b5ca 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -43,55 +43,6 @@ enum nfs4_client_state { }; /* - * The nfs4_client identifies our client state to the server. - */ -struct nfs4_client { - struct list_head cl_servers; /* Global list of servers */ - struct in_addr cl_addr; /* Server identifier */ - u64 cl_clientid; /* constant */ - nfs4_verifier cl_confirm; - unsigned long cl_state; - - u32 cl_lockowner_id; - - /* - * The following rwsem ensures exclusive access to the server - * while we recover the state following a lease expiration. - */ - struct rw_semaphore cl_sem; - - struct list_head cl_delegations; - struct list_head cl_state_owners; - struct list_head cl_unused; - int cl_nunused; - spinlock_t cl_lock; - atomic_t cl_count; - - struct rpc_clnt * cl_rpcclient; - - struct list_head cl_superblocks; /* List of nfs_server structs */ - - unsigned long cl_lease_time; - unsigned long cl_last_renewal; - struct work_struct cl_renewd; - struct work_struct cl_recoverd; - - struct rpc_wait_queue cl_rpcwaitq; - - /* used for the setclientid verifier */ - struct timespec cl_boot_time; - - /* idmapper */ - struct idmap * cl_idmap; - - /* Our own IP address, as a null-terminated string. - * This is used to generate the clientid, and the callback address. - */ - char cl_ipaddr[16]; - unsigned char cl_id_uniquifier; -}; - -/* * struct rpc_sequence ensures that RPC calls are sent in the exact * order that they appear on the list. */ @@ -127,7 +78,7 @@ static inline void nfs_confirm_seqid(struct nfs_seqid_counter *seqid, int status struct nfs4_state_owner { spinlock_t so_lock; struct list_head so_list; /* per-clientid list of state_owners */ - struct nfs4_client *so_client; + struct nfs_client *so_client; u32 so_id; /* 32-bit identifier, unique */ atomic_t so_count; @@ -210,10 +161,10 @@ extern ssize_t nfs4_listxattr(struct dentry *, char *, size_t); /* nfs4proc.c */ extern int nfs4_map_errors(int err); -extern int nfs4_proc_setclientid(struct nfs4_client *, u32, unsigned short, struct rpc_cred *); -extern int nfs4_proc_setclientid_confirm(struct nfs4_client *, struct rpc_cred *); -extern int nfs4_proc_async_renew(struct nfs4_client *, struct rpc_cred *); -extern int nfs4_proc_renew(struct nfs4_client *, struct rpc_cred *); +extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *); +extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct rpc_cred *); +extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *); +extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *); extern int nfs4_do_close(struct inode *inode, struct nfs4_state *state); extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *); extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *); @@ -231,19 +182,14 @@ extern const u32 nfs4_fsinfo_bitmap[2]; extern const u32 nfs4_fs_locations_bitmap[2]; /* nfs4renewd.c */ -extern void nfs4_schedule_state_renewal(struct nfs4_client *); +extern void nfs4_schedule_state_renewal(struct nfs_client *); extern void nfs4_renewd_prepare_shutdown(struct nfs_server *); -extern void nfs4_kill_renewd(struct nfs4_client *); +extern void nfs4_kill_renewd(struct nfs_client *); extern void nfs4_renew_state(void *); /* nfs4state.c */ -extern void init_nfsv4_state(struct nfs_server *); -extern void destroy_nfsv4_state(struct nfs_server *); -extern struct nfs4_client *nfs4_get_client(struct in_addr *); -extern void nfs4_put_client(struct nfs4_client *clp); -extern struct nfs4_client *nfs4_find_client(struct in_addr *); -struct rpc_cred *nfs4_get_renew_cred(struct nfs4_client *clp); -extern u32 nfs4_alloc_lockowner_id(struct nfs4_client *); +struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp); +extern u32 nfs4_alloc_lockowner_id(struct nfs_client *); extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *); extern void nfs4_put_state_owner(struct nfs4_state_owner *); @@ -252,7 +198,7 @@ extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state extern void nfs4_put_open_state(struct nfs4_state *); extern void nfs4_close_state(struct nfs4_state *, mode_t); extern void nfs4_state_set_mode_locked(struct nfs4_state *, mode_t); -extern void nfs4_schedule_state_recovery(struct nfs4_client *); +extern void nfs4_schedule_state_recovery(struct nfs_client *); extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t); @@ -276,10 +222,6 @@ extern struct svc_version nfs4_callback_version1; #else -#define init_nfsv4_state(server) do { } while (0) -#define destroy_nfsv4_state(server) do { } while (0) -#define nfs4_put_state_owner(inode, owner) do { } while (0) -#define nfs4_put_open_state(state) do { } while (0) #define nfs4_close_state(a, b) do { } while (0) #endif /* CONFIG_NFS_V4 */ diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index ea38d27b74e6..24e47f3bbd17 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -2,6 +2,7 @@ * linux/fs/nfs/nfs4namespace.c * * Copyright (C) 2005 Trond Myklebust <Trond.Myklebust@netapp.com> + * - Modified by David Howells <dhowells@redhat.com> * * NFSv4 namespace */ @@ -23,7 +24,7 @@ /* * Check if fs_root is valid */ -static inline char *nfs4_pathname_string(struct nfs4_pathname *pathname, +static inline char *nfs4_pathname_string(const struct nfs4_pathname *pathname, char *buffer, ssize_t buflen) { char *end = buffer + buflen; @@ -34,7 +35,7 @@ static inline char *nfs4_pathname_string(struct nfs4_pathname *pathname, n = pathname->ncomponents; while (--n >= 0) { - struct nfs4_string *component = &pathname->components[n]; + const struct nfs4_string *component = &pathname->components[n]; buflen -= component->len + 1; if (buflen < 0) goto Elong; @@ -47,6 +48,68 @@ Elong: return ERR_PTR(-ENAMETOOLONG); } +/* + * Determine the mount path as a string + */ +static char *nfs4_path(const struct vfsmount *mnt_parent, + const struct dentry *dentry, + char *buffer, ssize_t buflen) +{ + const char *srvpath; + + srvpath = strchr(mnt_parent->mnt_devname, ':'); + if (srvpath) + srvpath++; + else + srvpath = mnt_parent->mnt_devname; + + return nfs_path(srvpath, mnt_parent->mnt_root, dentry, buffer, buflen); +} + +/* + * Check that fs_locations::fs_root [RFC3530 6.3] is a prefix for what we + * believe to be the server path to this dentry + */ +static int nfs4_validate_fspath(const struct vfsmount *mnt_parent, + const struct dentry *dentry, + const struct nfs4_fs_locations *locations, + char *page, char *page2) +{ + const char *path, *fs_path; + + path = nfs4_path(mnt_parent, dentry, page, PAGE_SIZE); + if (IS_ERR(path)) + return PTR_ERR(path); + + fs_path = nfs4_pathname_string(&locations->fs_path, page2, PAGE_SIZE); + if (IS_ERR(fs_path)) + return PTR_ERR(fs_path); + + if (strncmp(path, fs_path, strlen(fs_path)) != 0) { + dprintk("%s: path %s does not begin with fsroot %s\n", + __FUNCTION__, path, fs_path); + return -ENOENT; + } + + return 0; +} + +/* + * Check if the string represents a "valid" IPv4 address + */ +static inline int valid_ipaddr4(const char *buf) +{ + int rc, count, in[4]; + + rc = sscanf(buf, "%d.%d.%d.%d", &in[0], &in[1], &in[2], &in[3]); + if (rc != 4) + return -EINVAL; + for (count = 0; count < 4; count++) { + if (in[count] > 255) + return -EINVAL; + } + return 0; +} /** * nfs_follow_referral - set up mountpoint when hitting a referral on moved error @@ -60,7 +123,7 @@ Elong: */ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent, const struct dentry *dentry, - struct nfs4_fs_locations *locations) + const struct nfs4_fs_locations *locations) { struct vfsmount *mnt = ERR_PTR(-ENOENT); struct nfs_clone_mount mountdata = { @@ -68,10 +131,9 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent, .dentry = dentry, .authflavor = NFS_SB(mnt_parent->mnt_sb)->client->cl_auth->au_flavor, }; - char *page, *page2; - char *path, *fs_path; + char *page = NULL, *page2 = NULL; char *devname; - int loc, s; + int loc, s, error; if (locations == NULL || locations->nlocations <= 0) goto out; @@ -79,36 +141,30 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent, dprintk("%s: referral at %s/%s\n", __FUNCTION__, dentry->d_parent->d_name.name, dentry->d_name.name); - /* Ensure fs path is a prefix of current dentry path */ page = (char *) __get_free_page(GFP_USER); - if (page == NULL) + if (!page) goto out; + page2 = (char *) __get_free_page(GFP_USER); - if (page2 == NULL) + if (!page2) goto out; - path = nfs4_path(dentry, page, PAGE_SIZE); - if (IS_ERR(path)) - goto out_free; - - fs_path = nfs4_pathname_string(&locations->fs_path, page2, PAGE_SIZE); - if (IS_ERR(fs_path)) - goto out_free; - - if (strncmp(path, fs_path, strlen(fs_path)) != 0) { - dprintk("%s: path %s does not begin with fsroot %s\n", __FUNCTION__, path, fs_path); - goto out_free; + /* Ensure fs path is a prefix of current dentry path */ + error = nfs4_validate_fspath(mnt_parent, dentry, locations, page, page2); + if (error < 0) { + mnt = ERR_PTR(error); + goto out; } devname = nfs_devname(mnt_parent, dentry, page, PAGE_SIZE); if (IS_ERR(devname)) { mnt = (struct vfsmount *)devname; - goto out_free; + goto out; } loc = 0; while (loc < locations->nlocations && IS_ERR(mnt)) { - struct nfs4_fs_location *location = &locations->locations[loc]; + const struct nfs4_fs_location *location = &locations->locations[loc]; char *mnt_path; if (location == NULL || location->nservers <= 0 || @@ -140,7 +196,7 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent, addr.sin_port = htons(NFS_PORT); mountdata.addr = &addr; - mnt = vfs_kern_mount(&nfs_referral_nfs4_fs_type, 0, devname, &mountdata); + mnt = vfs_kern_mount(&nfs4_referral_fs_type, 0, devname, &mountdata); if (!IS_ERR(mnt)) { break; } @@ -149,10 +205,9 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent, loc++; } -out_free: - free_page((unsigned long)page); - free_page((unsigned long)page2); out: + free_page((unsigned long) page); + free_page((unsigned long) page2); dprintk("%s: done\n", __FUNCTION__); return mnt; } @@ -165,7 +220,7 @@ out: */ struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry) { - struct vfsmount *mnt = ERR_PTR(-ENOENT); + struct vfsmount *mnt = ERR_PTR(-ENOMEM); struct dentry *parent; struct nfs4_fs_locations *fs_locations = NULL; struct page *page; @@ -183,11 +238,16 @@ struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentr goto out_free; /* Get locations */ + mnt = ERR_PTR(-ENOENT); + parent = dget_parent(dentry); - dprintk("%s: getting locations for %s/%s\n", __FUNCTION__, parent->d_name.name, dentry->d_name.name); + dprintk("%s: getting locations for %s/%s\n", + __FUNCTION__, parent->d_name.name, dentry->d_name.name); + err = nfs4_proc_fs_locations(parent->d_inode, dentry, fs_locations, page); dput(parent); - if (err != 0 || fs_locations->nlocations <= 0 || + if (err != 0 || + fs_locations->nlocations <= 0 || fs_locations->fs_path.ncomponents <= 0) goto out_free; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index b14145b7b87f..47c7e6e3910d 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -55,7 +55,7 @@ #define NFSDBG_FACILITY NFSDBG_PROC -#define NFS4_POLL_RETRY_MIN (1*HZ) +#define NFS4_POLL_RETRY_MIN (HZ/10) #define NFS4_POLL_RETRY_MAX (15*HZ) struct nfs4_opendata; @@ -64,7 +64,7 @@ static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinf static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *); static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry); static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception); -static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs4_client *clp); +static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs_client *clp); /* Prevent leaks of NFSv4 errors into userland */ int nfs4_map_errors(int err) @@ -195,7 +195,7 @@ static void nfs4_setup_readdir(u64 cookie, u32 *verifier, struct dentry *dentry, static void renew_lease(const struct nfs_server *server, unsigned long timestamp) { - struct nfs4_client *clp = server->nfs4_state; + struct nfs_client *clp = server->nfs_client; spin_lock(&clp->cl_lock); if (time_before(clp->cl_last_renewal,timestamp)) clp->cl_last_renewal = timestamp; @@ -252,7 +252,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, atomic_inc(&sp->so_count); p->o_arg.fh = NFS_FH(dir); p->o_arg.open_flags = flags, - p->o_arg.clientid = server->nfs4_state->cl_clientid; + p->o_arg.clientid = server->nfs_client->cl_clientid; p->o_arg.id = sp->so_id; p->o_arg.name = &dentry->d_name; p->o_arg.server = server; @@ -550,7 +550,7 @@ int nfs4_open_delegation_recall(struct dentry *dentry, struct nfs4_state *state) case -NFS4ERR_STALE_STATEID: case -NFS4ERR_EXPIRED: /* Don't recall a delegation if it was lost */ - nfs4_schedule_state_recovery(server->nfs4_state); + nfs4_schedule_state_recovery(server->nfs_client); return err; } err = nfs4_handle_exception(server, err, &exception); @@ -758,7 +758,7 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) } nfs_confirm_seqid(&data->owner->so_seqid, 0); if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) - return server->rpc_ops->getattr(server, &o_res->fh, o_res->f_attr); + return server->nfs_client->rpc_ops->getattr(server, &o_res->fh, o_res->f_attr); return 0; } @@ -792,11 +792,18 @@ out: int nfs4_recover_expired_lease(struct nfs_server *server) { - struct nfs4_client *clp = server->nfs4_state; + struct nfs_client *clp = server->nfs_client; + int ret; - if (test_and_clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) + for (;;) { + ret = nfs4_wait_clnt_recover(server->client, clp); + if (ret != 0) + return ret; + if (!test_and_clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) + break; nfs4_schedule_state_recovery(clp); - return nfs4_wait_clnt_recover(server->client, clp); + } + return 0; } /* @@ -867,7 +874,7 @@ static int _nfs4_open_delegated(struct inode *inode, int flags, struct rpc_cred { struct nfs_delegation *delegation; struct nfs_server *server = NFS_SERVER(inode); - struct nfs4_client *clp = server->nfs4_state; + struct nfs_client *clp = server->nfs_client; struct nfs_inode *nfsi = NFS_I(inode); struct nfs4_state_owner *sp = NULL; struct nfs4_state *state = NULL; @@ -953,7 +960,7 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, st struct nfs4_state_owner *sp; struct nfs4_state *state = NULL; struct nfs_server *server = NFS_SERVER(dir); - struct nfs4_client *clp = server->nfs4_state; + struct nfs_client *clp = server->nfs_client; struct nfs4_opendata *opendata; int status; @@ -1133,7 +1140,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data) break; case -NFS4ERR_STALE_STATEID: case -NFS4ERR_EXPIRED: - nfs4_schedule_state_recovery(server->nfs4_state); + nfs4_schedule_state_recovery(server->nfs_client); break; default: if (nfs4_async_handle_error(task, server) == -EAGAIN) { @@ -1268,7 +1275,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd) BUG_ON(nd->intent.open.flags & O_CREAT); } - cred = rpcauth_lookupcred(NFS_SERVER(dir)->client->cl_auth, 0); + cred = rpcauth_lookupcred(NFS_CLIENT(dir)->cl_auth, 0); if (IS_ERR(cred)) return (struct dentry *)cred; state = nfs4_do_open(dir, dentry, nd->intent.open.flags, &attr, cred); @@ -1291,7 +1298,7 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st struct rpc_cred *cred; struct nfs4_state *state; - cred = rpcauth_lookupcred(NFS_SERVER(dir)->client->cl_auth, 0); + cred = rpcauth_lookupcred(NFS_CLIENT(dir)->cl_auth, 0); if (IS_ERR(cred)) return PTR_ERR(cred); state = nfs4_open_delegated(dentry->d_inode, openflags, cred); @@ -1393,70 +1400,19 @@ static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, return err; } +/* + * get the file handle for the "/" directory on the server + */ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fsinfo *info) + struct nfs_fsinfo *info) { - struct nfs_fattr * fattr = info->fattr; - unsigned char * p; - struct qstr q; - struct nfs4_lookup_arg args = { - .dir_fh = fhandle, - .name = &q, - .bitmask = nfs4_fattr_bitmap, - }; - struct nfs4_lookup_res res = { - .server = server, - .fattr = fattr, - .fh = fhandle, - }; - struct rpc_message msg = { - .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOOKUP], - .rpc_argp = &args, - .rpc_resp = &res, - }; int status; - /* - * Now we do a separate LOOKUP for each component of the mount path. - * The LOOKUPs are done separately so that we can conveniently - * catch an ERR_WRONGSEC if it occurs along the way... - */ status = nfs4_lookup_root(server, fhandle, info); - if (status) - goto out; - - p = server->mnt_path; - for (;;) { - struct nfs4_exception exception = { }; - - while (*p == '/') - p++; - if (!*p) - break; - q.name = p; - while (*p && (*p != '/')) - p++; - q.len = p - q.name; - - do { - nfs_fattr_init(fattr); - status = nfs4_handle_exception(server, - rpc_call_sync(server->client, &msg, 0), - &exception); - } while (exception.retry); - if (status == 0) - continue; - if (status == -ENOENT) { - printk(KERN_NOTICE "NFS: mount path %s does not exist!\n", server->mnt_path); - printk(KERN_NOTICE "NFS: suggestion: try mounting '/' instead.\n"); - } - break; - } if (status == 0) status = nfs4_server_capabilities(server, fhandle); if (status == 0) status = nfs4_do_fsinfo(server, fhandle, info); -out: return nfs4_map_errors(status); } @@ -1565,7 +1521,7 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, nfs_fattr_init(fattr); - cred = rpcauth_lookupcred(NFS_SERVER(inode)->client->cl_auth, 0); + cred = rpcauth_lookupcred(NFS_CLIENT(inode)->cl_auth, 0); if (IS_ERR(cred)) return PTR_ERR(cred); @@ -1583,6 +1539,52 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, return status; } +static int _nfs4_proc_lookupfh(struct nfs_server *server, struct nfs_fh *dirfh, + struct qstr *name, struct nfs_fh *fhandle, + struct nfs_fattr *fattr) +{ + int status; + struct nfs4_lookup_arg args = { + .bitmask = server->attr_bitmask, + .dir_fh = dirfh, + .name = name, + }; + struct nfs4_lookup_res res = { + .server = server, + .fattr = fattr, + .fh = fhandle, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOOKUP], + .rpc_argp = &args, + .rpc_resp = &res, + }; + + nfs_fattr_init(fattr); + + dprintk("NFS call lookupfh %s\n", name->name); + status = rpc_call_sync(server->client, &msg, 0); + dprintk("NFS reply lookupfh: %d\n", status); + if (status == -NFS4ERR_MOVED) + status = -EREMOTE; + return status; +} + +static int nfs4_proc_lookupfh(struct nfs_server *server, struct nfs_fh *dirfh, + struct qstr *name, struct nfs_fh *fhandle, + struct nfs_fattr *fattr) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = nfs4_handle_exception(server, + _nfs4_proc_lookupfh(server, dirfh, name, + fhandle, fattr), + &exception); + } while (exception.retry); + return err; +} + static int _nfs4_proc_lookup(struct inode *dir, struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr) { @@ -1881,7 +1883,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, struct rpc_cred *cred; int status = 0; - cred = rpcauth_lookupcred(NFS_SERVER(dir)->client->cl_auth, 0); + cred = rpcauth_lookupcred(NFS_CLIENT(dir)->cl_auth, 0); if (IS_ERR(cred)) { status = PTR_ERR(cred); goto out; @@ -2089,24 +2091,24 @@ static int nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *n return err; } -static int _nfs4_proc_symlink(struct inode *dir, struct qstr *name, - struct qstr *path, struct iattr *sattr, struct nfs_fh *fhandle, - struct nfs_fattr *fattr) +static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, + struct page *page, unsigned int len, struct iattr *sattr) { struct nfs_server *server = NFS_SERVER(dir); - struct nfs_fattr dir_fattr; + struct nfs_fh fhandle; + struct nfs_fattr fattr, dir_fattr; struct nfs4_create_arg arg = { .dir_fh = NFS_FH(dir), .server = server, - .name = name, + .name = &dentry->d_name, .attrs = sattr, .ftype = NF4LNK, .bitmask = server->attr_bitmask, }; struct nfs4_create_res res = { .server = server, - .fh = fhandle, - .fattr = fattr, + .fh = &fhandle, + .fattr = &fattr, .dir_fattr = &dir_fattr, }; struct rpc_message msg = { @@ -2116,29 +2118,32 @@ static int _nfs4_proc_symlink(struct inode *dir, struct qstr *name, }; int status; - if (path->len > NFS4_MAXPATHLEN) + if (len > NFS4_MAXPATHLEN) return -ENAMETOOLONG; - arg.u.symlink = path; - nfs_fattr_init(fattr); + + arg.u.symlink.pages = &page; + arg.u.symlink.len = len; + nfs_fattr_init(&fattr); nfs_fattr_init(&dir_fattr); status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); - if (!status) + if (!status) { update_changeattr(dir, &res.dir_cinfo); - nfs_post_op_update_inode(dir, res.dir_fattr); + nfs_post_op_update_inode(dir, res.dir_fattr); + status = nfs_instantiate(dentry, &fhandle, &fattr); + } return status; } -static int nfs4_proc_symlink(struct inode *dir, struct qstr *name, - struct qstr *path, struct iattr *sattr, struct nfs_fh *fhandle, - struct nfs_fattr *fattr) +static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, + struct page *page, unsigned int len, struct iattr *sattr) { struct nfs4_exception exception = { }; int err; do { err = nfs4_handle_exception(NFS_SERVER(dir), - _nfs4_proc_symlink(dir, name, path, sattr, - fhandle, fattr), + _nfs4_proc_symlink(dir, dentry, page, + len, sattr), &exception); } while (exception.retry); return err; @@ -2521,7 +2526,7 @@ static void nfs4_proc_commit_setup(struct nfs_write_data *data, int how) */ static void nfs4_renew_done(struct rpc_task *task, void *data) { - struct nfs4_client *clp = (struct nfs4_client *)task->tk_msg.rpc_argp; + struct nfs_client *clp = (struct nfs_client *)task->tk_msg.rpc_argp; unsigned long timestamp = (unsigned long)data; if (task->tk_status < 0) { @@ -2543,7 +2548,7 @@ static const struct rpc_call_ops nfs4_renew_ops = { .rpc_call_done = nfs4_renew_done, }; -int nfs4_proc_async_renew(struct nfs4_client *clp, struct rpc_cred *cred) +int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred) { struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW], @@ -2555,7 +2560,7 @@ int nfs4_proc_async_renew(struct nfs4_client *clp, struct rpc_cred *cred) &nfs4_renew_ops, (void *)jiffies); } -int nfs4_proc_renew(struct nfs4_client *clp, struct rpc_cred *cred) +int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred) { struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW], @@ -2770,7 +2775,7 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl return -EOPNOTSUPP; nfs_inode_return_delegation(inode); buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase); - ret = rpc_call_sync(NFS_SERVER(inode)->client, &msg, 0); + ret = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); if (ret == 0) nfs4_write_cached_acl(inode, buf, buflen); return ret; @@ -2791,7 +2796,7 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen static int nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server) { - struct nfs4_client *clp = server->nfs4_state; + struct nfs_client *clp = server->nfs_client; if (!clp || task->tk_status >= 0) return 0; @@ -2828,7 +2833,7 @@ static int nfs4_wait_bit_interruptible(void *word) return 0; } -static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs4_client *clp) +static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs_client *clp) { sigset_t oldset; int res; @@ -2871,7 +2876,7 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout) */ int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception) { - struct nfs4_client *clp = server->nfs4_state; + struct nfs_client *clp = server->nfs_client; int ret = errorcode; exception->retry = 0; @@ -2886,6 +2891,7 @@ int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct if (ret == 0) exception->retry = 1; break; + case -NFS4ERR_FILE_OPEN: case -NFS4ERR_GRACE: case -NFS4ERR_DELAY: ret = nfs4_delay(server->client, &exception->timeout); @@ -2898,7 +2904,7 @@ int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct return nfs4_map_errors(ret); } -int nfs4_proc_setclientid(struct nfs4_client *clp, u32 program, unsigned short port, struct rpc_cred *cred) +int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short port, struct rpc_cred *cred) { nfs4_verifier sc_verifier; struct nfs4_setclientid setclientid = { @@ -2922,7 +2928,7 @@ int nfs4_proc_setclientid(struct nfs4_client *clp, u32 program, unsigned short p for(;;) { setclientid.sc_name_len = scnprintf(setclientid.sc_name, sizeof(setclientid.sc_name), "%s/%u.%u.%u.%u %s %u", - clp->cl_ipaddr, NIPQUAD(clp->cl_addr.s_addr), + clp->cl_ipaddr, NIPQUAD(clp->cl_addr.sin_addr), cred->cr_ops->cr_name, clp->cl_id_uniquifier); setclientid.sc_netid_len = scnprintf(setclientid.sc_netid, @@ -2945,7 +2951,7 @@ int nfs4_proc_setclientid(struct nfs4_client *clp, u32 program, unsigned short p return status; } -static int _nfs4_proc_setclientid_confirm(struct nfs4_client *clp, struct rpc_cred *cred) +static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred) { struct nfs_fsinfo fsinfo; struct rpc_message msg = { @@ -2969,7 +2975,7 @@ static int _nfs4_proc_setclientid_confirm(struct nfs4_client *clp, struct rpc_cr return status; } -int nfs4_proc_setclientid_confirm(struct nfs4_client *clp, struct rpc_cred *cred) +int nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred) { long timeout; int err; @@ -3077,7 +3083,7 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4 switch (err) { case -NFS4ERR_STALE_STATEID: case -NFS4ERR_EXPIRED: - nfs4_schedule_state_recovery(server->nfs4_state); + nfs4_schedule_state_recovery(server->nfs_client); case 0: return 0; } @@ -3106,7 +3112,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock { struct inode *inode = state->inode; struct nfs_server *server = NFS_SERVER(inode); - struct nfs4_client *clp = server->nfs4_state; + struct nfs_client *clp = server->nfs_client; struct nfs_lockt_args arg = { .fh = NFS_FH(inode), .fl = request, @@ -3231,7 +3237,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) break; case -NFS4ERR_STALE_STATEID: case -NFS4ERR_EXPIRED: - nfs4_schedule_state_recovery(calldata->server->nfs4_state); + nfs4_schedule_state_recovery(calldata->server->nfs_client); break; default: if (nfs4_async_handle_error(task, calldata->server) == -EAGAIN) { @@ -3343,7 +3349,7 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl, if (p->arg.lock_seqid == NULL) goto out_free; p->arg.lock_stateid = &lsp->ls_stateid; - p->arg.lock_owner.clientid = server->nfs4_state->cl_clientid; + p->arg.lock_owner.clientid = server->nfs_client->cl_clientid; p->arg.lock_owner.id = lsp->ls_id; p->lsp = lsp; atomic_inc(&lsp->ls_count); @@ -3513,7 +3519,7 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) { - struct nfs4_client *clp = state->owner->so_client; + struct nfs_client *clp = state->owner->so_client; unsigned char fl_flags = request->fl_flags; int status; @@ -3715,7 +3721,7 @@ static struct inode_operations nfs4_file_inode_operations = { .listxattr = nfs4_listxattr, }; -struct nfs_rpc_ops nfs_v4_clientops = { +const struct nfs_rpc_ops nfs_v4_clientops = { .version = 4, /* protocol version */ .dentry_ops = &nfs4_dentry_operations, .dir_inode_ops = &nfs4_dir_inode_operations, @@ -3723,6 +3729,7 @@ struct nfs_rpc_ops nfs_v4_clientops = { .getroot = nfs4_proc_get_root, .getattr = nfs4_proc_getattr, .setattr = nfs4_proc_setattr, + .lookupfh = nfs4_proc_lookupfh, .lookup = nfs4_proc_lookup, .access = nfs4_proc_access, .readlink = nfs4_proc_readlink, @@ -3743,6 +3750,7 @@ struct nfs_rpc_ops nfs_v4_clientops = { .statfs = nfs4_proc_statfs, .fsinfo = nfs4_proc_fsinfo, .pathconf = nfs4_proc_pathconf, + .set_capabilities = nfs4_server_capabilities, .decode_dirent = nfs4_decode_dirent, .read_setup = nfs4_proc_read_setup, .read_done = nfs4_read_done, diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c index 5d764d8e6d8a..7b6df1852e75 100644 --- a/fs/nfs/nfs4renewd.c +++ b/fs/nfs/nfs4renewd.c @@ -61,7 +61,7 @@ void nfs4_renew_state(void *data) { - struct nfs4_client *clp = (struct nfs4_client *)data; + struct nfs_client *clp = (struct nfs_client *)data; struct rpc_cred *cred; long lease, timeout; unsigned long last, now; @@ -108,7 +108,7 @@ out: /* Must be called with clp->cl_sem locked for writes */ void -nfs4_schedule_state_renewal(struct nfs4_client *clp) +nfs4_schedule_state_renewal(struct nfs_client *clp) { long timeout; @@ -121,32 +121,20 @@ nfs4_schedule_state_renewal(struct nfs4_client *clp) __FUNCTION__, (timeout + HZ - 1) / HZ); cancel_delayed_work(&clp->cl_renewd); schedule_delayed_work(&clp->cl_renewd, timeout); + set_bit(NFS_CS_RENEWD, &clp->cl_res_state); spin_unlock(&clp->cl_lock); } void nfs4_renewd_prepare_shutdown(struct nfs_server *server) { - struct nfs4_client *clp = server->nfs4_state; - - if (!clp) - return; flush_scheduled_work(); - down_write(&clp->cl_sem); - if (!list_empty(&server->nfs4_siblings)) - list_del_init(&server->nfs4_siblings); - up_write(&clp->cl_sem); } -/* Must be called with clp->cl_sem locked for writes */ void -nfs4_kill_renewd(struct nfs4_client *clp) +nfs4_kill_renewd(struct nfs_client *clp) { down_read(&clp->cl_sem); - if (!list_empty(&clp->cl_superblocks)) { - up_read(&clp->cl_sem); - return; - } cancel_delayed_work(&clp->cl_renewd); up_read(&clp->cl_sem); flush_scheduled_work(); diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 090a36b07a22..5fffbdfa971f 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -50,149 +50,15 @@ #include "nfs4_fs.h" #include "callback.h" #include "delegation.h" +#include "internal.h" #define OPENOWNER_POOL_SIZE 8 const nfs4_stateid zero_stateid; -static DEFINE_SPINLOCK(state_spinlock); static LIST_HEAD(nfs4_clientid_list); -void -init_nfsv4_state(struct nfs_server *server) -{ - server->nfs4_state = NULL; - INIT_LIST_HEAD(&server->nfs4_siblings); -} - -void -destroy_nfsv4_state(struct nfs_server *server) -{ - kfree(server->mnt_path); - server->mnt_path = NULL; - if (server->nfs4_state) { - nfs4_put_client(server->nfs4_state); - server->nfs4_state = NULL; - } -} - -/* - * nfs4_get_client(): returns an empty client structure - * nfs4_put_client(): drops reference to client structure - * - * Since these are allocated/deallocated very rarely, we don't - * bother putting them in a slab cache... - */ -static struct nfs4_client * -nfs4_alloc_client(struct in_addr *addr) -{ - struct nfs4_client *clp; - - if (nfs_callback_up() < 0) - return NULL; - if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL) { - nfs_callback_down(); - return NULL; - } - memcpy(&clp->cl_addr, addr, sizeof(clp->cl_addr)); - init_rwsem(&clp->cl_sem); - INIT_LIST_HEAD(&clp->cl_delegations); - INIT_LIST_HEAD(&clp->cl_state_owners); - INIT_LIST_HEAD(&clp->cl_unused); - spin_lock_init(&clp->cl_lock); - atomic_set(&clp->cl_count, 1); - INIT_WORK(&clp->cl_renewd, nfs4_renew_state, clp); - INIT_LIST_HEAD(&clp->cl_superblocks); - rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS4 client"); - clp->cl_rpcclient = ERR_PTR(-EINVAL); - clp->cl_boot_time = CURRENT_TIME; - clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED; - return clp; -} - -static void -nfs4_free_client(struct nfs4_client *clp) -{ - struct nfs4_state_owner *sp; - - while (!list_empty(&clp->cl_unused)) { - sp = list_entry(clp->cl_unused.next, - struct nfs4_state_owner, - so_list); - list_del(&sp->so_list); - kfree(sp); - } - BUG_ON(!list_empty(&clp->cl_state_owners)); - nfs_idmap_delete(clp); - if (!IS_ERR(clp->cl_rpcclient)) - rpc_shutdown_client(clp->cl_rpcclient); - kfree(clp); - nfs_callback_down(); -} - -static struct nfs4_client *__nfs4_find_client(struct in_addr *addr) -{ - struct nfs4_client *clp; - list_for_each_entry(clp, &nfs4_clientid_list, cl_servers) { - if (memcmp(&clp->cl_addr, addr, sizeof(clp->cl_addr)) == 0) { - atomic_inc(&clp->cl_count); - return clp; - } - } - return NULL; -} - -struct nfs4_client *nfs4_find_client(struct in_addr *addr) -{ - struct nfs4_client *clp; - spin_lock(&state_spinlock); - clp = __nfs4_find_client(addr); - spin_unlock(&state_spinlock); - return clp; -} - -struct nfs4_client * -nfs4_get_client(struct in_addr *addr) -{ - struct nfs4_client *clp, *new = NULL; - - spin_lock(&state_spinlock); - for (;;) { - clp = __nfs4_find_client(addr); - if (clp != NULL) - break; - clp = new; - if (clp != NULL) { - list_add(&clp->cl_servers, &nfs4_clientid_list); - new = NULL; - break; - } - spin_unlock(&state_spinlock); - new = nfs4_alloc_client(addr); - spin_lock(&state_spinlock); - if (new == NULL) - break; - } - spin_unlock(&state_spinlock); - if (new) - nfs4_free_client(new); - return clp; -} - -void -nfs4_put_client(struct nfs4_client *clp) -{ - if (!atomic_dec_and_lock(&clp->cl_count, &state_spinlock)) - return; - list_del(&clp->cl_servers); - spin_unlock(&state_spinlock); - BUG_ON(!list_empty(&clp->cl_superblocks)); - rpc_wake_up(&clp->cl_rpcwaitq); - nfs4_kill_renewd(clp); - nfs4_free_client(clp); -} - -static int nfs4_init_client(struct nfs4_client *clp, struct rpc_cred *cred) +static int nfs4_init_client(struct nfs_client *clp, struct rpc_cred *cred) { int status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, nfs_callback_tcpport, cred); @@ -204,13 +70,13 @@ static int nfs4_init_client(struct nfs4_client *clp, struct rpc_cred *cred) } u32 -nfs4_alloc_lockowner_id(struct nfs4_client *clp) +nfs4_alloc_lockowner_id(struct nfs_client *clp) { return clp->cl_lockowner_id ++; } static struct nfs4_state_owner * -nfs4_client_grab_unused(struct nfs4_client *clp, struct rpc_cred *cred) +nfs4_client_grab_unused(struct nfs_client *clp, struct rpc_cred *cred) { struct nfs4_state_owner *sp = NULL; @@ -224,7 +90,7 @@ nfs4_client_grab_unused(struct nfs4_client *clp, struct rpc_cred *cred) return sp; } -struct rpc_cred *nfs4_get_renew_cred(struct nfs4_client *clp) +struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp) { struct nfs4_state_owner *sp; struct rpc_cred *cred = NULL; @@ -238,7 +104,7 @@ struct rpc_cred *nfs4_get_renew_cred(struct nfs4_client *clp) return cred; } -struct rpc_cred *nfs4_get_setclientid_cred(struct nfs4_client *clp) +struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp) { struct nfs4_state_owner *sp; @@ -251,7 +117,7 @@ struct rpc_cred *nfs4_get_setclientid_cred(struct nfs4_client *clp) } static struct nfs4_state_owner * -nfs4_find_state_owner(struct nfs4_client *clp, struct rpc_cred *cred) +nfs4_find_state_owner(struct nfs_client *clp, struct rpc_cred *cred) { struct nfs4_state_owner *sp, *res = NULL; @@ -294,7 +160,7 @@ nfs4_alloc_state_owner(void) void nfs4_drop_state_owner(struct nfs4_state_owner *sp) { - struct nfs4_client *clp = sp->so_client; + struct nfs_client *clp = sp->so_client; spin_lock(&clp->cl_lock); list_del_init(&sp->so_list); spin_unlock(&clp->cl_lock); @@ -306,7 +172,7 @@ nfs4_drop_state_owner(struct nfs4_state_owner *sp) */ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct rpc_cred *cred) { - struct nfs4_client *clp = server->nfs4_state; + struct nfs_client *clp = server->nfs_client; struct nfs4_state_owner *sp, *new; get_rpccred(cred); @@ -337,7 +203,7 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct */ void nfs4_put_state_owner(struct nfs4_state_owner *sp) { - struct nfs4_client *clp = sp->so_client; + struct nfs_client *clp = sp->so_client; struct rpc_cred *cred = sp->so_cred; if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock)) @@ -540,7 +406,7 @@ __nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) { struct nfs4_lock_state *lsp; - struct nfs4_client *clp = state->owner->so_client; + struct nfs_client *clp = state->owner->so_client; lsp = kzalloc(sizeof(*lsp), GFP_KERNEL); if (lsp == NULL) @@ -752,7 +618,7 @@ out: static int reclaimer(void *); -static inline void nfs4_clear_recover_bit(struct nfs4_client *clp) +static inline void nfs4_clear_recover_bit(struct nfs_client *clp) { smp_mb__before_clear_bit(); clear_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state); @@ -764,25 +630,25 @@ static inline void nfs4_clear_recover_bit(struct nfs4_client *clp) /* * State recovery routine */ -static void nfs4_recover_state(struct nfs4_client *clp) +static void nfs4_recover_state(struct nfs_client *clp) { struct task_struct *task; __module_get(THIS_MODULE); atomic_inc(&clp->cl_count); task = kthread_run(reclaimer, clp, "%u.%u.%u.%u-reclaim", - NIPQUAD(clp->cl_addr)); + NIPQUAD(clp->cl_addr.sin_addr)); if (!IS_ERR(task)) return; nfs4_clear_recover_bit(clp); - nfs4_put_client(clp); + nfs_put_client(clp); module_put(THIS_MODULE); } /* * Schedule a state recovery attempt */ -void nfs4_schedule_state_recovery(struct nfs4_client *clp) +void nfs4_schedule_state_recovery(struct nfs_client *clp) { if (!clp) return; @@ -879,7 +745,7 @@ out_err: return status; } -static void nfs4_state_mark_reclaim(struct nfs4_client *clp) +static void nfs4_state_mark_reclaim(struct nfs_client *clp) { struct nfs4_state_owner *sp; struct nfs4_state *state; @@ -903,7 +769,7 @@ static void nfs4_state_mark_reclaim(struct nfs4_client *clp) static int reclaimer(void *ptr) { - struct nfs4_client *clp = ptr; + struct nfs_client *clp = ptr; struct nfs4_state_owner *sp; struct nfs4_state_recovery_ops *ops; struct rpc_cred *cred; @@ -970,12 +836,12 @@ out: if (status == -NFS4ERR_CB_PATH_DOWN) nfs_handle_cb_pathdown(clp); nfs4_clear_recover_bit(clp); - nfs4_put_client(clp); + nfs_put_client(clp); module_put_and_exit(0); return 0; out_error: printk(KERN_WARNING "Error: state recovery failed on NFSv4 server %u.%u.%u.%u with error %d\n", - NIPQUAD(clp->cl_addr.s_addr), -status); + NIPQUAD(clp->cl_addr.sin_addr), -status); set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); goto out; } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 730ec8fb31c6..3dd413f52da1 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -58,7 +58,7 @@ /* Mapping from NFS error code to "errno" error code. */ #define errno_NFSERR_IO EIO -static int nfs_stat_to_errno(int); +static int nfs4_stat_to_errno(int); /* NFSv4 COMPOUND tags are only wanted for debugging purposes */ #ifdef DEBUG @@ -128,7 +128,7 @@ static int nfs_stat_to_errno(int); #define decode_link_maxsz (op_decode_hdr_maxsz + 5) #define encode_symlink_maxsz (op_encode_hdr_maxsz + \ 1 + nfs4_name_maxsz + \ - nfs4_path_maxsz + \ + 1 + \ nfs4_fattr_maxsz) #define decode_symlink_maxsz (op_decode_hdr_maxsz + 8) #define encode_create_maxsz (op_encode_hdr_maxsz + \ @@ -529,7 +529,7 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s if (iap->ia_valid & ATTR_MODE) len += 4; if (iap->ia_valid & ATTR_UID) { - owner_namelen = nfs_map_uid_to_name(server->nfs4_state, iap->ia_uid, owner_name); + owner_namelen = nfs_map_uid_to_name(server->nfs_client, iap->ia_uid, owner_name); if (owner_namelen < 0) { printk(KERN_WARNING "nfs: couldn't resolve uid %d to string\n", iap->ia_uid); @@ -541,7 +541,7 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s len += 4 + (XDR_QUADLEN(owner_namelen) << 2); } if (iap->ia_valid & ATTR_GID) { - owner_grouplen = nfs_map_gid_to_group(server->nfs4_state, iap->ia_gid, owner_group); + owner_grouplen = nfs_map_gid_to_group(server->nfs_client, iap->ia_gid, owner_group); if (owner_grouplen < 0) { printk(KERN_WARNING "nfs4: couldn't resolve gid %d to string\n", iap->ia_gid); @@ -673,9 +673,9 @@ static int encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *c switch (create->ftype) { case NF4LNK: - RESERVE_SPACE(4 + create->u.symlink->len); - WRITE32(create->u.symlink->len); - WRITEMEM(create->u.symlink->name, create->u.symlink->len); + RESERVE_SPACE(4); + WRITE32(create->u.symlink.len); + xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len); break; case NF4BLK: case NF4CHR: @@ -1160,7 +1160,7 @@ static int encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, con return 0; } -static int encode_renew(struct xdr_stream *xdr, const struct nfs4_client *client_stateid) +static int encode_renew(struct xdr_stream *xdr, const struct nfs_client *client_stateid) { uint32_t *p; @@ -1246,7 +1246,7 @@ static int encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclien return 0; } -static int encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4_client *client_state) +static int encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_client *client_state) { uint32_t *p; @@ -1945,7 +1945,7 @@ static int nfs4_xdr_enc_server_caps(struct rpc_rqst *req, uint32_t *p, const str /* * a RENEW request */ -static int nfs4_xdr_enc_renew(struct rpc_rqst *req, uint32_t *p, struct nfs4_client *clp) +static int nfs4_xdr_enc_renew(struct rpc_rqst *req, uint32_t *p, struct nfs_client *clp) { struct xdr_stream xdr; struct compound_hdr hdr = { @@ -1975,7 +1975,7 @@ static int nfs4_xdr_enc_setclientid(struct rpc_rqst *req, uint32_t *p, struct nf /* * a SETCLIENTID_CONFIRM request */ -static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, uint32_t *p, struct nfs4_client *clp) +static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, uint32_t *p, struct nfs_client *clp) { struct xdr_stream xdr; struct compound_hdr hdr = { @@ -2127,12 +2127,12 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) } READ32(nfserr); if (nfserr != NFS_OK) - return -nfs_stat_to_errno(nfserr); + return -nfs4_stat_to_errno(nfserr); return 0; } /* Dummy routine */ -static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs4_client *clp) +static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs_client *clp) { uint32_t *p; unsigned int strlen; @@ -2636,7 +2636,7 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t return 0; } -static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_client *clp, int32_t *uid) +static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, int32_t *uid) { uint32_t len, *p; @@ -2660,7 +2660,7 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf return 0; } -static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_client *clp, int32_t *gid) +static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, int32_t *gid) { uint32_t len, *p; @@ -3051,9 +3051,9 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons fattr->mode |= fmode; if ((status = decode_attr_nlink(xdr, bitmap, &fattr->nlink)) != 0) goto xdr_error; - if ((status = decode_attr_owner(xdr, bitmap, server->nfs4_state, &fattr->uid)) != 0) + if ((status = decode_attr_owner(xdr, bitmap, server->nfs_client, &fattr->uid)) != 0) goto xdr_error; - if ((status = decode_attr_group(xdr, bitmap, server->nfs4_state, &fattr->gid)) != 0) + if ((status = decode_attr_group(xdr, bitmap, server->nfs_client, &fattr->gid)) != 0) goto xdr_error; if ((status = decode_attr_rdev(xdr, bitmap, &fattr->rdev)) != 0) goto xdr_error; @@ -3254,7 +3254,7 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) if (decode_space_limit(xdr, &res->maxsize) < 0) return -EIO; } - return decode_ace(xdr, NULL, res->server->nfs4_state); + return decode_ace(xdr, NULL, res->server->nfs_client); } static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) @@ -3565,7 +3565,7 @@ static int decode_setattr(struct xdr_stream *xdr, struct nfs_setattrres *res) return 0; } -static int decode_setclientid(struct xdr_stream *xdr, struct nfs4_client *clp) +static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp) { uint32_t *p; uint32_t opnum; @@ -3598,7 +3598,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs4_client *clp) READ_BUF(len); return -NFSERR_CLID_INUSE; } else - return -nfs_stat_to_errno(nfserr); + return -nfs4_stat_to_errno(nfserr); return 0; } @@ -4256,7 +4256,7 @@ static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, uint32_t *p, struct nfs_fsi if (!status) status = decode_fsinfo(&xdr, fsinfo); if (!status) - status = -nfs_stat_to_errno(hdr.status); + status = -nfs4_stat_to_errno(hdr.status); return status; } @@ -4335,7 +4335,7 @@ static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, uint32_t *p, void *dummy) * a SETCLIENTID request */ static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, uint32_t *p, - struct nfs4_client *clp) + struct nfs_client *clp) { struct xdr_stream xdr; struct compound_hdr hdr; @@ -4346,7 +4346,7 @@ static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, uint32_t *p, if (!status) status = decode_setclientid(&xdr, clp); if (!status) - status = -nfs_stat_to_errno(hdr.status); + status = -nfs4_stat_to_errno(hdr.status); return status; } @@ -4368,7 +4368,7 @@ static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, uint32_t *p, s if (!status) status = decode_fsinfo(&xdr, fsinfo); if (!status) - status = -nfs_stat_to_errno(hdr.status); + status = -nfs4_stat_to_errno(hdr.status); return status; } @@ -4521,7 +4521,7 @@ static struct { * This one is used jointly by NFSv2 and NFSv3. */ static int -nfs_stat_to_errno(int stat) +nfs4_stat_to_errno(int stat) { int i; for (i = 0; nfs_errtbl[i].stat != -1; i++) { diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 36e902a88ca1..829af323f288 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -392,7 +392,6 @@ int __init nfs_init_nfspagecache(void) void nfs_destroy_nfspagecache(void) { - if (kmem_cache_destroy(nfs_page_cachep)) - printk(KERN_INFO "nfs_page: not all structures were freed\n"); + kmem_cache_destroy(nfs_page_cachep); } diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index b3899ea3229e..4529cc4f3f8f 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -66,14 +66,14 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, dprintk("%s: call getattr\n", __FUNCTION__); nfs_fattr_init(fattr); - status = rpc_call_sync(server->client_sys, &msg, 0); + status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0); dprintk("%s: reply getattr: %d\n", __FUNCTION__, status); if (status) return status; dprintk("%s: call statfs\n", __FUNCTION__); msg.rpc_proc = &nfs_procedures[NFSPROC_STATFS]; msg.rpc_resp = &fsinfo; - status = rpc_call_sync(server->client_sys, &msg, 0); + status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0); dprintk("%s: reply statfs: %d\n", __FUNCTION__, status); if (status) return status; @@ -352,7 +352,7 @@ nfs_proc_unlink_setup(struct rpc_message *msg, struct dentry *dir, struct qstr * { struct nfs_diropargs *arg; - arg = (struct nfs_diropargs *)kmalloc(sizeof(*arg), GFP_KERNEL); + arg = kmalloc(sizeof(*arg), GFP_KERNEL); if (!arg) return -ENOMEM; arg->fh = NFS_FH(dir->d_inode); @@ -425,16 +425,17 @@ nfs_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) } static int -nfs_proc_symlink(struct inode *dir, struct qstr *name, struct qstr *path, - struct iattr *sattr, struct nfs_fh *fhandle, - struct nfs_fattr *fattr) +nfs_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, + unsigned int len, struct iattr *sattr) { + struct nfs_fh fhandle; + struct nfs_fattr fattr; struct nfs_symlinkargs arg = { .fromfh = NFS_FH(dir), - .fromname = name->name, - .fromlen = name->len, - .topath = path->name, - .tolen = path->len, + .fromname = dentry->d_name.name, + .fromlen = dentry->d_name.len, + .pages = &page, + .pathlen = len, .sattr = sattr }; struct rpc_message msg = { @@ -443,13 +444,25 @@ nfs_proc_symlink(struct inode *dir, struct qstr *name, struct qstr *path, }; int status; - if (path->len > NFS2_MAXPATHLEN) + if (len > NFS2_MAXPATHLEN) return -ENAMETOOLONG; - dprintk("NFS call symlink %s -> %s\n", name->name, path->name); - nfs_fattr_init(fattr); - fhandle->size = 0; + + dprintk("NFS call symlink %s\n", dentry->d_name.name); + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); nfs_mark_for_revalidate(dir); + + /* + * V2 SYMLINK requests don't return any attributes. Setting the + * filehandle size to zero indicates to nfs_instantiate that it + * should fill in the data with a LOOKUP call on the wire. + */ + if (status == 0) { + nfs_fattr_init(&fattr); + fhandle.size = 0; + status = nfs_instantiate(dentry, &fhandle, &fattr); + } + dprintk("NFS reply symlink: %d\n", status); return status; } @@ -671,7 +684,7 @@ nfs_proc_lock(struct file *filp, int cmd, struct file_lock *fl) } -struct nfs_rpc_ops nfs_v2_clientops = { +const struct nfs_rpc_ops nfs_v2_clientops = { .version = 2, /* protocol version */ .dentry_ops = &nfs_dentry_operations, .dir_inode_ops = &nfs_dir_inode_operations, diff --git a/fs/nfs/read.c b/fs/nfs/read.c index f0aff824a291..c2e49c397a27 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -171,7 +171,7 @@ static int nfs_readpage_sync(struct nfs_open_context *ctx, struct inode *inode, rdata->args.offset = page_offset(page) + rdata->args.pgbase; dprintk("NFS: nfs_proc_read(%s, (%s/%Ld), %Lu, %u)\n", - NFS_SERVER(inode)->hostname, + NFS_SERVER(inode)->nfs_client->cl_hostname, inode->i_sb->s_id, (long long)NFS_FILEID(inode), (unsigned long long)rdata->args.pgbase, @@ -568,8 +568,13 @@ int nfs_readpage_result(struct rpc_task *task, struct nfs_read_data *data) nfs_add_stats(data->inode, NFSIOS_SERVERREADBYTES, resp->count); - /* Is this a short read? */ - if (task->tk_status >= 0 && resp->count < argp->count && !resp->eof) { + if (task->tk_status < 0) { + if (task->tk_status == -ESTALE) { + set_bit(NFS_INO_STALE, &NFS_FLAGS(data->inode)); + nfs_mark_for_revalidate(data->inode); + } + } else if (resp->count < argp->count && !resp->eof) { + /* This is a short read! */ nfs_inc_stats(data->inode, NFSIOS_SHORTREAD); /* Has the server at least made some progress? */ if (resp->count != 0) { @@ -616,6 +621,10 @@ int nfs_readpage(struct file *file, struct page *page) if (error) goto out_error; + error = -ESTALE; + if (NFS_STALE(inode)) + goto out_error; + if (file == NULL) { ctx = nfs_find_open_context(inode, NULL, FMODE_READ); if (ctx == NULL) @@ -678,7 +687,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, }; struct inode *inode = mapping->host; struct nfs_server *server = NFS_SERVER(inode); - int ret; + int ret = -ESTALE; dprintk("NFS: nfs_readpages (%s/%Ld %d)\n", inode->i_sb->s_id, @@ -686,6 +695,9 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, nr_pages); nfs_inc_stats(inode, NFSIOS_VFSREADPAGES); + if (NFS_STALE(inode)) + goto out; + if (filp == NULL) { desc.ctx = nfs_find_open_context(inode, NULL, FMODE_READ); if (desc.ctx == NULL) @@ -701,6 +713,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, ret = err; } put_nfs_open_context(desc.ctx); +out: return ret; } @@ -724,6 +737,5 @@ int __init nfs_init_readpagecache(void) void nfs_destroy_readpagecache(void) { mempool_destroy(nfs_rdata_mempool); - if (kmem_cache_destroy(nfs_rdata_cachep)) - printk(KERN_INFO "nfs_read_data: not all structures were freed\n"); + kmem_cache_destroy(nfs_rdata_cachep); } diff --git a/fs/nfs/super.c b/fs/nfs/super.c index e8a9bee74d9d..e8d40030cab4 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -13,6 +13,11 @@ * * Split from inode.c by David Howells <dhowells@redhat.com> * + * - superblocks are indexed on server only - all inodes, dentries, etc. associated with a + * particular server are held in the same superblock + * - NFS superblocks can have several effective roots to the dentry tree + * - directory type roots are spliced into the tree when a path from one root reaches the root + * of another (see nfs_lookup()) */ #include <linux/config.h> @@ -52,66 +57,12 @@ #define NFSDBG_FACILITY NFSDBG_VFS -/* Maximum number of readahead requests - * FIXME: this should really be a sysctl so that users may tune it to suit - * their needs. People that do NFS over a slow network, might for - * instance want to reduce it to something closer to 1 for improved - * interactive response. - */ -#define NFS_MAX_READAHEAD (RPC_DEF_SLOT_TABLE - 1) - -/* - * RPC cruft for NFS - */ -static struct rpc_version * nfs_version[] = { - NULL, - NULL, - &nfs_version2, -#if defined(CONFIG_NFS_V3) - &nfs_version3, -#elif defined(CONFIG_NFS_V4) - NULL, -#endif -#if defined(CONFIG_NFS_V4) - &nfs_version4, -#endif -}; - -static struct rpc_program nfs_program = { - .name = "nfs", - .number = NFS_PROGRAM, - .nrvers = ARRAY_SIZE(nfs_version), - .version = nfs_version, - .stats = &nfs_rpcstat, - .pipe_dir_name = "/nfs", -}; - -struct rpc_stat nfs_rpcstat = { - .program = &nfs_program -}; - - -#ifdef CONFIG_NFS_V3_ACL -static struct rpc_stat nfsacl_rpcstat = { &nfsacl_program }; -static struct rpc_version * nfsacl_version[] = { - [3] = &nfsacl_version3, -}; - -struct rpc_program nfsacl_program = { - .name = "nfsacl", - .number = NFS_ACL_PROGRAM, - .nrvers = ARRAY_SIZE(nfsacl_version), - .version = nfsacl_version, - .stats = &nfsacl_rpcstat, -}; -#endif /* CONFIG_NFS_V3_ACL */ - static void nfs_umount_begin(struct vfsmount *, int); static int nfs_statfs(struct dentry *, struct kstatfs *); static int nfs_show_options(struct seq_file *, struct vfsmount *); static int nfs_show_stats(struct seq_file *, struct vfsmount *); static int nfs_get_sb(struct file_system_type *, int, const char *, void *, struct vfsmount *); -static int nfs_clone_nfs_sb(struct file_system_type *fs_type, +static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); static void nfs_kill_super(struct super_block *); @@ -120,15 +71,15 @@ static struct file_system_type nfs_fs_type = { .name = "nfs", .get_sb = nfs_get_sb, .kill_sb = nfs_kill_super, - .fs_flags = FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, }; -struct file_system_type clone_nfs_fs_type = { +struct file_system_type nfs_xdev_fs_type = { .owner = THIS_MODULE, .name = "nfs", - .get_sb = nfs_clone_nfs_sb, + .get_sb = nfs_xdev_get_sb, .kill_sb = nfs_kill_super, - .fs_flags = FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, }; static struct super_operations nfs_sops = { @@ -145,10 +96,10 @@ static struct super_operations nfs_sops = { #ifdef CONFIG_NFS_V4 static int nfs4_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); -static int nfs_clone_nfs4_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); -static int nfs_referral_nfs4_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); +static int nfs4_xdev_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); +static int nfs4_referral_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); static void nfs4_kill_super(struct super_block *sb); static struct file_system_type nfs4_fs_type = { @@ -156,23 +107,23 @@ static struct file_system_type nfs4_fs_type = { .name = "nfs4", .get_sb = nfs4_get_sb, .kill_sb = nfs4_kill_super, - .fs_flags = FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, }; -struct file_system_type clone_nfs4_fs_type = { +struct file_system_type nfs4_xdev_fs_type = { .owner = THIS_MODULE, .name = "nfs4", - .get_sb = nfs_clone_nfs4_sb, + .get_sb = nfs4_xdev_get_sb, .kill_sb = nfs4_kill_super, - .fs_flags = FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, }; -struct file_system_type nfs_referral_nfs4_fs_type = { +struct file_system_type nfs4_referral_fs_type = { .owner = THIS_MODULE, .name = "nfs4", - .get_sb = nfs_referral_nfs4_sb, + .get_sb = nfs4_referral_get_sb, .kill_sb = nfs4_kill_super, - .fs_flags = FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, + .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, }; static struct super_operations nfs4_sops = { @@ -187,39 +138,7 @@ static struct super_operations nfs4_sops = { }; #endif -#ifdef CONFIG_NFS_V4 -static const int nfs_set_port_min = 0; -static const int nfs_set_port_max = 65535; - -static int param_set_port(const char *val, struct kernel_param *kp) -{ - char *endp; - int num = simple_strtol(val, &endp, 0); - if (endp == val || *endp || num < nfs_set_port_min || num > nfs_set_port_max) - return -EINVAL; - *((int *)kp->arg) = num; - return 0; -} - -module_param_call(callback_tcpport, param_set_port, param_get_int, - &nfs_callback_set_tcpport, 0644); -#endif - -#ifdef CONFIG_NFS_V4 -static int param_set_idmap_timeout(const char *val, struct kernel_param *kp) -{ - char *endp; - int num = simple_strtol(val, &endp, 0); - int jif = num * HZ; - if (endp == val || *endp || num < 0 || jif < num) - return -EINVAL; - *((int *)kp->arg) = jif; - return 0; -} - -module_param_call(idmap_cache_timeout, param_set_idmap_timeout, param_get_int, - &nfs_idmap_cache_timeout, 0644); -#endif +static struct shrinker *acl_shrinker; /* * Register the NFS filesystems @@ -240,6 +159,7 @@ int __init register_nfs_fs(void) if (ret < 0) goto error_2; #endif + acl_shrinker = set_shrinker(DEFAULT_SEEKS, nfs_access_cache_shrinker); return 0; #ifdef CONFIG_NFS_V4 @@ -257,6 +177,8 @@ error_0: */ void __exit unregister_nfs_fs(void) { + if (acl_shrinker != NULL) + remove_shrinker(acl_shrinker); #ifdef CONFIG_NFS_V4 unregister_filesystem(&nfs4_fs_type); nfs_unregister_sysctl(); @@ -269,11 +191,10 @@ void __exit unregister_nfs_fs(void) */ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct super_block *sb = dentry->d_sb; - struct nfs_server *server = NFS_SB(sb); + struct nfs_server *server = NFS_SB(dentry->d_sb); unsigned char blockbits; unsigned long blockres; - struct nfs_fh *rootfh = NFS_FH(sb->s_root->d_inode); + struct nfs_fh *fh = NFS_FH(dentry->d_inode); struct nfs_fattr fattr; struct nfs_fsstat res = { .fattr = &fattr, @@ -282,7 +203,7 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf) lock_kernel(); - error = server->rpc_ops->statfs(server, rootfh, &res); + error = server->nfs_client->rpc_ops->statfs(server, fh, &res); buf->f_type = NFS_SUPER_MAGIC; if (error < 0) goto out_err; @@ -292,7 +213,7 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf) * case where f_frsize != f_bsize. Eventually we want to * report the value of wtmult in this field. */ - buf->f_frsize = sb->s_blocksize; + buf->f_frsize = dentry->d_sb->s_blocksize; /* * On most *nix systems, f_blocks, f_bfree, and f_bavail @@ -301,8 +222,8 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf) * thus historically Linux's sys_statfs reports these * fields in units of f_bsize. */ - buf->f_bsize = sb->s_blocksize; - blockbits = sb->s_blocksize_bits; + buf->f_bsize = dentry->d_sb->s_blocksize; + blockbits = dentry->d_sb->s_blocksize_bits; blockres = (1 << blockbits) - 1; buf->f_blocks = (res.tbytes + blockres) >> blockbits; buf->f_bfree = (res.fbytes + blockres) >> blockbits; @@ -323,9 +244,12 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf) } +/* + * Map the security flavour number to a name + */ static const char *nfs_pseudoflavour_to_name(rpc_authflavor_t flavour) { - static struct { + static const struct { rpc_authflavor_t flavour; const char *str; } sec_flavours[] = { @@ -356,10 +280,10 @@ static const char *nfs_pseudoflavour_to_name(rpc_authflavor_t flavour) */ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, int showdefaults) { - static struct proc_nfs_info { + static const struct proc_nfs_info { int flag; - char *str; - char *nostr; + const char *str; + const char *nostr; } nfs_info[] = { { NFS_MOUNT_SOFT, ",soft", ",hard" }, { NFS_MOUNT_INTR, ",intr", "" }, @@ -369,11 +293,12 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, { NFS_MOUNT_NOACL, ",noacl", "" }, { 0, NULL, NULL } }; - struct proc_nfs_info *nfs_infop; + const struct proc_nfs_info *nfs_infop; + struct nfs_client *clp = nfss->nfs_client; char buf[12]; - char *proto; + const char *proto; - seq_printf(m, ",vers=%d", nfss->rpc_ops->version); + seq_printf(m, ",vers=%d", clp->rpc_ops->version); seq_printf(m, ",rsize=%d", nfss->rsize); seq_printf(m, ",wsize=%d", nfss->wsize); if (nfss->acregmin != 3*HZ || showdefaults) @@ -402,8 +327,8 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, proto = buf; } seq_printf(m, ",proto=%s", proto); - seq_printf(m, ",timeo=%lu", 10U * nfss->retrans_timeo / HZ); - seq_printf(m, ",retrans=%u", nfss->retrans_count); + seq_printf(m, ",timeo=%lu", 10U * clp->retrans_timeo / HZ); + seq_printf(m, ",retrans=%u", clp->retrans_count); seq_printf(m, ",sec=%s", nfs_pseudoflavour_to_name(nfss->client->cl_auth->au_flavor)); } @@ -417,7 +342,7 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt) nfs_show_mount_options(m, nfss, 0); seq_puts(m, ",addr="); - seq_escape(m, nfss->hostname, " \t\n\\"); + seq_escape(m, nfss->nfs_client->cl_hostname, " \t\n\\"); return 0; } @@ -454,7 +379,7 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt) seq_printf(m, ",namelen=%d", nfss->namelen); #ifdef CONFIG_NFS_V4 - if (nfss->rpc_ops->version == 4) { + if (nfss->nfs_client->cl_nfsversion == 4) { seq_printf(m, "\n\tnfsv4:\t"); seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]); seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]); @@ -501,782 +426,353 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt) /* * Begin unmount by attempting to remove all automounted mountpoints we added - * in response to traversals + * in response to xdev traversals and referrals */ static void nfs_umount_begin(struct vfsmount *vfsmnt, int flags) { - struct nfs_server *server; - struct rpc_clnt *rpc; - shrink_submounts(vfsmnt, &nfs_automount_list); - if (!(flags & MNT_FORCE)) - return; - /* -EIO all pending I/O */ - server = NFS_SB(vfsmnt->mnt_sb); - rpc = server->client; - if (!IS_ERR(rpc)) - rpc_killall_tasks(rpc); - rpc = server->client_acl; - if (!IS_ERR(rpc)) - rpc_killall_tasks(rpc); } /* - * Obtain the root inode of the file system. + * Validate the NFS2/NFS3 mount data + * - fills in the mount root filehandle */ -static struct inode * -nfs_get_root(struct super_block *sb, struct nfs_fh *rootfh, struct nfs_fsinfo *fsinfo) +static int nfs_validate_mount_data(struct nfs_mount_data *data, + struct nfs_fh *mntfh) { - struct nfs_server *server = NFS_SB(sb); - int error; - - error = server->rpc_ops->getroot(server, rootfh, fsinfo); - if (error < 0) { - dprintk("nfs_get_root: getattr error = %d\n", -error); - return ERR_PTR(error); + if (data == NULL) { + dprintk("%s: missing data argument\n", __FUNCTION__); + return -EINVAL; } - server->fsid = fsinfo->fattr->fsid; - return nfs_fhget(sb, rootfh, fsinfo->fattr); -} - -/* - * Do NFS version-independent mount processing, and sanity checking - */ -static int -nfs_sb_init(struct super_block *sb, rpc_authflavor_t authflavor) -{ - struct nfs_server *server; - struct inode *root_inode; - struct nfs_fattr fattr; - struct nfs_fsinfo fsinfo = { - .fattr = &fattr, - }; - struct nfs_pathconf pathinfo = { - .fattr = &fattr, - }; - int no_root_error = 0; - unsigned long max_rpc_payload; - - /* We probably want something more informative here */ - snprintf(sb->s_id, sizeof(sb->s_id), "%x:%x", MAJOR(sb->s_dev), MINOR(sb->s_dev)); - - server = NFS_SB(sb); + if (data->version <= 0 || data->version > NFS_MOUNT_VERSION) { + dprintk("%s: bad mount version\n", __FUNCTION__); + return -EINVAL; + } - sb->s_magic = NFS_SUPER_MAGIC; + switch (data->version) { + case 1: + data->namlen = 0; + case 2: + data->bsize = 0; + case 3: + if (data->flags & NFS_MOUNT_VER3) { + dprintk("%s: mount structure version %d does not support NFSv3\n", + __FUNCTION__, + data->version); + return -EINVAL; + } + data->root.size = NFS2_FHSIZE; + memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE); + case 4: + if (data->flags & NFS_MOUNT_SECFLAVOUR) { + dprintk("%s: mount structure version %d does not support strong security\n", + __FUNCTION__, + data->version); + return -EINVAL; + } + case 5: + memset(data->context, 0, sizeof(data->context)); + } - server->io_stats = nfs_alloc_iostats(); - if (server->io_stats == NULL) - return -ENOMEM; + /* Set the pseudoflavor */ + if (!(data->flags & NFS_MOUNT_SECFLAVOUR)) + data->pseudoflavor = RPC_AUTH_UNIX; - root_inode = nfs_get_root(sb, &server->fh, &fsinfo); - /* Did getting the root inode fail? */ - if (IS_ERR(root_inode)) { - no_root_error = PTR_ERR(root_inode); - goto out_no_root; - } - sb->s_root = d_alloc_root(root_inode); - if (!sb->s_root) { - no_root_error = -ENOMEM; - goto out_no_root; +#ifndef CONFIG_NFS_V3 + /* If NFSv3 is not compiled in, return -EPROTONOSUPPORT */ + if (data->flags & NFS_MOUNT_VER3) { + dprintk("%s: NFSv3 not compiled into kernel\n", __FUNCTION__); + return -EPROTONOSUPPORT; } - sb->s_root->d_op = server->rpc_ops->dentry_ops; - - /* mount time stamp, in seconds */ - server->mount_time = jiffies; - - /* Get some general file system info */ - if (server->namelen == 0 && - server->rpc_ops->pathconf(server, &server->fh, &pathinfo) >= 0) - server->namelen = pathinfo.max_namelen; - /* Work out a lot of parameters */ - if (server->rsize == 0) - server->rsize = nfs_block_size(fsinfo.rtpref, NULL); - if (server->wsize == 0) - server->wsize = nfs_block_size(fsinfo.wtpref, NULL); - - if (fsinfo.rtmax >= 512 && server->rsize > fsinfo.rtmax) - server->rsize = nfs_block_size(fsinfo.rtmax, NULL); - if (fsinfo.wtmax >= 512 && server->wsize > fsinfo.wtmax) - server->wsize = nfs_block_size(fsinfo.wtmax, NULL); - - max_rpc_payload = nfs_block_size(rpc_max_payload(server->client), NULL); - if (server->rsize > max_rpc_payload) - server->rsize = max_rpc_payload; - if (server->rsize > NFS_MAX_FILE_IO_SIZE) - server->rsize = NFS_MAX_FILE_IO_SIZE; - server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - - if (server->wsize > max_rpc_payload) - server->wsize = max_rpc_payload; - if (server->wsize > NFS_MAX_FILE_IO_SIZE) - server->wsize = NFS_MAX_FILE_IO_SIZE; - server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; +#endif /* CONFIG_NFS_V3 */ - if (sb->s_blocksize == 0) - sb->s_blocksize = nfs_block_bits(server->wsize, - &sb->s_blocksize_bits); - server->wtmult = nfs_block_bits(fsinfo.wtmult, NULL); - - server->dtsize = nfs_block_size(fsinfo.dtpref, NULL); - if (server->dtsize > PAGE_CACHE_SIZE) - server->dtsize = PAGE_CACHE_SIZE; - if (server->dtsize > server->rsize) - server->dtsize = server->rsize; - - if (server->flags & NFS_MOUNT_NOAC) { - server->acregmin = server->acregmax = 0; - server->acdirmin = server->acdirmax = 0; - sb->s_flags |= MS_SYNCHRONOUS; + /* We now require that the mount process passes the remote address */ + if (data->addr.sin_addr.s_addr == INADDR_ANY) { + dprintk("%s: mount program didn't pass remote address!\n", + __FUNCTION__); + return -EINVAL; } - server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD; - nfs_super_set_maxbytes(sb, fsinfo.maxfilesize); + /* Prepare the root filehandle */ + if (data->flags & NFS_MOUNT_VER3) + mntfh->size = data->root.size; + else + mntfh->size = NFS2_FHSIZE; + + if (mntfh->size > sizeof(mntfh->data)) { + dprintk("%s: invalid root filehandle\n", __FUNCTION__); + return -EINVAL; + } - server->client->cl_intr = (server->flags & NFS_MOUNT_INTR) ? 1 : 0; - server->client->cl_softrtry = (server->flags & NFS_MOUNT_SOFT) ? 1 : 0; + memcpy(mntfh->data, data->root.data, mntfh->size); + if (mntfh->size < sizeof(mntfh->data)) + memset(mntfh->data + mntfh->size, 0, + sizeof(mntfh->data) - mntfh->size); - /* We're airborne Set socket buffersize */ - rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100); return 0; - /* Yargs. It didn't work out. */ -out_no_root: - dprintk("nfs_sb_init: get root inode failed: errno %d\n", -no_root_error); - if (!IS_ERR(root_inode)) - iput(root_inode); - return no_root_error; } /* - * Initialise the timeout values for a connection + * Initialise the common bits of the superblock */ -static void nfs_init_timeout_values(struct rpc_timeout *to, int proto, unsigned int timeo, unsigned int retrans) +static inline void nfs_initialise_sb(struct super_block *sb) { - to->to_initval = timeo * HZ / 10; - to->to_retries = retrans; - if (!to->to_retries) - to->to_retries = 2; - - switch (proto) { - case IPPROTO_TCP: - if (!to->to_initval) - to->to_initval = 60 * HZ; - if (to->to_initval > NFS_MAX_TCP_TIMEOUT) - to->to_initval = NFS_MAX_TCP_TIMEOUT; - to->to_increment = to->to_initval; - to->to_maxval = to->to_initval + (to->to_increment * to->to_retries); - to->to_exponential = 0; - break; - case IPPROTO_UDP: - default: - if (!to->to_initval) - to->to_initval = 11 * HZ / 10; - if (to->to_initval > NFS_MAX_UDP_TIMEOUT) - to->to_initval = NFS_MAX_UDP_TIMEOUT; - to->to_maxval = NFS_MAX_UDP_TIMEOUT; - to->to_exponential = 1; - break; - } -} + struct nfs_server *server = NFS_SB(sb); -/* - * Create an RPC client handle. - */ -static struct rpc_clnt * -nfs_create_client(struct nfs_server *server, const struct nfs_mount_data *data) -{ - struct rpc_timeout timeparms; - struct rpc_xprt *xprt = NULL; - struct rpc_clnt *clnt = NULL; - int proto = (data->flags & NFS_MOUNT_TCP) ? IPPROTO_TCP : IPPROTO_UDP; - - nfs_init_timeout_values(&timeparms, proto, data->timeo, data->retrans); - - server->retrans_timeo = timeparms.to_initval; - server->retrans_count = timeparms.to_retries; - - /* create transport and client */ - xprt = xprt_create_proto(proto, &server->addr, &timeparms); - if (IS_ERR(xprt)) { - dprintk("%s: cannot create RPC transport. Error = %ld\n", - __FUNCTION__, PTR_ERR(xprt)); - return (struct rpc_clnt *)xprt; - } - clnt = rpc_create_client(xprt, server->hostname, &nfs_program, - server->rpc_ops->version, data->pseudoflavor); - if (IS_ERR(clnt)) { - dprintk("%s: cannot create RPC client. Error = %ld\n", - __FUNCTION__, PTR_ERR(xprt)); - goto out_fail; - } + sb->s_magic = NFS_SUPER_MAGIC; - clnt->cl_intr = 1; - clnt->cl_softrtry = 1; + /* We probably want something more informative here */ + snprintf(sb->s_id, sizeof(sb->s_id), + "%x:%x", MAJOR(sb->s_dev), MINOR(sb->s_dev)); + + if (sb->s_blocksize == 0) + sb->s_blocksize = nfs_block_bits(server->wsize, + &sb->s_blocksize_bits); - return clnt; + if (server->flags & NFS_MOUNT_NOAC) + sb->s_flags |= MS_SYNCHRONOUS; -out_fail: - return clnt; + nfs_super_set_maxbytes(sb, server->maxfilesize); } /* - * Clone a server record + * Finish setting up an NFS2/3 superblock */ -static struct nfs_server *nfs_clone_server(struct super_block *sb, struct nfs_clone_mount *data) +static void nfs_fill_super(struct super_block *sb, struct nfs_mount_data *data) { struct nfs_server *server = NFS_SB(sb); - struct nfs_server *parent = NFS_SB(data->sb); - struct inode *root_inode; - struct nfs_fsinfo fsinfo; - void *err = ERR_PTR(-ENOMEM); - - sb->s_op = data->sb->s_op; - sb->s_blocksize = data->sb->s_blocksize; - sb->s_blocksize_bits = data->sb->s_blocksize_bits; - sb->s_maxbytes = data->sb->s_maxbytes; - - server->client_sys = server->client_acl = ERR_PTR(-EINVAL); - server->io_stats = nfs_alloc_iostats(); - if (server->io_stats == NULL) - goto out; - - server->client = rpc_clone_client(parent->client); - if (IS_ERR((err = server->client))) - goto out; - - if (!IS_ERR(parent->client_sys)) { - server->client_sys = rpc_clone_client(parent->client_sys); - if (IS_ERR((err = server->client_sys))) - goto out; - } - if (!IS_ERR(parent->client_acl)) { - server->client_acl = rpc_clone_client(parent->client_acl); - if (IS_ERR((err = server->client_acl))) - goto out; - } - root_inode = nfs_fhget(sb, data->fh, data->fattr); - if (!root_inode) - goto out; - sb->s_root = d_alloc_root(root_inode); - if (!sb->s_root) - goto out_put_root; - fsinfo.fattr = data->fattr; - if (NFS_PROTO(root_inode)->fsinfo(server, data->fh, &fsinfo) == 0) - nfs_super_set_maxbytes(sb, fsinfo.maxfilesize); - sb->s_root->d_op = server->rpc_ops->dentry_ops; - sb->s_flags |= MS_ACTIVE; - return server; -out_put_root: - iput(root_inode); -out: - return err; -} -/* - * Copy an existing superblock and attach revised data - */ -static int nfs_clone_generic_sb(struct nfs_clone_mount *data, - struct super_block *(*fill_sb)(struct nfs_server *, struct nfs_clone_mount *), - struct nfs_server *(*fill_server)(struct super_block *, struct nfs_clone_mount *), - struct vfsmount *mnt) -{ - struct nfs_server *server; - struct nfs_server *parent = NFS_SB(data->sb); - struct super_block *sb = ERR_PTR(-EINVAL); - char *hostname; - int error = -ENOMEM; - int len; - - server = kmalloc(sizeof(struct nfs_server), GFP_KERNEL); - if (server == NULL) - goto out_err; - memcpy(server, parent, sizeof(*server)); - hostname = (data->hostname != NULL) ? data->hostname : parent->hostname; - len = strlen(hostname) + 1; - server->hostname = kmalloc(len, GFP_KERNEL); - if (server->hostname == NULL) - goto free_server; - memcpy(server->hostname, hostname, len); - error = rpciod_up(); - if (error != 0) - goto free_hostname; - - sb = fill_sb(server, data); - if (IS_ERR(sb)) { - error = PTR_ERR(sb); - goto kill_rpciod; - } - - if (sb->s_root) - goto out_rpciod_down; + sb->s_blocksize_bits = 0; + sb->s_blocksize = 0; + if (data->bsize) + sb->s_blocksize = nfs_block_size(data->bsize, &sb->s_blocksize_bits); - server = fill_server(sb, data); - if (IS_ERR(server)) { - error = PTR_ERR(server); - goto out_deactivate; + if (server->flags & NFS_MOUNT_VER3) { + /* The VFS shouldn't apply the umask to mode bits. We will do + * so ourselves when necessary. + */ + sb->s_flags |= MS_POSIXACL; + sb->s_time_gran = 1; } - return simple_set_mnt(mnt, sb); -out_deactivate: - up_write(&sb->s_umount); - deactivate_super(sb); - return error; -out_rpciod_down: - rpciod_down(); - kfree(server->hostname); - kfree(server); - return simple_set_mnt(mnt, sb); -kill_rpciod: - rpciod_down(); -free_hostname: - kfree(server->hostname); -free_server: - kfree(server); -out_err: - return error; + + sb->s_op = &nfs_sops; + nfs_initialise_sb(sb); } /* - * Set up an NFS2/3 superblock - * - * The way this works is that the mount process passes a structure - * in the data argument which contains the server's IP address - * and the root file handle obtained from the server's mount - * daemon. We stash these away in the private superblock fields. + * Finish setting up a cloned NFS2/3 superblock */ -static int -nfs_fill_super(struct super_block *sb, struct nfs_mount_data *data, int silent) +static void nfs_clone_super(struct super_block *sb, + const struct super_block *old_sb) { - struct nfs_server *server; - rpc_authflavor_t authflavor; + struct nfs_server *server = NFS_SB(sb); - server = NFS_SB(sb); - sb->s_blocksize_bits = 0; - sb->s_blocksize = 0; - if (data->bsize) - sb->s_blocksize = nfs_block_size(data->bsize, &sb->s_blocksize_bits); - if (data->rsize) - server->rsize = nfs_block_size(data->rsize, NULL); - if (data->wsize) - server->wsize = nfs_block_size(data->wsize, NULL); - server->flags = data->flags & NFS_MOUNT_FLAGMASK; - - server->acregmin = data->acregmin*HZ; - server->acregmax = data->acregmax*HZ; - server->acdirmin = data->acdirmin*HZ; - server->acdirmax = data->acdirmax*HZ; - - /* Start lockd here, before we might error out */ - if (!(server->flags & NFS_MOUNT_NONLM)) - lockd_up(); - - server->namelen = data->namlen; - server->hostname = kmalloc(strlen(data->hostname) + 1, GFP_KERNEL); - if (!server->hostname) - return -ENOMEM; - strcpy(server->hostname, data->hostname); - - /* Check NFS protocol revision and initialize RPC op vector - * and file handle pool. */ -#ifdef CONFIG_NFS_V3 - if (server->flags & NFS_MOUNT_VER3) { - server->rpc_ops = &nfs_v3_clientops; - server->caps |= NFS_CAP_READDIRPLUS; - } else { - server->rpc_ops = &nfs_v2_clientops; - } -#else - server->rpc_ops = &nfs_v2_clientops; -#endif + sb->s_blocksize_bits = old_sb->s_blocksize_bits; + sb->s_blocksize = old_sb->s_blocksize; + sb->s_maxbytes = old_sb->s_maxbytes; - /* Fill in pseudoflavor for mount version < 5 */ - if (!(data->flags & NFS_MOUNT_SECFLAVOUR)) - data->pseudoflavor = RPC_AUTH_UNIX; - authflavor = data->pseudoflavor; /* save for sb_init() */ - /* XXX maybe we want to add a server->pseudoflavor field */ - - /* Create RPC client handles */ - server->client = nfs_create_client(server, data); - if (IS_ERR(server->client)) - return PTR_ERR(server->client); - /* RFC 2623, sec 2.3.2 */ - if (authflavor != RPC_AUTH_UNIX) { - struct rpc_auth *auth; - - server->client_sys = rpc_clone_client(server->client); - if (IS_ERR(server->client_sys)) - return PTR_ERR(server->client_sys); - auth = rpcauth_create(RPC_AUTH_UNIX, server->client_sys); - if (IS_ERR(auth)) - return PTR_ERR(auth); - } else { - atomic_inc(&server->client->cl_count); - server->client_sys = server->client; - } if (server->flags & NFS_MOUNT_VER3) { -#ifdef CONFIG_NFS_V3_ACL - if (!(server->flags & NFS_MOUNT_NOACL)) { - server->client_acl = rpc_bind_new_program(server->client, &nfsacl_program, 3); - /* No errors! Assume that Sun nfsacls are supported */ - if (!IS_ERR(server->client_acl)) - server->caps |= NFS_CAP_ACLS; - } -#else - server->flags &= ~NFS_MOUNT_NOACL; -#endif /* CONFIG_NFS_V3_ACL */ - /* - * The VFS shouldn't apply the umask to mode bits. We will - * do so ourselves when necessary. + /* The VFS shouldn't apply the umask to mode bits. We will do + * so ourselves when necessary. */ sb->s_flags |= MS_POSIXACL; - if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN) - server->namelen = NFS3_MAXNAMLEN; sb->s_time_gran = 1; - } else { - if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN) - server->namelen = NFS2_MAXNAMLEN; } - sb->s_op = &nfs_sops; - return nfs_sb_init(sb, authflavor); + sb->s_op = old_sb->s_op; + nfs_initialise_sb(sb); } -static int nfs_set_super(struct super_block *s, void *data) +static int nfs_set_super(struct super_block *s, void *_server) { - s->s_fs_info = data; - return set_anon_super(s, data); + struct nfs_server *server = _server; + int ret; + + s->s_fs_info = server; + ret = set_anon_super(s, server); + if (ret == 0) + server->s_dev = s->s_dev; + return ret; } static int nfs_compare_super(struct super_block *sb, void *data) { - struct nfs_server *server = data; - struct nfs_server *old = NFS_SB(sb); + struct nfs_server *server = data, *old = NFS_SB(sb); - if (old->addr.sin_addr.s_addr != server->addr.sin_addr.s_addr) + if (old->nfs_client != server->nfs_client) return 0; - if (old->addr.sin_port != server->addr.sin_port) + if (memcmp(&old->fsid, &server->fsid, sizeof(old->fsid)) != 0) return 0; - return !nfs_compare_fh(&old->fh, &server->fh); + return 1; } static int nfs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) { - int error; struct nfs_server *server = NULL; struct super_block *s; - struct nfs_fh *root; + struct nfs_fh mntfh; struct nfs_mount_data *data = raw_data; + struct dentry *mntroot; + int error; - error = -EINVAL; - if (data == NULL) { - dprintk("%s: missing data argument\n", __FUNCTION__); - goto out_err_noserver; - } - if (data->version <= 0 || data->version > NFS_MOUNT_VERSION) { - dprintk("%s: bad mount version\n", __FUNCTION__); - goto out_err_noserver; - } - switch (data->version) { - case 1: - data->namlen = 0; - case 2: - data->bsize = 0; - case 3: - if (data->flags & NFS_MOUNT_VER3) { - dprintk("%s: mount structure version %d does not support NFSv3\n", - __FUNCTION__, - data->version); - goto out_err_noserver; - } - data->root.size = NFS2_FHSIZE; - memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE); - case 4: - if (data->flags & NFS_MOUNT_SECFLAVOUR) { - dprintk("%s: mount structure version %d does not support strong security\n", - __FUNCTION__, - data->version); - goto out_err_noserver; - } - case 5: - memset(data->context, 0, sizeof(data->context)); - } -#ifndef CONFIG_NFS_V3 - /* If NFSv3 is not compiled in, return -EPROTONOSUPPORT */ - error = -EPROTONOSUPPORT; - if (data->flags & NFS_MOUNT_VER3) { - dprintk("%s: NFSv3 not compiled into kernel\n", __FUNCTION__); - goto out_err_noserver; - } -#endif /* CONFIG_NFS_V3 */ + /* Validate the mount data */ + error = nfs_validate_mount_data(data, &mntfh); + if (error < 0) + return error; - error = -ENOMEM; - server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL); - if (!server) + /* Get a volume representation */ + server = nfs_create_server(data, &mntfh); + if (IS_ERR(server)) { + error = PTR_ERR(server); goto out_err_noserver; - /* Zero out the NFS state stuff */ - init_nfsv4_state(server); - server->client = server->client_sys = server->client_acl = ERR_PTR(-EINVAL); - - root = &server->fh; - if (data->flags & NFS_MOUNT_VER3) - root->size = data->root.size; - else - root->size = NFS2_FHSIZE; - error = -EINVAL; - if (root->size > sizeof(root->data)) { - dprintk("%s: invalid root filehandle\n", __FUNCTION__); - goto out_err; - } - memcpy(root->data, data->root.data, root->size); - - /* We now require that the mount process passes the remote address */ - memcpy(&server->addr, &data->addr, sizeof(server->addr)); - if (server->addr.sin_addr.s_addr == INADDR_ANY) { - dprintk("%s: mount program didn't pass remote address!\n", - __FUNCTION__); - goto out_err; - } - - /* Fire up rpciod if not yet running */ - error = rpciod_up(); - if (error < 0) { - dprintk("%s: couldn't start rpciod! Error = %d\n", - __FUNCTION__, error); - goto out_err; } + /* Get a superblock - note that we may end up sharing one that already exists */ s = sget(fs_type, nfs_compare_super, nfs_set_super, server); if (IS_ERR(s)) { error = PTR_ERR(s); - goto out_err_rpciod; + goto out_err_nosb; } - if (s->s_root) - goto out_rpciod_down; + if (s->s_fs_info != server) { + nfs_free_server(server); + server = NULL; + } - s->s_flags = flags; + if (!s->s_root) { + /* initial superblock/root creation */ + s->s_flags = flags; + nfs_fill_super(s, data); + } - error = nfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0); - if (error) { - up_write(&s->s_umount); - deactivate_super(s); - return error; + mntroot = nfs_get_root(s, &mntfh); + if (IS_ERR(mntroot)) { + error = PTR_ERR(mntroot); + goto error_splat_super; } - s->s_flags |= MS_ACTIVE; - return simple_set_mnt(mnt, s); -out_rpciod_down: - rpciod_down(); - kfree(server); - return simple_set_mnt(mnt, s); + s->s_flags |= MS_ACTIVE; + mnt->mnt_sb = s; + mnt->mnt_root = mntroot; + return 0; -out_err_rpciod: - rpciod_down(); -out_err: - kfree(server); +out_err_nosb: + nfs_free_server(server); out_err_noserver: return error; + +error_splat_super: + up_write(&s->s_umount); + deactivate_super(s); + return error; } +/* + * Destroy an NFS2/3 superblock + */ static void nfs_kill_super(struct super_block *s) { struct nfs_server *server = NFS_SB(s); kill_anon_super(s); - - if (!IS_ERR(server->client)) - rpc_shutdown_client(server->client); - if (!IS_ERR(server->client_sys)) - rpc_shutdown_client(server->client_sys); - if (!IS_ERR(server->client_acl)) - rpc_shutdown_client(server->client_acl); - - if (!(server->flags & NFS_MOUNT_NONLM)) - lockd_down(); /* release rpc.lockd */ - - rpciod_down(); /* release rpciod */ - - nfs_free_iostats(server->io_stats); - kfree(server->hostname); - kfree(server); - nfs_release_automount_timer(); -} - -static struct super_block *nfs_clone_sb(struct nfs_server *server, struct nfs_clone_mount *data) -{ - struct super_block *sb; - - server->fsid = data->fattr->fsid; - nfs_copy_fh(&server->fh, data->fh); - sb = sget(&nfs_fs_type, nfs_compare_super, nfs_set_super, server); - if (!IS_ERR(sb) && sb->s_root == NULL && !(server->flags & NFS_MOUNT_NONLM)) - lockd_up(); - return sb; + nfs_free_server(server); } -static int nfs_clone_nfs_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) +/* + * Clone an NFS2/3 server record on xdev traversal (FSID-change) + */ +static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags, + const char *dev_name, void *raw_data, + struct vfsmount *mnt) { struct nfs_clone_mount *data = raw_data; - return nfs_clone_generic_sb(data, nfs_clone_sb, nfs_clone_server, mnt); -} + struct super_block *s; + struct nfs_server *server; + struct dentry *mntroot; + int error; -#ifdef CONFIG_NFS_V4 -static struct rpc_clnt *nfs4_create_client(struct nfs_server *server, - struct rpc_timeout *timeparms, int proto, rpc_authflavor_t flavor) -{ - struct nfs4_client *clp; - struct rpc_xprt *xprt = NULL; - struct rpc_clnt *clnt = NULL; - int err = -EIO; - - clp = nfs4_get_client(&server->addr.sin_addr); - if (!clp) { - dprintk("%s: failed to create NFS4 client.\n", __FUNCTION__); - return ERR_PTR(err); - } + dprintk("--> nfs_xdev_get_sb()\n"); - /* Now create transport and client */ - down_write(&clp->cl_sem); - if (IS_ERR(clp->cl_rpcclient)) { - xprt = xprt_create_proto(proto, &server->addr, timeparms); - if (IS_ERR(xprt)) { - up_write(&clp->cl_sem); - err = PTR_ERR(xprt); - dprintk("%s: cannot create RPC transport. Error = %d\n", - __FUNCTION__, err); - goto out_fail; - } - /* Bind to a reserved port! */ - xprt->resvport = 1; - clnt = rpc_create_client(xprt, server->hostname, &nfs_program, - server->rpc_ops->version, flavor); - if (IS_ERR(clnt)) { - up_write(&clp->cl_sem); - err = PTR_ERR(clnt); - dprintk("%s: cannot create RPC client. Error = %d\n", - __FUNCTION__, err); - goto out_fail; - } - clnt->cl_intr = 1; - clnt->cl_softrtry = 1; - clp->cl_rpcclient = clnt; - memcpy(clp->cl_ipaddr, server->ip_addr, sizeof(clp->cl_ipaddr)); - nfs_idmap_new(clp); - } - list_add_tail(&server->nfs4_siblings, &clp->cl_superblocks); - clnt = rpc_clone_client(clp->cl_rpcclient); - if (!IS_ERR(clnt)) - server->nfs4_state = clp; - up_write(&clp->cl_sem); - clp = NULL; - - if (IS_ERR(clnt)) { - dprintk("%s: cannot create RPC client. Error = %d\n", - __FUNCTION__, err); - return clnt; + /* create a new volume representation */ + server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr); + if (IS_ERR(server)) { + error = PTR_ERR(server); + goto out_err_noserver; } - if (server->nfs4_state->cl_idmap == NULL) { - dprintk("%s: failed to create idmapper.\n", __FUNCTION__); - return ERR_PTR(-ENOMEM); + /* Get a superblock - note that we may end up sharing one that already exists */ + s = sget(&nfs_fs_type, nfs_compare_super, nfs_set_super, server); + if (IS_ERR(s)) { + error = PTR_ERR(s); + goto out_err_nosb; } - if (clnt->cl_auth->au_flavor != flavor) { - struct rpc_auth *auth; - - auth = rpcauth_create(flavor, clnt); - if (IS_ERR(auth)) { - dprintk("%s: couldn't create credcache!\n", __FUNCTION__); - return (struct rpc_clnt *)auth; - } + if (s->s_fs_info != server) { + nfs_free_server(server); + server = NULL; } - return clnt; - - out_fail: - if (clp) - nfs4_put_client(clp); - return ERR_PTR(err); -} - -/* - * Set up an NFS4 superblock - */ -static int nfs4_fill_super(struct super_block *sb, struct nfs4_mount_data *data, int silent) -{ - struct nfs_server *server; - struct rpc_timeout timeparms; - rpc_authflavor_t authflavour; - int err = -EIO; - sb->s_blocksize_bits = 0; - sb->s_blocksize = 0; - server = NFS_SB(sb); - if (data->rsize != 0) - server->rsize = nfs_block_size(data->rsize, NULL); - if (data->wsize != 0) - server->wsize = nfs_block_size(data->wsize, NULL); - server->flags = data->flags & NFS_MOUNT_FLAGMASK; - server->caps = NFS_CAP_ATOMIC_OPEN; + if (!s->s_root) { + /* initial superblock/root creation */ + s->s_flags = flags; + nfs_clone_super(s, data->sb); + } - server->acregmin = data->acregmin*HZ; - server->acregmax = data->acregmax*HZ; - server->acdirmin = data->acdirmin*HZ; - server->acdirmax = data->acdirmax*HZ; + mntroot = nfs_get_root(s, data->fh); + if (IS_ERR(mntroot)) { + error = PTR_ERR(mntroot); + goto error_splat_super; + } - server->rpc_ops = &nfs_v4_clientops; + s->s_flags |= MS_ACTIVE; + mnt->mnt_sb = s; + mnt->mnt_root = mntroot; - nfs_init_timeout_values(&timeparms, data->proto, data->timeo, data->retrans); + dprintk("<-- nfs_xdev_get_sb() = 0\n"); + return 0; - server->retrans_timeo = timeparms.to_initval; - server->retrans_count = timeparms.to_retries; +out_err_nosb: + nfs_free_server(server); +out_err_noserver: + dprintk("<-- nfs_xdev_get_sb() = %d [error]\n", error); + return error; - /* Now create transport and client */ - authflavour = RPC_AUTH_UNIX; - if (data->auth_flavourlen != 0) { - if (data->auth_flavourlen != 1) { - dprintk("%s: Invalid number of RPC auth flavours %d.\n", - __FUNCTION__, data->auth_flavourlen); - err = -EINVAL; - goto out_fail; - } - if (copy_from_user(&authflavour, data->auth_flavours, sizeof(authflavour))) { - err = -EFAULT; - goto out_fail; - } - } +error_splat_super: + up_write(&s->s_umount); + deactivate_super(s); + dprintk("<-- nfs_xdev_get_sb() = %d [splat]\n", error); + return error; +} - server->client = nfs4_create_client(server, &timeparms, data->proto, authflavour); - if (IS_ERR(server->client)) { - err = PTR_ERR(server->client); - dprintk("%s: cannot create RPC client. Error = %d\n", - __FUNCTION__, err); - goto out_fail; - } +#ifdef CONFIG_NFS_V4 +/* + * Finish setting up a cloned NFS4 superblock + */ +static void nfs4_clone_super(struct super_block *sb, + const struct super_block *old_sb) +{ + sb->s_blocksize_bits = old_sb->s_blocksize_bits; + sb->s_blocksize = old_sb->s_blocksize; + sb->s_maxbytes = old_sb->s_maxbytes; sb->s_time_gran = 1; - - sb->s_op = &nfs4_sops; - err = nfs_sb_init(sb, authflavour); - - out_fail: - return err; + sb->s_op = old_sb->s_op; + nfs_initialise_sb(sb); } -static int nfs4_compare_super(struct super_block *sb, void *data) +/* + * Set up an NFS4 superblock + */ +static void nfs4_fill_super(struct super_block *sb) { - struct nfs_server *server = data; - struct nfs_server *old = NFS_SB(sb); - - if (strcmp(server->hostname, old->hostname) != 0) - return 0; - if (strcmp(server->mnt_path, old->mnt_path) != 0) - return 0; - return 1; + sb->s_time_gran = 1; + sb->s_op = &nfs4_sops; + nfs_initialise_sb(sb); } -static void * -nfs_copy_user_string(char *dst, struct nfs_string *src, int maxlen) +static void *nfs_copy_user_string(char *dst, struct nfs_string *src, int maxlen) { void *p = NULL; @@ -1297,14 +793,22 @@ nfs_copy_user_string(char *dst, struct nfs_string *src, int maxlen) return dst; } +/* + * Get the superblock for an NFS4 mountpoint + */ static int nfs4_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) { - int error; - struct nfs_server *server; - struct super_block *s; struct nfs4_mount_data *data = raw_data; + struct super_block *s; + struct nfs_server *server; + struct sockaddr_in addr; + rpc_authflavor_t authflavour; + struct nfs_fh mntfh; + struct dentry *mntroot; + char *mntpath = NULL, *hostname = NULL, ip_addr[16]; void *p; + int error; if (data == NULL) { dprintk("%s: missing data argument\n", __FUNCTION__); @@ -1315,84 +819,112 @@ static int nfs4_get_sb(struct file_system_type *fs_type, return -EINVAL; } - server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL); - if (!server) - return -ENOMEM; - /* Zero out the NFS state stuff */ - init_nfsv4_state(server); - server->client = server->client_sys = server->client_acl = ERR_PTR(-EINVAL); + /* We now require that the mount process passes the remote address */ + if (data->host_addrlen != sizeof(addr)) + return -EINVAL; + + if (copy_from_user(&addr, data->host_addr, sizeof(addr))) + return -EFAULT; + + if (addr.sin_family != AF_INET || + addr.sin_addr.s_addr == INADDR_ANY + ) { + dprintk("%s: mount program didn't pass remote IP address!\n", + __FUNCTION__); + return -EINVAL; + } + /* RFC3530: The default port for NFS is 2049 */ + if (addr.sin_port == 0) + addr.sin_port = NFS_PORT; + + /* Grab the authentication type */ + authflavour = RPC_AUTH_UNIX; + if (data->auth_flavourlen != 0) { + if (data->auth_flavourlen != 1) { + dprintk("%s: Invalid number of RPC auth flavours %d.\n", + __FUNCTION__, data->auth_flavourlen); + error = -EINVAL; + goto out_err_noserver; + } + + if (copy_from_user(&authflavour, data->auth_flavours, + sizeof(authflavour))) { + error = -EFAULT; + goto out_err_noserver; + } + } p = nfs_copy_user_string(NULL, &data->hostname, 256); if (IS_ERR(p)) goto out_err; - server->hostname = p; + hostname = p; p = nfs_copy_user_string(NULL, &data->mnt_path, 1024); if (IS_ERR(p)) goto out_err; - server->mnt_path = p; + mntpath = p; + + dprintk("MNTPATH: %s\n", mntpath); - p = nfs_copy_user_string(server->ip_addr, &data->client_addr, - sizeof(server->ip_addr) - 1); + p = nfs_copy_user_string(ip_addr, &data->client_addr, + sizeof(ip_addr) - 1); if (IS_ERR(p)) goto out_err; - /* We now require that the mount process passes the remote address */ - if (data->host_addrlen != sizeof(server->addr)) { - error = -EINVAL; - goto out_free; - } - if (copy_from_user(&server->addr, data->host_addr, sizeof(server->addr))) { - error = -EFAULT; - goto out_free; - } - if (server->addr.sin_family != AF_INET || - server->addr.sin_addr.s_addr == INADDR_ANY) { - dprintk("%s: mount program didn't pass remote IP address!\n", - __FUNCTION__); - error = -EINVAL; - goto out_free; - } - - /* Fire up rpciod if not yet running */ - error = rpciod_up(); - if (error < 0) { - dprintk("%s: couldn't start rpciod! Error = %d\n", - __FUNCTION__, error); - goto out_free; + /* Get a volume representation */ + server = nfs4_create_server(data, hostname, &addr, mntpath, ip_addr, + authflavour, &mntfh); + if (IS_ERR(server)) { + error = PTR_ERR(server); + goto out_err_noserver; } - s = sget(fs_type, nfs4_compare_super, nfs_set_super, server); - + /* Get a superblock - note that we may end up sharing one that already exists */ + s = sget(fs_type, nfs_compare_super, nfs_set_super, server); if (IS_ERR(s)) { error = PTR_ERR(s); goto out_free; } - if (s->s_root) { - kfree(server->mnt_path); - kfree(server->hostname); - kfree(server); - return simple_set_mnt(mnt, s); + if (s->s_fs_info != server) { + nfs_free_server(server); + server = NULL; } - s->s_flags = flags; + if (!s->s_root) { + /* initial superblock/root creation */ + s->s_flags = flags; + nfs4_fill_super(s); + } - error = nfs4_fill_super(s, data, flags & MS_SILENT ? 1 : 0); - if (error) { - up_write(&s->s_umount); - deactivate_super(s); - return error; + mntroot = nfs4_get_root(s, &mntfh); + if (IS_ERR(mntroot)) { + error = PTR_ERR(mntroot); + goto error_splat_super; } + s->s_flags |= MS_ACTIVE; - return simple_set_mnt(mnt, s); + mnt->mnt_sb = s; + mnt->mnt_root = mntroot; + kfree(mntpath); + kfree(hostname); + return 0; + out_err: error = PTR_ERR(p); + goto out_err_noserver; + out_free: - kfree(server->mnt_path); - kfree(server->hostname); - kfree(server); + nfs_free_server(server); +out_err_noserver: + kfree(mntpath); + kfree(hostname); return error; + +error_splat_super: + up_write(&s->s_umount); + deactivate_super(s); + goto out_err_noserver; } static void nfs4_kill_super(struct super_block *sb) @@ -1403,135 +935,140 @@ static void nfs4_kill_super(struct super_block *sb) kill_anon_super(sb); nfs4_renewd_prepare_shutdown(server); + nfs_free_server(server); +} + +/* + * Clone an NFS4 server record on xdev traversal (FSID-change) + */ +static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags, + const char *dev_name, void *raw_data, + struct vfsmount *mnt) +{ + struct nfs_clone_mount *data = raw_data; + struct super_block *s; + struct nfs_server *server; + struct dentry *mntroot; + int error; + + dprintk("--> nfs4_xdev_get_sb()\n"); + + /* create a new volume representation */ + server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr); + if (IS_ERR(server)) { + error = PTR_ERR(server); + goto out_err_noserver; + } + + /* Get a superblock - note that we may end up sharing one that already exists */ + s = sget(&nfs_fs_type, nfs_compare_super, nfs_set_super, server); + if (IS_ERR(s)) { + error = PTR_ERR(s); + goto out_err_nosb; + } - if (server->client != NULL && !IS_ERR(server->client)) - rpc_shutdown_client(server->client); + if (s->s_fs_info != server) { + nfs_free_server(server); + server = NULL; + } - destroy_nfsv4_state(server); + if (!s->s_root) { + /* initial superblock/root creation */ + s->s_flags = flags; + nfs4_clone_super(s, data->sb); + } + + mntroot = nfs4_get_root(s, data->fh); + if (IS_ERR(mntroot)) { + error = PTR_ERR(mntroot); + goto error_splat_super; + } - rpciod_down(); + s->s_flags |= MS_ACTIVE; + mnt->mnt_sb = s; + mnt->mnt_root = mntroot; + + dprintk("<-- nfs4_xdev_get_sb() = 0\n"); + return 0; + +out_err_nosb: + nfs_free_server(server); +out_err_noserver: + dprintk("<-- nfs4_xdev_get_sb() = %d [error]\n", error); + return error; - nfs_free_iostats(server->io_stats); - kfree(server->hostname); - kfree(server); - nfs_release_automount_timer(); +error_splat_super: + up_write(&s->s_umount); + deactivate_super(s); + dprintk("<-- nfs4_xdev_get_sb() = %d [splat]\n", error); + return error; } /* - * Constructs the SERVER-side path + * Create an NFS4 server record on referral traversal */ -static inline char *nfs4_dup_path(const struct dentry *dentry) +static int nfs4_referral_get_sb(struct file_system_type *fs_type, int flags, + const char *dev_name, void *raw_data, + struct vfsmount *mnt) { - char *page = (char *) __get_free_page(GFP_USER); - char *path; + struct nfs_clone_mount *data = raw_data; + struct super_block *s; + struct nfs_server *server; + struct dentry *mntroot; + struct nfs_fh mntfh; + int error; - path = nfs4_path(dentry, page, PAGE_SIZE); - if (!IS_ERR(path)) { - int len = PAGE_SIZE + page - path; - char *tmp = path; + dprintk("--> nfs4_referral_get_sb()\n"); - path = kmalloc(len, GFP_KERNEL); - if (path) - memcpy(path, tmp, len); - else - path = ERR_PTR(-ENOMEM); + /* create a new volume representation */ + server = nfs4_create_referral_server(data, &mntfh); + if (IS_ERR(server)) { + error = PTR_ERR(server); + goto out_err_noserver; } - free_page((unsigned long)page); - return path; -} -static struct super_block *nfs4_clone_sb(struct nfs_server *server, struct nfs_clone_mount *data) -{ - const struct dentry *dentry = data->dentry; - struct nfs4_client *clp = server->nfs4_state; - struct super_block *sb; - - server->fsid = data->fattr->fsid; - nfs_copy_fh(&server->fh, data->fh); - server->mnt_path = nfs4_dup_path(dentry); - if (IS_ERR(server->mnt_path)) { - sb = (struct super_block *)server->mnt_path; - goto err; + /* Get a superblock - note that we may end up sharing one that already exists */ + s = sget(&nfs_fs_type, nfs_compare_super, nfs_set_super, server); + if (IS_ERR(s)) { + error = PTR_ERR(s); + goto out_err_nosb; } - sb = sget(&nfs4_fs_type, nfs4_compare_super, nfs_set_super, server); - if (IS_ERR(sb) || sb->s_root) - goto free_path; - nfs4_server_capabilities(server, &server->fh); - - down_write(&clp->cl_sem); - atomic_inc(&clp->cl_count); - list_add_tail(&server->nfs4_siblings, &clp->cl_superblocks); - up_write(&clp->cl_sem); - return sb; -free_path: - kfree(server->mnt_path); -err: - server->mnt_path = NULL; - return sb; -} -static int nfs_clone_nfs4_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) -{ - struct nfs_clone_mount *data = raw_data; - return nfs_clone_generic_sb(data, nfs4_clone_sb, nfs_clone_server, mnt); -} + if (s->s_fs_info != server) { + nfs_free_server(server); + server = NULL; + } -static struct super_block *nfs4_referral_sb(struct nfs_server *server, struct nfs_clone_mount *data) -{ - struct super_block *sb = ERR_PTR(-ENOMEM); - int len; - - len = strlen(data->mnt_path) + 1; - server->mnt_path = kmalloc(len, GFP_KERNEL); - if (server->mnt_path == NULL) - goto err; - memcpy(server->mnt_path, data->mnt_path, len); - memcpy(&server->addr, data->addr, sizeof(struct sockaddr_in)); - - sb = sget(&nfs4_fs_type, nfs4_compare_super, nfs_set_super, server); - if (IS_ERR(sb) || sb->s_root) - goto free_path; - return sb; -free_path: - kfree(server->mnt_path); -err: - server->mnt_path = NULL; - return sb; -} + if (!s->s_root) { + /* initial superblock/root creation */ + s->s_flags = flags; + nfs4_fill_super(s); + } -static struct nfs_server *nfs4_referral_server(struct super_block *sb, struct nfs_clone_mount *data) -{ - struct nfs_server *server = NFS_SB(sb); - struct rpc_timeout timeparms; - int proto, timeo, retrans; - void *err; - - proto = IPPROTO_TCP; - /* Since we are following a referral and there may be alternatives, - set the timeouts and retries to low values */ - timeo = 2; - retrans = 1; - nfs_init_timeout_values(&timeparms, proto, timeo, retrans); - - server->client = nfs4_create_client(server, &timeparms, proto, data->authflavor); - if (IS_ERR((err = server->client))) - goto out_err; + mntroot = nfs4_get_root(s, data->fh); + if (IS_ERR(mntroot)) { + error = PTR_ERR(mntroot); + goto error_splat_super; + } - sb->s_time_gran = 1; - sb->s_op = &nfs4_sops; - err = ERR_PTR(nfs_sb_init(sb, data->authflavor)); - if (!IS_ERR(err)) - return server; -out_err: - return (struct nfs_server *)err; -} + s->s_flags |= MS_ACTIVE; + mnt->mnt_sb = s; + mnt->mnt_root = mntroot; -static int nfs_referral_nfs4_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) -{ - struct nfs_clone_mount *data = raw_data; - return nfs_clone_generic_sb(data, nfs4_referral_sb, nfs4_referral_server, mnt); + dprintk("<-- nfs4_referral_get_sb() = 0\n"); + return 0; + +out_err_nosb: + nfs_free_server(server); +out_err_noserver: + dprintk("<-- nfs4_referral_get_sb() = %d [error]\n", error); + return error; + +error_splat_super: + up_write(&s->s_umount); + deactivate_super(s); + dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error); + return error; } -#endif +#endif /* CONFIG_NFS_V4 */ diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 7084ac9a6455..b674462793d3 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -396,6 +396,7 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) out: clear_bit(BDI_write_congested, &bdi->state); wake_up_all(&nfs_write_congestion); + writeback_congestion_end(); return err; } @@ -1252,7 +1253,13 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) dprintk("NFS: %4d nfs_writeback_done (status %d)\n", task->tk_pid, task->tk_status); - /* Call the NFS version-specific code */ + /* + * ->write_done will attempt to use post-op attributes to detect + * conflicting writes by other clients. A strict interpretation + * of close-to-open would allow us to continue caching even if + * another writer had changed the file, but some applications + * depend on tighter cache coherency when writing. + */ status = NFS_PROTO(data->inode)->write_done(task, data); if (status != 0) return status; @@ -1273,7 +1280,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) if (time_before(complain, jiffies)) { dprintk("NFS: faulty NFS server %s:" " (committed = %d) != (stable = %d)\n", - NFS_SERVER(data->inode)->hostname, + NFS_SERVER(data->inode)->nfs_client->cl_hostname, resp->verf->committed, argp->stable); complain = jiffies + 300 * HZ; } @@ -1558,7 +1565,6 @@ void nfs_destroy_writepagecache(void) { mempool_destroy(nfs_commit_mempool); mempool_destroy(nfs_wdata_mempool); - if (kmem_cache_destroy(nfs_wdata_cachep)) - printk(KERN_INFO "nfs_write_data: not all structures were freed\n"); + kmem_cache_destroy(nfs_wdata_cachep); } diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 54b37b1d2e3a..8583d99ee740 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -375,16 +375,28 @@ nfsd4_probe_callback(struct nfs4_client *clp) { struct sockaddr_in addr; struct nfs4_callback *cb = &clp->cl_callback; - struct rpc_timeout timeparms; - struct rpc_xprt * xprt; + struct rpc_timeout timeparms = { + .to_initval = (NFSD_LEASE_TIME/4) * HZ, + .to_retries = 5, + .to_maxval = (NFSD_LEASE_TIME/2) * HZ, + .to_exponential = 1, + }; struct rpc_program * program = &cb->cb_program; - struct rpc_stat * stat = &cb->cb_stat; - struct rpc_clnt * clnt; + struct rpc_create_args args = { + .protocol = IPPROTO_TCP, + .address = (struct sockaddr *)&addr, + .addrsize = sizeof(addr), + .timeout = &timeparms, + .servername = clp->cl_name.data, + .program = program, + .version = nfs_cb_version[1]->number, + .authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */ + .flags = (RPC_CLNT_CREATE_NOPING), + }; struct rpc_message msg = { .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], .rpc_argp = clp, }; - char hostname[32]; int status; if (atomic_read(&cb->cb_set)) @@ -396,51 +408,27 @@ nfsd4_probe_callback(struct nfs4_client *clp) addr.sin_port = htons(cb->cb_port); addr.sin_addr.s_addr = htonl(cb->cb_addr); - /* Initialize timeout */ - timeparms.to_initval = (NFSD_LEASE_TIME/4) * HZ; - timeparms.to_retries = 0; - timeparms.to_maxval = (NFSD_LEASE_TIME/2) * HZ; - timeparms.to_exponential = 1; - - /* Create RPC transport */ - xprt = xprt_create_proto(IPPROTO_TCP, &addr, &timeparms); - if (IS_ERR(xprt)) { - dprintk("NFSD: couldn't create callback transport!\n"); - goto out_err; - } - /* Initialize rpc_program */ program->name = "nfs4_cb"; program->number = cb->cb_prog; program->nrvers = ARRAY_SIZE(nfs_cb_version); program->version = nfs_cb_version; - program->stats = stat; + program->stats = &cb->cb_stat; /* Initialize rpc_stat */ - memset(stat, 0, sizeof(struct rpc_stat)); - stat->program = program; - - /* Create RPC client - * - * XXX AUTH_UNIX only - need AUTH_GSS.... - */ - sprintf(hostname, "%u.%u.%u.%u", NIPQUAD(addr.sin_addr.s_addr)); - clnt = rpc_new_client(xprt, hostname, program, 1, RPC_AUTH_UNIX); - if (IS_ERR(clnt)) { + memset(program->stats, 0, sizeof(cb->cb_stat)); + program->stats->program = program; + + /* Create RPC client */ + cb->cb_client = rpc_create(&args); + if (!cb->cb_client) { dprintk("NFSD: couldn't create callback client\n"); goto out_err; } - clnt->cl_intr = 0; - clnt->cl_softrtry = 1; /* Kick rpciod, put the call on the wire. */ - - if (rpciod_up() != 0) { - dprintk("nfsd: couldn't start rpciod for callbacks!\n"); + if (rpciod_up() != 0) goto out_clnt; - } - - cb->cb_client = clnt; /* the task holds a reference to the nfs4_client struct */ atomic_inc(&clp->cl_count); @@ -448,7 +436,7 @@ nfsd4_probe_callback(struct nfs4_client *clp) msg.rpc_cred = nfsd4_lookupcred(clp,0); if (IS_ERR(msg.rpc_cred)) goto out_rpciod; - status = rpc_call_async(clnt, &msg, RPC_TASK_ASYNC, &nfs4_cb_null_ops, NULL); + status = rpc_call_async(cb->cb_client, &msg, RPC_TASK_ASYNC, &nfs4_cb_null_ops, NULL); put_rpccred(msg.rpc_cred); if (status != 0) { @@ -462,7 +450,7 @@ out_rpciod: rpciod_down(); cb->cb_client = NULL; out_clnt: - rpc_shutdown_client(clnt); + rpc_shutdown_client(cb->cb_client); out_err: dprintk("NFSD: warning: no callback path to client %.*s\n", (int)clp->cl_name.len, clp->cl_name.data); diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index bea6b9478114..b1902ebaab41 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -573,10 +573,9 @@ idmap_lookup(struct svc_rqst *rqstp, struct idmap_defer_req *mdr; int ret; - mdr = kmalloc(sizeof(*mdr), GFP_KERNEL); + mdr = kzalloc(sizeof(*mdr), GFP_KERNEL); if (!mdr) return -ENOMEM; - memset(mdr, 0, sizeof(*mdr)); atomic_set(&mdr->count, 1); init_waitqueue_head(&mdr->waitq); mdr->req.defer = idmap_defer; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 9daa0b9feb8d..ebcf226a9e4a 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -339,8 +339,7 @@ alloc_client(struct xdr_netobj name) { struct nfs4_client *clp; - if ((clp = kmalloc(sizeof(struct nfs4_client), GFP_KERNEL))!= NULL) { - memset(clp, 0, sizeof(*clp)); + if ((clp = kzalloc(sizeof(struct nfs4_client), GFP_KERNEL))!= NULL) { if ((clp->cl_name.data = kmalloc(name.len, GFP_KERNEL)) != NULL) { memcpy(clp->cl_name.data, name.data, name.len); clp->cl_name.len = name.len; @@ -1006,13 +1005,10 @@ alloc_init_file(struct inode *ino) static void nfsd4_free_slab(kmem_cache_t **slab) { - int status; - if (*slab == NULL) return; - status = kmem_cache_destroy(*slab); + kmem_cache_destroy(*slab); *slab = NULL; - WARN_ON(status); } static void diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c index d1e2c6f9f05e..85c36b8ca452 100644 --- a/fs/ntfs/dir.c +++ b/fs/ntfs/dir.c @@ -1149,8 +1149,7 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir) * Allocate a buffer to store the current name being processed * converted to format determined by current NLS. */ - name = (u8*)kmalloc(NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1, - GFP_NOFS); + name = kmalloc(NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1, GFP_NOFS); if (unlikely(!name)) { err = -ENOMEM; goto err_out; @@ -1191,7 +1190,7 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir) * map the mft record without deadlocking. */ rc = le32_to_cpu(ctx->attr->data.resident.value_length); - ir = (INDEX_ROOT*)kmalloc(rc, GFP_NOFS); + ir = kmalloc(rc, GFP_NOFS); if (unlikely(!ir)) { err = -ENOMEM; goto err_out; diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index d313f356e66a..933dbd89c2a4 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -137,7 +137,7 @@ static int ntfs_init_locked_inode(struct inode *vi, ntfs_attr *na) BUG_ON(!na->name); i = na->name_len * sizeof(ntfschar); - ni->name = (ntfschar*)kmalloc(i + sizeof(ntfschar), GFP_ATOMIC); + ni->name = kmalloc(i + sizeof(ntfschar), GFP_ATOMIC); if (!ni->name) return -ENOMEM; memcpy(ni->name, na->name, i); @@ -556,8 +556,6 @@ static int ntfs_read_locked_inode(struct inode *vi) /* Setup the generic vfs inode parts now. */ - /* This is the optimal IO size (for stat), not the fs block size. */ - vi->i_blksize = PAGE_CACHE_SIZE; /* * This is for checking whether an inode has changed w.r.t. a file so * that the file can be updated if necessary (compare with f_version). @@ -1234,7 +1232,6 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi) base_ni = NTFS_I(base_vi); /* Just mirror the values from the base inode. */ - vi->i_blksize = base_vi->i_blksize; vi->i_version = base_vi->i_version; vi->i_uid = base_vi->i_uid; vi->i_gid = base_vi->i_gid; @@ -1504,7 +1501,6 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi) ni = NTFS_I(vi); base_ni = NTFS_I(base_vi); /* Just mirror the values from the base inode. */ - vi->i_blksize = base_vi->i_blksize; vi->i_version = base_vi->i_version; vi->i_uid = base_vi->i_uid; vi->i_gid = base_vi->i_gid; diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 2438c00ec0ce..584260fd6848 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -331,7 +331,7 @@ map_err_out: ntfs_inode **tmp; int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode *); - tmp = (ntfs_inode **)kmalloc(new_size, GFP_NOFS); + tmp = kmalloc(new_size, GFP_NOFS); if (unlikely(!tmp)) { ntfs_error(base_ni->vol->sb, "Failed to allocate " "internal buffer."); @@ -2638,11 +2638,6 @@ mft_rec_already_initialized: } vi->i_ino = bit; /* - * This is the optimal IO size (for stat), not the fs block - * size. - */ - vi->i_blksize = PAGE_CACHE_SIZE; - /* * This is for checking whether an inode has changed w.r.t. a * file so that the file can be updated if necessary (compare * with f_version). @@ -2893,7 +2888,7 @@ rollback: if (!(base_ni->nr_extents & 3)) { int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode*); - extent_nis = (ntfs_inode**)kmalloc(new_size, GFP_NOFS); + extent_nis = kmalloc(new_size, GFP_NOFS); if (unlikely(!extent_nis)) { ntfs_error(vol->sb, "Failed to allocate internal " "buffer during rollback.%s", es); diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 74e0ee8fce72..6b2712f10dd2 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -3248,32 +3248,14 @@ ictx_err_out: static void __exit exit_ntfs_fs(void) { - int err = 0; - ntfs_debug("Unregistering NTFS driver."); unregister_filesystem(&ntfs_fs_type); - - if (kmem_cache_destroy(ntfs_big_inode_cache) && (err = 1)) - printk(KERN_CRIT "NTFS: Failed to destory %s.\n", - ntfs_big_inode_cache_name); - if (kmem_cache_destroy(ntfs_inode_cache) && (err = 1)) - printk(KERN_CRIT "NTFS: Failed to destory %s.\n", - ntfs_inode_cache_name); - if (kmem_cache_destroy(ntfs_name_cache) && (err = 1)) - printk(KERN_CRIT "NTFS: Failed to destory %s.\n", - ntfs_name_cache_name); - if (kmem_cache_destroy(ntfs_attr_ctx_cache) && (err = 1)) - printk(KERN_CRIT "NTFS: Failed to destory %s.\n", - ntfs_attr_ctx_cache_name); - if (kmem_cache_destroy(ntfs_index_ctx_cache) && (err = 1)) - printk(KERN_CRIT "NTFS: Failed to destory %s.\n", - ntfs_index_ctx_cache_name); - if (err) - printk(KERN_CRIT "NTFS: This causes memory to leak! There is " - "probably a BUG in the driver! Please report " - "you saw this message to " - "linux-ntfs-dev@lists.sourceforge.net\n"); + kmem_cache_destroy(ntfs_big_inode_cache); + kmem_cache_destroy(ntfs_inode_cache); + kmem_cache_destroy(ntfs_name_cache); + kmem_cache_destroy(ntfs_attr_ctx_cache); + kmem_cache_destroy(ntfs_index_ctx_cache); /* Unregister the ntfs sysctls. */ ntfs_sysctl(0); } diff --git a/fs/ntfs/unistr.c b/fs/ntfs/unistr.c index b123c0fa6bf6..a1b572196fe4 100644 --- a/fs/ntfs/unistr.c +++ b/fs/ntfs/unistr.c @@ -350,7 +350,7 @@ int ntfs_ucstonls(const ntfs_volume *vol, const ntfschar *ins, } if (!ns) { ns_len = ins_len * NLS_MAX_CHARSET_SIZE; - ns = (unsigned char*)kmalloc(ns_len + 1, GFP_NOFS); + ns = kmalloc(ns_len + 1, GFP_NOFS); if (!ns) goto mem_err_out; } @@ -365,7 +365,7 @@ retry: wc = nls->uni2char(le16_to_cpu(ins[i]), ns + o, else if (wc == -ENAMETOOLONG && ns != *outs) { unsigned char *tc; /* Grow in multiples of 64 bytes. */ - tc = (unsigned char*)kmalloc((ns_len + 64) & + tc = kmalloc((ns_len + 64) & ~63, GFP_NOFS); if (tc) { memcpy(tc, ns, ns_len); diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index ff9e2e2104c2..4b46aac7d243 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -44,11 +44,17 @@ * locking semantics of the file system using the protocol. It should * be somewhere else, I'm sure, but right now it isn't. * + * New in version 4: + * - Remove i_generation from lock names for better stat performance. + * + * New in version 3: + * - Replace dentry votes with a cluster lock + * * New in version 2: * - full 64 bit i_size in the metadata lock lvbs * - introduction of "rw" lock and pushing meta/data locking down */ -#define O2NET_PROTOCOL_VERSION 2ULL +#define O2NET_PROTOCOL_VERSION 4ULL struct o2net_handshake { __be64 protocol_version; __be64 connector_id; diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index 1a01380e3878..014e73978dac 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -35,15 +35,17 @@ #include "alloc.h" #include "dcache.h" +#include "dlmglue.h" #include "file.h" #include "inode.h" + static int ocfs2_dentry_revalidate(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; int ret = 0; /* if all else fails, just return false */ - struct ocfs2_super *osb; + struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); mlog_entry("(0x%p, '%.*s')\n", dentry, dentry->d_name.len, dentry->d_name.name); @@ -55,28 +57,31 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry, goto bail; } - osb = OCFS2_SB(inode->i_sb); - BUG_ON(!osb); - if (inode != osb->root_inode) { - spin_lock(&OCFS2_I(inode)->ip_lock); - /* did we or someone else delete this inode? */ - if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { - spin_unlock(&OCFS2_I(inode)->ip_lock); - mlog(0, "inode (%llu) deleted, returning false\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno); - goto bail; - } + if (inode == osb->root_inode || is_bad_inode(inode)) + goto bail; + + spin_lock(&OCFS2_I(inode)->ip_lock); + /* did we or someone else delete this inode? */ + if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { spin_unlock(&OCFS2_I(inode)->ip_lock); + mlog(0, "inode (%llu) deleted, returning false\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + goto bail; + } + spin_unlock(&OCFS2_I(inode)->ip_lock); - if (!inode->i_nlink) { - mlog(0, "Inode %llu orphaned, returning false " - "dir = %d\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, - S_ISDIR(inode->i_mode)); - goto bail; - } + /* + * We don't need a cluster lock to test this because once an + * inode nlink hits zero, it never goes back. + */ + if (inode->i_nlink == 0) { + mlog(0, "Inode %llu orphaned, returning false " + "dir = %d\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + S_ISDIR(inode->i_mode)); + goto bail; } ret = 1; @@ -87,6 +92,322 @@ bail: return ret; } +static int ocfs2_match_dentry(struct dentry *dentry, + u64 parent_blkno, + int skip_unhashed) +{ + struct inode *parent; + + /* + * ocfs2_lookup() does a d_splice_alias() _before_ attaching + * to the lock data, so we skip those here, otherwise + * ocfs2_dentry_attach_lock() will get its original dentry + * back. + */ + if (!dentry->d_fsdata) + return 0; + + if (!dentry->d_parent) + return 0; + + if (skip_unhashed && d_unhashed(dentry)) + return 0; + + parent = dentry->d_parent->d_inode; + /* Negative parent dentry? */ + if (!parent) + return 0; + + /* Name is in a different directory. */ + if (OCFS2_I(parent)->ip_blkno != parent_blkno) + return 0; + + return 1; +} + +/* + * Walk the inode alias list, and find a dentry which has a given + * parent. ocfs2_dentry_attach_lock() wants to find _any_ alias as it + * is looking for a dentry_lock reference. The vote thread is looking + * to unhash aliases, so we allow it to skip any that already have + * that property. + */ +struct dentry *ocfs2_find_local_alias(struct inode *inode, + u64 parent_blkno, + int skip_unhashed) +{ + struct list_head *p; + struct dentry *dentry = NULL; + + spin_lock(&dcache_lock); + + list_for_each(p, &inode->i_dentry) { + dentry = list_entry(p, struct dentry, d_alias); + + if (ocfs2_match_dentry(dentry, parent_blkno, skip_unhashed)) { + mlog(0, "dentry found: %.*s\n", + dentry->d_name.len, dentry->d_name.name); + + dget_locked(dentry); + break; + } + + dentry = NULL; + } + + spin_unlock(&dcache_lock); + + return dentry; +} + +DEFINE_SPINLOCK(dentry_attach_lock); + +/* + * Attach this dentry to a cluster lock. + * + * Dentry locks cover all links in a given directory to a particular + * inode. We do this so that ocfs2 can build a lock name which all + * nodes in the cluster can agree on at all times. Shoving full names + * in the cluster lock won't work due to size restrictions. Covering + * links inside of a directory is a good compromise because it still + * allows us to use the parent directory lock to synchronize + * operations. + * + * Call this function with the parent dir semaphore and the parent dir + * cluster lock held. + * + * The dir semaphore will protect us from having to worry about + * concurrent processes on our node trying to attach a lock at the + * same time. + * + * The dir cluster lock (held at either PR or EX mode) protects us + * from unlink and rename on other nodes. + * + * A dput() can happen asynchronously due to pruning, so we cover + * attaching and detaching the dentry lock with a + * dentry_attach_lock. + * + * A node which has done lookup on a name retains a protected read + * lock until final dput. If the user requests and unlink or rename, + * the protected read is upgraded to an exclusive lock. Other nodes + * who have seen the dentry will then be informed that they need to + * downgrade their lock, which will involve d_delete on the + * dentry. This happens in ocfs2_dentry_convert_worker(). + */ +int ocfs2_dentry_attach_lock(struct dentry *dentry, + struct inode *inode, + u64 parent_blkno) +{ + int ret; + struct dentry *alias; + struct ocfs2_dentry_lock *dl = dentry->d_fsdata; + + mlog(0, "Attach \"%.*s\", parent %llu, fsdata: %p\n", + dentry->d_name.len, dentry->d_name.name, + (unsigned long long)parent_blkno, dl); + + /* + * Negative dentry. We ignore these for now. + * + * XXX: Could we can improve ocfs2_dentry_revalidate() by + * tracking these? + */ + if (!inode) + return 0; + + if (dl) { + mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno, + " \"%.*s\": old parent: %llu, new: %llu\n", + dentry->d_name.len, dentry->d_name.name, + (unsigned long long)parent_blkno, + (unsigned long long)dl->dl_parent_blkno); + return 0; + } + + alias = ocfs2_find_local_alias(inode, parent_blkno, 0); + if (alias) { + /* + * Great, an alias exists, which means we must have a + * dentry lock already. We can just grab the lock off + * the alias and add it to the list. + * + * We're depending here on the fact that this dentry + * was found and exists in the dcache and so must have + * a reference to the dentry_lock because we can't + * race creates. Final dput() cannot happen on it + * since we have it pinned, so our reference is safe. + */ + dl = alias->d_fsdata; + mlog_bug_on_msg(!dl, "parent %llu, ino %llu\n", + (unsigned long long)parent_blkno, + (unsigned long long)OCFS2_I(inode)->ip_blkno); + + mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno, + " \"%.*s\": old parent: %llu, new: %llu\n", + dentry->d_name.len, dentry->d_name.name, + (unsigned long long)parent_blkno, + (unsigned long long)dl->dl_parent_blkno); + + mlog(0, "Found: %s\n", dl->dl_lockres.l_name); + + goto out_attach; + } + + /* + * There are no other aliases + */ + dl = kmalloc(sizeof(*dl), GFP_NOFS); + if (!dl) { + ret = -ENOMEM; + mlog_errno(ret); + return ret; + } + + dl->dl_count = 0; + /* + * Does this have to happen below, for all attaches, in case + * the struct inode gets blown away by votes? + */ + dl->dl_inode = igrab(inode); + dl->dl_parent_blkno = parent_blkno; + ocfs2_dentry_lock_res_init(dl, parent_blkno, inode); + +out_attach: + spin_lock(&dentry_attach_lock); + dentry->d_fsdata = dl; + dl->dl_count++; + spin_unlock(&dentry_attach_lock); + + /* + * This actually gets us our PRMODE level lock. From now on, + * we'll have a notification if one of these names is + * destroyed on another node. + */ + ret = ocfs2_dentry_lock(dentry, 0); + if (!ret) + ocfs2_dentry_unlock(dentry, 0); + else + mlog_errno(ret); + + dput(alias); + + return ret; +} + +/* + * ocfs2_dentry_iput() and friends. + * + * At this point, our particular dentry is detached from the inodes + * alias list, so there's no way that the locking code can find it. + * + * The interesting stuff happens when we determine that our lock needs + * to go away because this is the last subdir alias in the + * system. This function needs to handle a couple things: + * + * 1) Synchronizing lock shutdown with the downconvert threads. This + * is already handled for us via the lockres release drop function + * called in ocfs2_release_dentry_lock() + * + * 2) A race may occur when we're doing our lock shutdown and + * another process wants to create a new dentry lock. Right now we + * let them race, which means that for a very short while, this + * node might have two locks on a lock resource. This should be a + * problem though because one of them is in the process of being + * thrown out. + */ +static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb, + struct ocfs2_dentry_lock *dl) +{ + ocfs2_simple_drop_lockres(osb, &dl->dl_lockres); + ocfs2_lock_res_free(&dl->dl_lockres); + iput(dl->dl_inode); + kfree(dl); +} + +void ocfs2_dentry_lock_put(struct ocfs2_super *osb, + struct ocfs2_dentry_lock *dl) +{ + int unlock = 0; + + BUG_ON(dl->dl_count == 0); + + spin_lock(&dentry_attach_lock); + dl->dl_count--; + unlock = !dl->dl_count; + spin_unlock(&dentry_attach_lock); + + if (unlock) + ocfs2_drop_dentry_lock(osb, dl); +} + +static void ocfs2_dentry_iput(struct dentry *dentry, struct inode *inode) +{ + struct ocfs2_dentry_lock *dl = dentry->d_fsdata; + + mlog_bug_on_msg(!dl && !(dentry->d_flags & DCACHE_DISCONNECTED), + "dentry: %.*s\n", dentry->d_name.len, + dentry->d_name.name); + + if (!dl) + goto out; + + mlog_bug_on_msg(dl->dl_count == 0, "dentry: %.*s, count: %u\n", + dentry->d_name.len, dentry->d_name.name, + dl->dl_count); + + ocfs2_dentry_lock_put(OCFS2_SB(dentry->d_sb), dl); + +out: + iput(inode); +} + +/* + * d_move(), but keep the locks in sync. + * + * When we are done, "dentry" will have the parent dir and name of + * "target", which will be thrown away. + * + * We manually update the lock of "dentry" if need be. + * + * "target" doesn't have it's dentry lock touched - we allow the later + * dput() to handle this for us. + * + * This is called during ocfs2_rename(), while holding parent + * directory locks. The dentries have already been deleted on other + * nodes via ocfs2_remote_dentry_delete(). + * + * Normally, the VFS handles the d_move() for the file sytem, after + * the ->rename() callback. OCFS2 wants to handle this internally, so + * the new lock can be created atomically with respect to the cluster. + */ +void ocfs2_dentry_move(struct dentry *dentry, struct dentry *target, + struct inode *old_dir, struct inode *new_dir) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(old_dir->i_sb); + struct inode *inode = dentry->d_inode; + + /* + * Move within the same directory, so the actual lock info won't + * change. + * + * XXX: Is there any advantage to dropping the lock here? + */ + if (old_dir == new_dir) + goto out_move; + + ocfs2_dentry_lock_put(osb, dentry->d_fsdata); + + dentry->d_fsdata = NULL; + ret = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(new_dir)->ip_blkno); + if (ret) + mlog_errno(ret); + +out_move: + d_move(dentry, target); +} + struct dentry_operations ocfs2_dentry_ops = { .d_revalidate = ocfs2_dentry_revalidate, + .d_iput = ocfs2_dentry_iput, }; diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h index 90072771114b..c091c34d9883 100644 --- a/fs/ocfs2/dcache.h +++ b/fs/ocfs2/dcache.h @@ -28,4 +28,31 @@ extern struct dentry_operations ocfs2_dentry_ops; +struct ocfs2_dentry_lock { + unsigned int dl_count; + u64 dl_parent_blkno; + + /* + * The ocfs2_dentry_lock keeps an inode reference until + * dl_lockres has been destroyed. This is usually done in + * ->d_iput() anyway, so there should be minimal impact. + */ + struct inode *dl_inode; + struct ocfs2_lock_res dl_lockres; +}; + +int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode, + u64 parent_blkno); + +void ocfs2_dentry_lock_put(struct ocfs2_super *osb, + struct ocfs2_dentry_lock *dl); + +struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno, + int skip_unhashed); + +void ocfs2_dentry_move(struct dentry *dentry, struct dentry *target, + struct inode *old_dir, struct inode *new_dir); + +extern spinlock_t dentry_attach_lock; + #endif /* OCFS2_DCACHE_H */ diff --git a/fs/ocfs2/dlm/dlmapi.h b/fs/ocfs2/dlm/dlmapi.h index 53652f51c0e1..cfd5cb65cab0 100644 --- a/fs/ocfs2/dlm/dlmapi.h +++ b/fs/ocfs2/dlm/dlmapi.h @@ -182,6 +182,7 @@ enum dlm_status dlmlock(struct dlm_ctxt *dlm, struct dlm_lockstatus *lksb, int flags, const char *name, + int namelen, dlm_astlockfunc_t *ast, void *data, dlm_bastlockfunc_t *bast); diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c index f13a4bac41f0..681046d51393 100644 --- a/fs/ocfs2/dlm/dlmast.c +++ b/fs/ocfs2/dlm/dlmast.c @@ -320,8 +320,8 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data) res = dlm_lookup_lockres(dlm, name, locklen); if (!res) { - mlog(ML_ERROR, "got %sast for unknown lockres! " - "cookie=%u:%llu, name=%.*s, namelen=%u\n", + mlog(0, "got %sast for unknown lockres! " + "cookie=%u:%llu, name=%.*s, namelen=%u\n", past->type == DLM_AST ? "" : "b", dlm_get_lock_cookie_node(cookie), dlm_get_lock_cookie_seq(cookie), @@ -462,7 +462,7 @@ int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, mlog(ML_ERROR, "sent AST to node %u, it returned " "DLM_MIGRATING!\n", lock->ml.node); BUG(); - } else if (status != DLM_NORMAL) { + } else if (status != DLM_NORMAL && status != DLM_IVLOCKID) { mlog(ML_ERROR, "AST to node %u returned %d!\n", lock->ml.node, status); /* ignore it */ diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index 14530ee7e11d..fa968180b072 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -747,6 +747,7 @@ void dlm_change_lockres_owner(struct dlm_ctxt *dlm, u8 owner); struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm, const char *lockid, + int namelen, int flags); struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, const char *name, diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c index 033ad1701232..0368c6402182 100644 --- a/fs/ocfs2/dlm/dlmfs.c +++ b/fs/ocfs2/dlm/dlmfs.c @@ -335,7 +335,6 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb) inode->i_mode = mode; inode->i_uid = current->fsuid; inode->i_gid = current->fsgid; - inode->i_blksize = PAGE_CACHE_SIZE; inode->i_blocks = 0; inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; @@ -362,7 +361,6 @@ static struct inode *dlmfs_get_inode(struct inode *parent, inode->i_mode = mode; inode->i_uid = current->fsuid; inode->i_gid = current->fsgid; - inode->i_blksize = PAGE_CACHE_SIZE; inode->i_blocks = 0; inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; @@ -629,9 +627,7 @@ static void __exit exit_dlmfs_fs(void) flush_workqueue(user_dlm_worker); destroy_workqueue(user_dlm_worker); - if (kmem_cache_destroy(dlmfs_inode_cache)) - printk(KERN_INFO "dlmfs_inode_cache: not all structures " - "were freed\n"); + kmem_cache_destroy(dlmfs_inode_cache); } MODULE_AUTHOR("Oracle"); diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index 5ca57ec650c7..42a1b91979b5 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -540,8 +540,8 @@ static inline void dlm_get_next_cookie(u8 node_num, u64 *cookie) enum dlm_status dlmlock(struct dlm_ctxt *dlm, int mode, struct dlm_lockstatus *lksb, int flags, - const char *name, dlm_astlockfunc_t *ast, void *data, - dlm_bastlockfunc_t *bast) + const char *name, int namelen, dlm_astlockfunc_t *ast, + void *data, dlm_bastlockfunc_t *bast) { enum dlm_status status; struct dlm_lock_resource *res = NULL; @@ -571,7 +571,7 @@ enum dlm_status dlmlock(struct dlm_ctxt *dlm, int mode, recovery = (flags & LKM_RECOVERY); if (recovery && - (!dlm_is_recovery_lock(name, strlen(name)) || convert) ) { + (!dlm_is_recovery_lock(name, namelen) || convert) ) { dlm_error(status); goto error; } @@ -643,7 +643,7 @@ retry_convert: } status = DLM_IVBUFLEN; - if (strlen(name) > DLM_LOCKID_NAME_MAX || strlen(name) < 1) { + if (namelen > DLM_LOCKID_NAME_MAX || namelen < 1) { dlm_error(status); goto error; } @@ -659,7 +659,7 @@ retry_convert: dlm_wait_for_recovery(dlm); /* find or create the lock resource */ - res = dlm_get_lock_resource(dlm, name, flags); + res = dlm_get_lock_resource(dlm, name, namelen, flags); if (!res) { status = DLM_IVLOCKID; dlm_error(status); diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 9503240ef0e5..f784177b6241 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -740,6 +740,7 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, */ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm, const char *lockid, + int namelen, int flags) { struct dlm_lock_resource *tmpres=NULL, *res=NULL; @@ -748,13 +749,12 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm, int blocked = 0; int ret, nodenum; struct dlm_node_iter iter; - unsigned int namelen, hash; + unsigned int hash; int tries = 0; int bit, wait_on_recovery = 0; BUG_ON(!lockid); - namelen = strlen(lockid); hash = dlm_lockid_hash(lockid, namelen); mlog(0, "get lockres %s (len %d)\n", lockid, namelen); diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 594745fab0b5..9d950d7cea38 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -2285,7 +2285,8 @@ again: memset(&lksb, 0, sizeof(lksb)); ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY, - DLM_RECOVERY_LOCK_NAME, dlm_reco_ast, dlm, dlm_reco_bast); + DLM_RECOVERY_LOCK_NAME, DLM_RECOVERY_LOCK_NAME_LEN, + dlm_reco_ast, dlm, dlm_reco_bast); mlog(0, "%s: dlmlock($RECOVERY) returned %d, lksb=%d\n", dlm->name, ret, lksb.status); diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c index e641b084b343..eead48bbfac6 100644 --- a/fs/ocfs2/dlm/userdlm.c +++ b/fs/ocfs2/dlm/userdlm.c @@ -102,10 +102,10 @@ static inline void user_recover_from_dlm_error(struct user_lock_res *lockres) spin_unlock(&lockres->l_lock); } -#define user_log_dlm_error(_func, _stat, _lockres) do { \ - mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on " \ - "resource %s: %s\n", dlm_errname(_stat), _func, \ - _lockres->l_name, dlm_errmsg(_stat)); \ +#define user_log_dlm_error(_func, _stat, _lockres) do { \ + mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on " \ + "resource %.*s: %s\n", dlm_errname(_stat), _func, \ + _lockres->l_namelen, _lockres->l_name, dlm_errmsg(_stat)); \ } while (0) /* WARNING: This function lives in a world where the only three lock @@ -127,21 +127,22 @@ static void user_ast(void *opaque) struct user_lock_res *lockres = opaque; struct dlm_lockstatus *lksb; - mlog(0, "AST fired for lockres %s\n", lockres->l_name); + mlog(0, "AST fired for lockres %.*s\n", lockres->l_namelen, + lockres->l_name); spin_lock(&lockres->l_lock); lksb = &(lockres->l_lksb); if (lksb->status != DLM_NORMAL) { - mlog(ML_ERROR, "lksb status value of %u on lockres %s\n", - lksb->status, lockres->l_name); + mlog(ML_ERROR, "lksb status value of %u on lockres %.*s\n", + lksb->status, lockres->l_namelen, lockres->l_name); spin_unlock(&lockres->l_lock); return; } mlog_bug_on_msg(lockres->l_requested == LKM_IVMODE, - "Lockres %s, requested ivmode. flags 0x%x\n", - lockres->l_name, lockres->l_flags); + "Lockres %.*s, requested ivmode. flags 0x%x\n", + lockres->l_namelen, lockres->l_name, lockres->l_flags); /* we're downconverting. */ if (lockres->l_requested < lockres->l_level) { @@ -213,8 +214,8 @@ static void user_bast(void *opaque, int level) { struct user_lock_res *lockres = opaque; - mlog(0, "Blocking AST fired for lockres %s. Blocking level %d\n", - lockres->l_name, level); + mlog(0, "Blocking AST fired for lockres %.*s. Blocking level %d\n", + lockres->l_namelen, lockres->l_name, level); spin_lock(&lockres->l_lock); lockres->l_flags |= USER_LOCK_BLOCKED; @@ -231,7 +232,8 @@ static void user_unlock_ast(void *opaque, enum dlm_status status) { struct user_lock_res *lockres = opaque; - mlog(0, "UNLOCK AST called on lock %s\n", lockres->l_name); + mlog(0, "UNLOCK AST called on lock %.*s\n", lockres->l_namelen, + lockres->l_name); if (status != DLM_NORMAL && status != DLM_CANCELGRANT) mlog(ML_ERROR, "Dlm returns status %d\n", status); @@ -244,8 +246,6 @@ static void user_unlock_ast(void *opaque, enum dlm_status status) && !(lockres->l_flags & USER_LOCK_IN_CANCEL)) { lockres->l_level = LKM_IVMODE; } else if (status == DLM_CANCELGRANT) { - mlog(0, "Lock %s, cancel fails, flags 0x%x\n", - lockres->l_name, lockres->l_flags); /* We tried to cancel a convert request, but it was * already granted. Don't clear the busy flag - the * ast should've done this already. */ @@ -255,8 +255,6 @@ static void user_unlock_ast(void *opaque, enum dlm_status status) } else { BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL)); /* Cancel succeeded, we want to re-queue */ - mlog(0, "Lock %s, cancel succeeds, flags 0x%x\n", - lockres->l_name, lockres->l_flags); lockres->l_requested = LKM_IVMODE; /* cancel an * upconvert * request. */ @@ -287,13 +285,14 @@ static void user_dlm_unblock_lock(void *opaque) struct user_lock_res *lockres = (struct user_lock_res *) opaque; struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres); - mlog(0, "processing lockres %s\n", lockres->l_name); + mlog(0, "processing lockres %.*s\n", lockres->l_namelen, + lockres->l_name); spin_lock(&lockres->l_lock); mlog_bug_on_msg(!(lockres->l_flags & USER_LOCK_QUEUED), - "Lockres %s, flags 0x%x\n", - lockres->l_name, lockres->l_flags); + "Lockres %.*s, flags 0x%x\n", + lockres->l_namelen, lockres->l_name, lockres->l_flags); /* notice that we don't clear USER_LOCK_BLOCKED here. If it's * set, we want user_ast clear it. */ @@ -305,22 +304,16 @@ static void user_dlm_unblock_lock(void *opaque) * flag, and finally we might get another bast which re-queues * us before our ast for the downconvert is called. */ if (!(lockres->l_flags & USER_LOCK_BLOCKED)) { - mlog(0, "Lockres %s, flags 0x%x: queued but not blocking\n", - lockres->l_name, lockres->l_flags); spin_unlock(&lockres->l_lock); goto drop_ref; } if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { - mlog(0, "lock is in teardown so we do nothing\n"); spin_unlock(&lockres->l_lock); goto drop_ref; } if (lockres->l_flags & USER_LOCK_BUSY) { - mlog(0, "Cancel lock %s, flags 0x%x\n", - lockres->l_name, lockres->l_flags); - if (lockres->l_flags & USER_LOCK_IN_CANCEL) { spin_unlock(&lockres->l_lock); goto drop_ref; @@ -372,6 +365,7 @@ static void user_dlm_unblock_lock(void *opaque) &lockres->l_lksb, LKM_CONVERT|LKM_VALBLK, lockres->l_name, + lockres->l_namelen, user_ast, lockres, user_bast); @@ -420,16 +414,16 @@ int user_dlm_cluster_lock(struct user_lock_res *lockres, if (level != LKM_EXMODE && level != LKM_PRMODE) { - mlog(ML_ERROR, "lockres %s: invalid request!\n", - lockres->l_name); + mlog(ML_ERROR, "lockres %.*s: invalid request!\n", + lockres->l_namelen, lockres->l_name); status = -EINVAL; goto bail; } - mlog(0, "lockres %s: asking for %s lock, passed flags = 0x%x\n", - lockres->l_name, - (level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE", - lkm_flags); + mlog(0, "lockres %.*s: asking for %s lock, passed flags = 0x%x\n", + lockres->l_namelen, lockres->l_name, + (level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE", + lkm_flags); again: if (signal_pending(current)) { @@ -474,15 +468,13 @@ again: BUG_ON(level == LKM_IVMODE); BUG_ON(level == LKM_NLMODE); - mlog(0, "lock %s, get lock from %d to level = %d\n", - lockres->l_name, lockres->l_level, level); - /* call dlm_lock to upgrade lock now */ status = dlmlock(dlm, level, &lockres->l_lksb, local_flags, lockres->l_name, + lockres->l_namelen, user_ast, lockres, user_bast); @@ -498,9 +490,6 @@ again: goto bail; } - mlog(0, "lock %s, successfull return from dlmlock\n", - lockres->l_name); - user_wait_on_busy_lock(lockres); goto again; } @@ -508,9 +497,6 @@ again: user_dlm_inc_holders(lockres, level); spin_unlock(&lockres->l_lock); - mlog(0, "lockres %s: Got %s lock!\n", lockres->l_name, - (level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE"); - status = 0; bail: return status; @@ -538,13 +524,11 @@ void user_dlm_cluster_unlock(struct user_lock_res *lockres, { if (level != LKM_EXMODE && level != LKM_PRMODE) { - mlog(ML_ERROR, "lockres %s: invalid request!\n", lockres->l_name); + mlog(ML_ERROR, "lockres %.*s: invalid request!\n", + lockres->l_namelen, lockres->l_name); return; } - mlog(0, "lockres %s: dropping %s lock\n", lockres->l_name, - (level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE"); - spin_lock(&lockres->l_lock); user_dlm_dec_holders(lockres, level); __user_dlm_cond_queue_lockres(lockres); @@ -602,6 +586,7 @@ void user_dlm_lock_res_init(struct user_lock_res *lockres, memcpy(lockres->l_name, dentry->d_name.name, dentry->d_name.len); + lockres->l_namelen = dentry->d_name.len; } int user_dlm_destroy_lock(struct user_lock_res *lockres) @@ -609,11 +594,10 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres) int status = -EBUSY; struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres); - mlog(0, "asked to destroy %s\n", lockres->l_name); + mlog(0, "asked to destroy %.*s\n", lockres->l_namelen, lockres->l_name); spin_lock(&lockres->l_lock); if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { - mlog(0, "Lock is already torn down\n"); spin_unlock(&lockres->l_lock); return 0; } @@ -623,8 +607,6 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres) while (lockres->l_flags & USER_LOCK_BUSY) { spin_unlock(&lockres->l_lock); - mlog(0, "lock %s is busy\n", lockres->l_name); - user_wait_on_busy_lock(lockres); spin_lock(&lockres->l_lock); @@ -632,14 +614,12 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres) if (lockres->l_ro_holders || lockres->l_ex_holders) { spin_unlock(&lockres->l_lock); - mlog(0, "lock %s has holders\n", lockres->l_name); goto bail; } status = 0; if (!(lockres->l_flags & USER_LOCK_ATTACHED)) { spin_unlock(&lockres->l_lock); - mlog(0, "lock %s is not attached\n", lockres->l_name); goto bail; } @@ -647,7 +627,6 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres) lockres->l_flags |= USER_LOCK_BUSY; spin_unlock(&lockres->l_lock); - mlog(0, "unlocking lockres %s\n", lockres->l_name); status = dlmunlock(dlm, &lockres->l_lksb, LKM_VALBLK, diff --git a/fs/ocfs2/dlm/userdlm.h b/fs/ocfs2/dlm/userdlm.h index 04178bc40b76..c400e93bbf79 100644 --- a/fs/ocfs2/dlm/userdlm.h +++ b/fs/ocfs2/dlm/userdlm.h @@ -53,6 +53,7 @@ struct user_lock_res { #define USER_DLM_LOCK_ID_MAX_LEN 32 char l_name[USER_DLM_LOCK_ID_MAX_LEN]; + int l_namelen; int l_level; unsigned int l_ro_holders; unsigned int l_ex_holders; diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 151b41781eab..8801e41afe80 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -46,6 +46,7 @@ #include "ocfs2.h" #include "alloc.h" +#include "dcache.h" #include "dlmglue.h" #include "extent_map.h" #include "heartbeat.h" @@ -66,78 +67,161 @@ struct ocfs2_mask_waiter { unsigned long mw_goal; }; -static void ocfs2_inode_ast_func(void *opaque); -static void ocfs2_inode_bast_func(void *opaque, - int level); -static void ocfs2_super_ast_func(void *opaque); -static void ocfs2_super_bast_func(void *opaque, - int level); -static void ocfs2_rename_ast_func(void *opaque); -static void ocfs2_rename_bast_func(void *opaque, - int level); - -/* so far, all locks have gotten along with the same unlock ast */ -static void ocfs2_unlock_ast_func(void *opaque, - enum dlm_status status); -static int ocfs2_do_unblock_meta(struct inode *inode, - int *requeue); -static int ocfs2_unblock_meta(struct ocfs2_lock_res *lockres, - int *requeue); -static int ocfs2_unblock_data(struct ocfs2_lock_res *lockres, - int *requeue); -static int ocfs2_unblock_inode_lock(struct ocfs2_lock_res *lockres, - int *requeue); -static int ocfs2_unblock_osb_lock(struct ocfs2_lock_res *lockres, - int *requeue); -typedef void (ocfs2_convert_worker_t)(struct ocfs2_lock_res *, int); -static int ocfs2_generic_unblock_lock(struct ocfs2_super *osb, - struct ocfs2_lock_res *lockres, - int *requeue, - ocfs2_convert_worker_t *worker); +static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres); +static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres); +/* + * Return value from ->downconvert_worker functions. + * + * These control the precise actions of ocfs2_unblock_lock() + * and ocfs2_process_blocked_lock() + * + */ +enum ocfs2_unblock_action { + UNBLOCK_CONTINUE = 0, /* Continue downconvert */ + UNBLOCK_CONTINUE_POST = 1, /* Continue downconvert, fire + * ->post_unlock callback */ + UNBLOCK_STOP_POST = 2, /* Do not downconvert, fire + * ->post_unlock() callback. */ +}; + +struct ocfs2_unblock_ctl { + int requeue; + enum ocfs2_unblock_action unblock_action; +}; + +static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres, + int new_level); +static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres); + +static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres, + int blocking); + +static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres, + int blocking); + +static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres); + +/* + * OCFS2 Lock Resource Operations + * + * These fine tune the behavior of the generic dlmglue locking infrastructure. + * + * The most basic of lock types can point ->l_priv to their respective + * struct ocfs2_super and allow the default actions to manage things. + * + * Right now, each lock type also needs to implement an init function, + * and trivial lock/unlock wrappers. ocfs2_simple_drop_lockres() + * should be called when the lock is no longer needed (i.e., object + * destruction time). + */ struct ocfs2_lock_res_ops { - void (*ast)(void *); - void (*bast)(void *, int); - void (*unlock_ast)(void *, enum dlm_status); - int (*unblock)(struct ocfs2_lock_res *, int *); + /* + * Translate an ocfs2_lock_res * into an ocfs2_super *. Define + * this callback if ->l_priv is not an ocfs2_super pointer + */ + struct ocfs2_super * (*get_osb)(struct ocfs2_lock_res *); + + /* + * Optionally called in the downconvert (or "vote") thread + * after a successful downconvert. The lockres will not be + * referenced after this callback is called, so it is safe to + * free memory, etc. + * + * The exact semantics of when this is called are controlled + * by ->downconvert_worker() + */ + void (*post_unlock)(struct ocfs2_super *, struct ocfs2_lock_res *); + + /* + * Allow a lock type to add checks to determine whether it is + * safe to downconvert a lock. Return 0 to re-queue the + * downconvert at a later time, nonzero to continue. + * + * For most locks, the default checks that there are no + * incompatible holders are sufficient. + * + * Called with the lockres spinlock held. + */ + int (*check_downconvert)(struct ocfs2_lock_res *, int); + + /* + * Allows a lock type to populate the lock value block. This + * is called on downconvert, and when we drop a lock. + * + * Locks that want to use this should set LOCK_TYPE_USES_LVB + * in the flags field. + * + * Called with the lockres spinlock held. + */ + void (*set_lvb)(struct ocfs2_lock_res *); + + /* + * Called from the downconvert thread when it is determined + * that a lock will be downconverted. This is called without + * any locks held so the function can do work that might + * schedule (syncing out data, etc). + * + * This should return any one of the ocfs2_unblock_action + * values, depending on what it wants the thread to do. + */ + int (*downconvert_worker)(struct ocfs2_lock_res *, int); + + /* + * LOCK_TYPE_* flags which describe the specific requirements + * of a lock type. Descriptions of each individual flag follow. + */ + int flags; }; +/* + * Some locks want to "refresh" potentially stale data when a + * meaningful (PRMODE or EXMODE) lock level is first obtained. If this + * flag is set, the OCFS2_LOCK_NEEDS_REFRESH flag will be set on the + * individual lockres l_flags member from the ast function. It is + * expected that the locking wrapper will clear the + * OCFS2_LOCK_NEEDS_REFRESH flag when done. + */ +#define LOCK_TYPE_REQUIRES_REFRESH 0x1 + +/* + * Indicate that a lock type makes use of the lock value block. The + * ->set_lvb lock type callback must be defined. + */ +#define LOCK_TYPE_USES_LVB 0x2 + static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = { - .ast = ocfs2_inode_ast_func, - .bast = ocfs2_inode_bast_func, - .unlock_ast = ocfs2_unlock_ast_func, - .unblock = ocfs2_unblock_inode_lock, + .get_osb = ocfs2_get_inode_osb, + .flags = 0, }; static struct ocfs2_lock_res_ops ocfs2_inode_meta_lops = { - .ast = ocfs2_inode_ast_func, - .bast = ocfs2_inode_bast_func, - .unlock_ast = ocfs2_unlock_ast_func, - .unblock = ocfs2_unblock_meta, + .get_osb = ocfs2_get_inode_osb, + .check_downconvert = ocfs2_check_meta_downconvert, + .set_lvb = ocfs2_set_meta_lvb, + .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB, }; -static void ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres, - int blocking); - static struct ocfs2_lock_res_ops ocfs2_inode_data_lops = { - .ast = ocfs2_inode_ast_func, - .bast = ocfs2_inode_bast_func, - .unlock_ast = ocfs2_unlock_ast_func, - .unblock = ocfs2_unblock_data, + .get_osb = ocfs2_get_inode_osb, + .downconvert_worker = ocfs2_data_convert_worker, + .flags = 0, }; static struct ocfs2_lock_res_ops ocfs2_super_lops = { - .ast = ocfs2_super_ast_func, - .bast = ocfs2_super_bast_func, - .unlock_ast = ocfs2_unlock_ast_func, - .unblock = ocfs2_unblock_osb_lock, + .flags = LOCK_TYPE_REQUIRES_REFRESH, }; static struct ocfs2_lock_res_ops ocfs2_rename_lops = { - .ast = ocfs2_rename_ast_func, - .bast = ocfs2_rename_bast_func, - .unlock_ast = ocfs2_unlock_ast_func, - .unblock = ocfs2_unblock_osb_lock, + .flags = 0, +}; + +static struct ocfs2_lock_res_ops ocfs2_dentry_lops = { + .get_osb = ocfs2_get_dentry_osb, + .post_unlock = ocfs2_dentry_post_unlock, + .downconvert_worker = ocfs2_dentry_convert_worker, + .flags = 0, }; static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) @@ -147,29 +231,26 @@ static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) lockres->l_type == OCFS2_LOCK_TYPE_RW; } -static inline int ocfs2_is_super_lock(struct ocfs2_lock_res *lockres) +static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres) { - return lockres->l_type == OCFS2_LOCK_TYPE_SUPER; -} + BUG_ON(!ocfs2_is_inode_lock(lockres)); -static inline int ocfs2_is_rename_lock(struct ocfs2_lock_res *lockres) -{ - return lockres->l_type == OCFS2_LOCK_TYPE_RENAME; + return (struct inode *) lockres->l_priv; } -static inline struct ocfs2_super *ocfs2_lock_res_super(struct ocfs2_lock_res *lockres) +static inline struct ocfs2_dentry_lock *ocfs2_lock_res_dl(struct ocfs2_lock_res *lockres) { - BUG_ON(!ocfs2_is_super_lock(lockres) - && !ocfs2_is_rename_lock(lockres)); + BUG_ON(lockres->l_type != OCFS2_LOCK_TYPE_DENTRY); - return (struct ocfs2_super *) lockres->l_priv; + return (struct ocfs2_dentry_lock *)lockres->l_priv; } -static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres) +static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *lockres) { - BUG_ON(!ocfs2_is_inode_lock(lockres)); + if (lockres->l_ops->get_osb) + return lockres->l_ops->get_osb(lockres); - return (struct inode *) lockres->l_priv; + return (struct ocfs2_super *)lockres->l_priv; } static int ocfs2_lock_create(struct ocfs2_super *osb, @@ -200,25 +281,6 @@ static int ocfs2_meta_lock_update(struct inode *inode, struct buffer_head **bh); static void ocfs2_drop_osb_locks(struct ocfs2_super *osb); static inline int ocfs2_highest_compat_lock_level(int level); -static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode, - struct ocfs2_lock_res *lockres, - int new_level); - -static char *ocfs2_lock_type_strings[] = { - [OCFS2_LOCK_TYPE_META] = "Meta", - [OCFS2_LOCK_TYPE_DATA] = "Data", - [OCFS2_LOCK_TYPE_SUPER] = "Super", - [OCFS2_LOCK_TYPE_RENAME] = "Rename", - /* Need to differntiate from [R]ename.. serializing writes is the - * important job it does, anyway. */ - [OCFS2_LOCK_TYPE_RW] = "Write/Read", -}; - -static char *ocfs2_lock_type_string(enum ocfs2_lock_type type) -{ - mlog_bug_on_msg(type >= OCFS2_NUM_LOCK_TYPES, "%d\n", type); - return ocfs2_lock_type_strings[type]; -} static void ocfs2_build_lock_name(enum ocfs2_lock_type type, u64 blkno, @@ -265,13 +327,9 @@ static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res) static void ocfs2_lock_res_init_common(struct ocfs2_super *osb, struct ocfs2_lock_res *res, enum ocfs2_lock_type type, - u64 blkno, - u32 generation, struct ocfs2_lock_res_ops *ops, void *priv) { - ocfs2_build_lock_name(type, blkno, generation, res->l_name); - res->l_type = type; res->l_ops = ops; res->l_priv = priv; @@ -299,6 +357,7 @@ void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res) void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, enum ocfs2_lock_type type, + unsigned int generation, struct inode *inode) { struct ocfs2_lock_res_ops *ops; @@ -319,9 +378,73 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, break; }; - ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), res, type, - OCFS2_I(inode)->ip_blkno, - inode->i_generation, ops, inode); + ocfs2_build_lock_name(type, OCFS2_I(inode)->ip_blkno, + generation, res->l_name); + ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), res, type, ops, inode); +} + +static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres) +{ + struct inode *inode = ocfs2_lock_res_inode(lockres); + + return OCFS2_SB(inode->i_sb); +} + +static __u64 ocfs2_get_dentry_lock_ino(struct ocfs2_lock_res *lockres) +{ + __be64 inode_blkno_be; + + memcpy(&inode_blkno_be, &lockres->l_name[OCFS2_DENTRY_LOCK_INO_START], + sizeof(__be64)); + + return be64_to_cpu(inode_blkno_be); +} + +static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres) +{ + struct ocfs2_dentry_lock *dl = lockres->l_priv; + + return OCFS2_SB(dl->dl_inode->i_sb); +} + +void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl, + u64 parent, struct inode *inode) +{ + int len; + u64 inode_blkno = OCFS2_I(inode)->ip_blkno; + __be64 inode_blkno_be = cpu_to_be64(inode_blkno); + struct ocfs2_lock_res *lockres = &dl->dl_lockres; + + ocfs2_lock_res_init_once(lockres); + + /* + * Unfortunately, the standard lock naming scheme won't work + * here because we have two 16 byte values to use. Instead, + * we'll stuff the inode number as a binary value. We still + * want error prints to show something without garbling the + * display, so drop a null byte in there before the inode + * number. A future version of OCFS2 will likely use all + * binary lock names. The stringified names have been a + * tremendous aid in debugging, but now that the debugfs + * interface exists, we can mangle things there if need be. + * + * NOTE: We also drop the standard "pad" value (the total lock + * name size stays the same though - the last part is all + * zeros due to the memset in ocfs2_lock_res_init_once() + */ + len = snprintf(lockres->l_name, OCFS2_DENTRY_LOCK_INO_START, + "%c%016llx", + ocfs2_lock_type_char(OCFS2_LOCK_TYPE_DENTRY), + (long long)parent); + + BUG_ON(len != (OCFS2_DENTRY_LOCK_INO_START - 1)); + + memcpy(&lockres->l_name[OCFS2_DENTRY_LOCK_INO_START], &inode_blkno_be, + sizeof(__be64)); + + ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), lockres, + OCFS2_LOCK_TYPE_DENTRY, &ocfs2_dentry_lops, + dl); } static void ocfs2_super_lock_res_init(struct ocfs2_lock_res *res, @@ -330,8 +453,9 @@ static void ocfs2_super_lock_res_init(struct ocfs2_lock_res *res, /* Superblock lockres doesn't come from a slab so we call init * once on it manually. */ ocfs2_lock_res_init_once(res); + ocfs2_build_lock_name(OCFS2_LOCK_TYPE_SUPER, OCFS2_SUPER_BLOCK_BLKNO, + 0, res->l_name); ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_SUPER, - OCFS2_SUPER_BLOCK_BLKNO, 0, &ocfs2_super_lops, osb); } @@ -341,7 +465,8 @@ static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res, /* Rename lockres doesn't come from a slab so we call init * once on it manually. */ ocfs2_lock_res_init_once(res); - ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_RENAME, 0, 0, + ocfs2_build_lock_name(OCFS2_LOCK_TYPE_RENAME, 0, 0, res->l_name); + ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_RENAME, &ocfs2_rename_lops, osb); } @@ -495,7 +620,8 @@ static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lo * information is already up to data. Convert from NL to * *anything* however should mark ourselves as needing an * update */ - if (lockres->l_level == LKM_NLMODE) + if (lockres->l_level == LKM_NLMODE && + lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH) lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); lockres->l_level = lockres->l_requested; @@ -512,7 +638,8 @@ static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *loc BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED); if (lockres->l_requested > LKM_NLMODE && - !(lockres->l_flags & OCFS2_LOCK_LOCAL)) + !(lockres->l_flags & OCFS2_LOCK_LOCAL) && + lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH) lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); lockres->l_level = lockres->l_requested; @@ -522,68 +649,6 @@ static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *loc mlog_exit_void(); } -static void ocfs2_inode_ast_func(void *opaque) -{ - struct ocfs2_lock_res *lockres = opaque; - struct inode *inode; - struct dlm_lockstatus *lksb; - unsigned long flags; - - mlog_entry_void(); - - inode = ocfs2_lock_res_inode(lockres); - - mlog(0, "AST fired for inode %llu, l_action = %u, type = %s\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, lockres->l_action, - ocfs2_lock_type_string(lockres->l_type)); - - BUG_ON(!ocfs2_is_inode_lock(lockres)); - - spin_lock_irqsave(&lockres->l_lock, flags); - - lksb = &(lockres->l_lksb); - if (lksb->status != DLM_NORMAL) { - mlog(ML_ERROR, "ocfs2_inode_ast_func: lksb status value of %u " - "on inode %llu\n", lksb->status, - (unsigned long long)OCFS2_I(inode)->ip_blkno); - spin_unlock_irqrestore(&lockres->l_lock, flags); - mlog_exit_void(); - return; - } - - switch(lockres->l_action) { - case OCFS2_AST_ATTACH: - ocfs2_generic_handle_attach_action(lockres); - lockres_clear_flags(lockres, OCFS2_LOCK_LOCAL); - break; - case OCFS2_AST_CONVERT: - ocfs2_generic_handle_convert_action(lockres); - break; - case OCFS2_AST_DOWNCONVERT: - ocfs2_generic_handle_downconvert_action(lockres); - break; - default: - mlog(ML_ERROR, "lockres %s: ast fired with invalid action: %u " - "lockres flags = 0x%lx, unlock action: %u\n", - lockres->l_name, lockres->l_action, lockres->l_flags, - lockres->l_unlock_action); - - BUG(); - } - - /* data and rw locking ignores refresh flag for now. */ - if (lockres->l_type != OCFS2_LOCK_TYPE_META) - lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); - - /* set it to something invalid so if we get called again we - * can catch it. */ - lockres->l_action = OCFS2_AST_INVALID; - spin_unlock_irqrestore(&lockres->l_lock, flags); - wake_up(&lockres->l_event); - - mlog_exit_void(); -} - static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, int level) { @@ -610,54 +675,33 @@ static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, return needs_downconvert; } -static void ocfs2_generic_bast_func(struct ocfs2_super *osb, - struct ocfs2_lock_res *lockres, - int level) +static void ocfs2_blocking_ast(void *opaque, int level) { + struct ocfs2_lock_res *lockres = opaque; + struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres); int needs_downconvert; unsigned long flags; - mlog_entry_void(); - BUG_ON(level <= LKM_NLMODE); + mlog(0, "BAST fired for lockres %s, blocking %d, level %d type %s\n", + lockres->l_name, level, lockres->l_level, + ocfs2_lock_type_string(lockres->l_type)); + spin_lock_irqsave(&lockres->l_lock, flags); needs_downconvert = ocfs2_generic_handle_bast(lockres, level); if (needs_downconvert) ocfs2_schedule_blocked_lock(osb, lockres); spin_unlock_irqrestore(&lockres->l_lock, flags); - ocfs2_kick_vote_thread(osb); - wake_up(&lockres->l_event); - mlog_exit_void(); -} - -static void ocfs2_inode_bast_func(void *opaque, int level) -{ - struct ocfs2_lock_res *lockres = opaque; - struct inode *inode; - struct ocfs2_super *osb; - mlog_entry_void(); - - BUG_ON(!ocfs2_is_inode_lock(lockres)); - - inode = ocfs2_lock_res_inode(lockres); - osb = OCFS2_SB(inode->i_sb); - - mlog(0, "BAST fired for inode %llu, blocking %d, level %d type %s\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, level, - lockres->l_level, ocfs2_lock_type_string(lockres->l_type)); - - ocfs2_generic_bast_func(osb, lockres, level); - - mlog_exit_void(); + ocfs2_kick_vote_thread(osb); } -static void ocfs2_generic_ast_func(struct ocfs2_lock_res *lockres, - int ignore_refresh) +static void ocfs2_locking_ast(void *opaque) { + struct ocfs2_lock_res *lockres = opaque; struct dlm_lockstatus *lksb = &lockres->l_lksb; unsigned long flags; @@ -673,6 +717,7 @@ static void ocfs2_generic_ast_func(struct ocfs2_lock_res *lockres, switch(lockres->l_action) { case OCFS2_AST_ATTACH: ocfs2_generic_handle_attach_action(lockres); + lockres_clear_flags(lockres, OCFS2_LOCK_LOCAL); break; case OCFS2_AST_CONVERT: ocfs2_generic_handle_convert_action(lockres); @@ -681,80 +726,19 @@ static void ocfs2_generic_ast_func(struct ocfs2_lock_res *lockres, ocfs2_generic_handle_downconvert_action(lockres); break; default: + mlog(ML_ERROR, "lockres %s: ast fired with invalid action: %u " + "lockres flags = 0x%lx, unlock action: %u\n", + lockres->l_name, lockres->l_action, lockres->l_flags, + lockres->l_unlock_action); BUG(); } - if (ignore_refresh) - lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); - /* set it to something invalid so if we get called again we * can catch it. */ lockres->l_action = OCFS2_AST_INVALID; - spin_unlock_irqrestore(&lockres->l_lock, flags); wake_up(&lockres->l_event); -} - -static void ocfs2_super_ast_func(void *opaque) -{ - struct ocfs2_lock_res *lockres = opaque; - - mlog_entry_void(); - mlog(0, "Superblock AST fired\n"); - - BUG_ON(!ocfs2_is_super_lock(lockres)); - ocfs2_generic_ast_func(lockres, 0); - - mlog_exit_void(); -} - -static void ocfs2_super_bast_func(void *opaque, - int level) -{ - struct ocfs2_lock_res *lockres = opaque; - struct ocfs2_super *osb; - - mlog_entry_void(); - mlog(0, "Superblock BAST fired\n"); - - BUG_ON(!ocfs2_is_super_lock(lockres)); - osb = ocfs2_lock_res_super(lockres); - ocfs2_generic_bast_func(osb, lockres, level); - - mlog_exit_void(); -} - -static void ocfs2_rename_ast_func(void *opaque) -{ - struct ocfs2_lock_res *lockres = opaque; - - mlog_entry_void(); - - mlog(0, "Rename AST fired\n"); - - BUG_ON(!ocfs2_is_rename_lock(lockres)); - - ocfs2_generic_ast_func(lockres, 1); - - mlog_exit_void(); -} - -static void ocfs2_rename_bast_func(void *opaque, - int level) -{ - struct ocfs2_lock_res *lockres = opaque; - struct ocfs2_super *osb; - - mlog_entry_void(); - - mlog(0, "Rename BAST fired\n"); - - BUG_ON(!ocfs2_is_rename_lock(lockres)); - - osb = ocfs2_lock_res_super(lockres); - ocfs2_generic_bast_func(osb, lockres, level); - - mlog_exit_void(); + spin_unlock_irqrestore(&lockres->l_lock, flags); } static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, @@ -810,9 +794,10 @@ static int ocfs2_lock_create(struct ocfs2_super *osb, &lockres->l_lksb, dlm_flags, lockres->l_name, - lockres->l_ops->ast, + OCFS2_LOCK_ID_MAX_LEN - 1, + ocfs2_locking_ast, lockres, - lockres->l_ops->bast); + ocfs2_blocking_ast); if (status != DLM_NORMAL) { ocfs2_log_dlm_error("dlmlock", status, lockres); ret = -EINVAL; @@ -930,6 +915,9 @@ static int ocfs2_cluster_lock(struct ocfs2_super *osb, ocfs2_init_mask_waiter(&mw); + if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) + lkm_flags |= LKM_VALBLK; + again: wait = 0; @@ -997,11 +985,12 @@ again: status = dlmlock(osb->dlm, level, &lockres->l_lksb, - lkm_flags|LKM_CONVERT|LKM_VALBLK, + lkm_flags|LKM_CONVERT, lockres->l_name, - lockres->l_ops->ast, + OCFS2_LOCK_ID_MAX_LEN - 1, + ocfs2_locking_ast, lockres, - lockres->l_ops->bast); + ocfs2_blocking_ast); if (status != DLM_NORMAL) { if ((lkm_flags & LKM_NOQUEUE) && (status == DLM_NOTQUEUED)) @@ -1074,18 +1063,21 @@ static void ocfs2_cluster_unlock(struct ocfs2_super *osb, mlog_exit_void(); } -static int ocfs2_create_new_inode_lock(struct inode *inode, - struct ocfs2_lock_res *lockres) +int ocfs2_create_new_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int ex, + int local) { - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + int level = ex ? LKM_EXMODE : LKM_PRMODE; unsigned long flags; + int lkm_flags = local ? LKM_LOCAL : 0; spin_lock_irqsave(&lockres->l_lock, flags); BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED); lockres_or_flags(lockres, OCFS2_LOCK_LOCAL); spin_unlock_irqrestore(&lockres->l_lock, flags); - return ocfs2_lock_create(osb, lockres, LKM_EXMODE, LKM_LOCAL); + return ocfs2_lock_create(osb, lockres, level, lkm_flags); } /* Grants us an EX lock on the data and metadata resources, skipping @@ -1097,6 +1089,7 @@ static int ocfs2_create_new_inode_lock(struct inode *inode, int ocfs2_create_new_inode_locks(struct inode *inode) { int ret; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); BUG_ON(!inode); BUG_ON(!ocfs2_inode_is_new(inode)); @@ -1113,22 +1106,23 @@ int ocfs2_create_new_inode_locks(struct inode *inode) * on a resource which has an invalid one -- we'll set it * valid when we release the EX. */ - ret = ocfs2_create_new_inode_lock(inode, - &OCFS2_I(inode)->ip_rw_lockres); + ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_rw_lockres, 1, 1); if (ret) { mlog_errno(ret); goto bail; } - ret = ocfs2_create_new_inode_lock(inode, - &OCFS2_I(inode)->ip_meta_lockres); + /* + * We don't want to use LKM_LOCAL on a meta data lock as they + * don't use a generation in their lock names. + */ + ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_meta_lockres, 1, 0); if (ret) { mlog_errno(ret); goto bail; } - ret = ocfs2_create_new_inode_lock(inode, - &OCFS2_I(inode)->ip_data_lockres); + ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_data_lockres, 1, 1); if (ret) { mlog_errno(ret); goto bail; @@ -1317,7 +1311,17 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode) lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; - lvb->lvb_version = cpu_to_be32(OCFS2_LVB_VERSION); + /* + * Invalidate the LVB of a deleted inode - this way other + * nodes are forced to go to disk and discover the new inode + * status. + */ + if (oi->ip_flags & OCFS2_INODE_DELETED) { + lvb->lvb_version = 0; + goto out; + } + + lvb->lvb_version = OCFS2_LVB_VERSION; lvb->lvb_isize = cpu_to_be64(i_size_read(inode)); lvb->lvb_iclusters = cpu_to_be32(oi->ip_clusters); lvb->lvb_iuid = cpu_to_be32(inode->i_uid); @@ -1331,7 +1335,9 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode) lvb->lvb_imtime_packed = cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime)); lvb->lvb_iattr = cpu_to_be32(oi->ip_attr); + lvb->lvb_igeneration = cpu_to_be32(inode->i_generation); +out: mlog_meta_lvb(0, lockres); mlog_exit_void(); @@ -1386,11 +1392,13 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode) mlog_exit_void(); } -static inline int ocfs2_meta_lvb_is_trustable(struct ocfs2_lock_res *lockres) +static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode, + struct ocfs2_lock_res *lockres) { struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; - if (be32_to_cpu(lvb->lvb_version) == OCFS2_LVB_VERSION) + if (lvb->lvb_version == OCFS2_LVB_VERSION + && be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation) return 1; return 0; } @@ -1487,7 +1495,7 @@ static int ocfs2_meta_lock_update(struct inode *inode, * map (directories, bitmap files, etc) */ ocfs2_extent_map_trunc(inode, 0); - if (ocfs2_meta_lvb_is_trustable(lockres)) { + if (ocfs2_meta_lvb_is_trustable(inode, lockres)) { mlog(0, "Trusting LVB on inode %llu\n", (unsigned long long)oi->ip_blkno); ocfs2_refresh_inode_from_lvb(inode); @@ -1628,6 +1636,18 @@ int ocfs2_meta_lock_full(struct inode *inode, wait_event(osb->recovery_event, ocfs2_node_map_is_empty(osb, &osb->recovery_map)); + /* + * We only see this flag if we're being called from + * ocfs2_read_locked_inode(). It means we're locking an inode + * which hasn't been populated yet, so clear the refresh flag + * and let the caller handle it. + */ + if (inode->i_state & I_NEW) { + status = 0; + ocfs2_complete_lock_res_refresh(lockres, 0); + goto bail; + } + /* This is fun. The caller may want a bh back, or it may * not. ocfs2_meta_lock_update definitely wants one in, but * may or may not read one, depending on what's in the @@ -1807,6 +1827,34 @@ void ocfs2_rename_unlock(struct ocfs2_super *osb) ocfs2_cluster_unlock(osb, lockres, LKM_EXMODE); } +int ocfs2_dentry_lock(struct dentry *dentry, int ex) +{ + int ret; + int level = ex ? LKM_EXMODE : LKM_PRMODE; + struct ocfs2_dentry_lock *dl = dentry->d_fsdata; + struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); + + BUG_ON(!dl); + + if (ocfs2_is_hard_readonly(osb)) + return -EROFS; + + ret = ocfs2_cluster_lock(osb, &dl->dl_lockres, level, 0, 0); + if (ret < 0) + mlog_errno(ret); + + return ret; +} + +void ocfs2_dentry_unlock(struct dentry *dentry, int ex) +{ + int level = ex ? LKM_EXMODE : LKM_PRMODE; + struct ocfs2_dentry_lock *dl = dentry->d_fsdata; + struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); + + ocfs2_cluster_unlock(osb, &dl->dl_lockres, level); +} + /* Reference counting of the dlm debug structure. We want this because * open references on the debug inodes can live on after a mount, so * we can't rely on the ocfs2_super to always exist. */ @@ -1937,9 +1985,16 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) if (!lockres) return -EINVAL; - seq_printf(m, "0x%x\t" - "%.*s\t" - "%d\t" + seq_printf(m, "0x%x\t", OCFS2_DLM_DEBUG_STR_VERSION); + + if (lockres->l_type == OCFS2_LOCK_TYPE_DENTRY) + seq_printf(m, "%.*s%08x\t", OCFS2_DENTRY_LOCK_INO_START - 1, + lockres->l_name, + (unsigned int)ocfs2_get_dentry_lock_ino(lockres)); + else + seq_printf(m, "%.*s\t", OCFS2_LOCK_ID_MAX_LEN, lockres->l_name); + + seq_printf(m, "%d\t" "0x%lx\t" "0x%x\t" "0x%x\t" @@ -1947,8 +2002,6 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) "%u\t" "%d\t" "%d\t", - OCFS2_DLM_DEBUG_STR_VERSION, - OCFS2_LOCK_ID_MAX_LEN, lockres->l_name, lockres->l_level, lockres->l_flags, lockres->l_action, @@ -1999,7 +2052,7 @@ static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file) mlog_errno(ret); goto out; } - osb = (struct ocfs2_super *) inode->u.generic_ip; + osb = inode->i_private; ocfs2_get_dlm_debug(osb->osb_dlm_debug); priv->p_dlm_debug = osb->osb_dlm_debug; INIT_LIST_HEAD(&priv->p_iter_res.l_debug_list); @@ -2138,7 +2191,7 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb) mlog_exit_void(); } -static void ocfs2_unlock_ast_func(void *opaque, enum dlm_status status) +static void ocfs2_unlock_ast(void *opaque, enum dlm_status status) { struct ocfs2_lock_res *lockres = opaque; unsigned long flags; @@ -2194,24 +2247,20 @@ complete_unlock: mlog_exit_void(); } -typedef void (ocfs2_pre_drop_cb_t)(struct ocfs2_lock_res *, void *); - -struct drop_lock_cb { - ocfs2_pre_drop_cb_t *drop_func; - void *drop_data; -}; - static int ocfs2_drop_lock(struct ocfs2_super *osb, - struct ocfs2_lock_res *lockres, - struct drop_lock_cb *dcb) + struct ocfs2_lock_res *lockres) { enum dlm_status status; unsigned long flags; + int lkm_flags = 0; /* We didn't get anywhere near actually using this lockres. */ if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED)) goto out; + if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) + lkm_flags |= LKM_VALBLK; + spin_lock_irqsave(&lockres->l_lock, flags); mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_FREEING), @@ -2234,8 +2283,12 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb, spin_lock_irqsave(&lockres->l_lock, flags); } - if (dcb) - dcb->drop_func(lockres, dcb->drop_data); + if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) { + if (lockres->l_flags & OCFS2_LOCK_ATTACHED && + lockres->l_level == LKM_EXMODE && + !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) + lockres->l_ops->set_lvb(lockres); + } if (lockres->l_flags & OCFS2_LOCK_BUSY) mlog(ML_ERROR, "destroying busy lock: \"%s\"\n", @@ -2261,8 +2314,8 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb, mlog(0, "lock %s\n", lockres->l_name); - status = dlmunlock(osb->dlm, &lockres->l_lksb, LKM_VALBLK, - lockres->l_ops->unlock_ast, lockres); + status = dlmunlock(osb->dlm, &lockres->l_lksb, lkm_flags, + ocfs2_unlock_ast, lockres); if (status != DLM_NORMAL) { ocfs2_log_dlm_error("dlmunlock", status, lockres); mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags); @@ -2309,43 +2362,26 @@ void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres) spin_unlock_irqrestore(&lockres->l_lock, flags); } -static void ocfs2_drop_osb_locks(struct ocfs2_super *osb) +void ocfs2_simple_drop_lockres(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres) { - int status; - - mlog_entry_void(); - - ocfs2_mark_lockres_freeing(&osb->osb_super_lockres); - - status = ocfs2_drop_lock(osb, &osb->osb_super_lockres, NULL); - if (status < 0) - mlog_errno(status); - - ocfs2_mark_lockres_freeing(&osb->osb_rename_lockres); - - status = ocfs2_drop_lock(osb, &osb->osb_rename_lockres, NULL); - if (status < 0) - mlog_errno(status); + int ret; - mlog_exit(status); + ocfs2_mark_lockres_freeing(lockres); + ret = ocfs2_drop_lock(osb, lockres); + if (ret) + mlog_errno(ret); } -static void ocfs2_meta_pre_drop(struct ocfs2_lock_res *lockres, void *data) +static void ocfs2_drop_osb_locks(struct ocfs2_super *osb) { - struct inode *inode = data; - - /* the metadata lock requires a bit more work as we have an - * LVB to worry about. */ - if (lockres->l_flags & OCFS2_LOCK_ATTACHED && - lockres->l_level == LKM_EXMODE && - !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) - __ocfs2_stuff_meta_lvb(inode); + ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres); + ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres); } int ocfs2_drop_inode_locks(struct inode *inode) { int status, err; - struct drop_lock_cb meta_dcb = { ocfs2_meta_pre_drop, inode, }; mlog_entry_void(); @@ -2353,24 +2389,21 @@ int ocfs2_drop_inode_locks(struct inode *inode) * ocfs2_clear_inode has done it for us. */ err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), - &OCFS2_I(inode)->ip_data_lockres, - NULL); + &OCFS2_I(inode)->ip_data_lockres); if (err < 0) mlog_errno(err); status = err; err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), - &OCFS2_I(inode)->ip_meta_lockres, - &meta_dcb); + &OCFS2_I(inode)->ip_meta_lockres); if (err < 0) mlog_errno(err); if (err < 0 && !status) status = err; err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), - &OCFS2_I(inode)->ip_rw_lockres, - NULL); + &OCFS2_I(inode)->ip_rw_lockres); if (err < 0) mlog_errno(err); if (err < 0 && !status) @@ -2419,9 +2452,10 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb, &lockres->l_lksb, dlm_flags, lockres->l_name, - lockres->l_ops->ast, + OCFS2_LOCK_ID_MAX_LEN - 1, + ocfs2_locking_ast, lockres, - lockres->l_ops->bast); + ocfs2_blocking_ast); if (status != DLM_NORMAL) { ocfs2_log_dlm_error("dlmlock", status, lockres); ret = -EINVAL; @@ -2480,7 +2514,7 @@ static int ocfs2_cancel_convert(struct ocfs2_super *osb, status = dlmunlock(osb->dlm, &lockres->l_lksb, LKM_CANCEL, - lockres->l_ops->unlock_ast, + ocfs2_unlock_ast, lockres); if (status != DLM_NORMAL) { ocfs2_log_dlm_error("dlmunlock", status, lockres); @@ -2494,115 +2528,15 @@ static int ocfs2_cancel_convert(struct ocfs2_super *osb, return ret; } -static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode, - struct ocfs2_lock_res *lockres, - int new_level) -{ - int ret; - - mlog_entry_void(); - - BUG_ON(new_level != LKM_NLMODE && new_level != LKM_PRMODE); - - if (lockres->l_flags & OCFS2_LOCK_REFRESHING) { - ret = 0; - mlog(0, "lockres %s currently being refreshed -- backing " - "off!\n", lockres->l_name); - } else if (new_level == LKM_PRMODE) - ret = !lockres->l_ex_holders && - ocfs2_inode_fully_checkpointed(inode); - else /* Must be NLMODE we're converting to. */ - ret = !lockres->l_ro_holders && !lockres->l_ex_holders && - ocfs2_inode_fully_checkpointed(inode); - - mlog_exit(ret); - return ret; -} - -static int ocfs2_do_unblock_meta(struct inode *inode, - int *requeue) -{ - int new_level; - int set_lvb = 0; - int ret = 0; - struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres; - unsigned long flags; - - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - - mlog_entry_void(); - - spin_lock_irqsave(&lockres->l_lock, flags); - - BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED)); - - mlog(0, "l_level=%d, l_blocking=%d\n", lockres->l_level, - lockres->l_blocking); - - BUG_ON(lockres->l_level != LKM_EXMODE && - lockres->l_level != LKM_PRMODE); - - if (lockres->l_flags & OCFS2_LOCK_BUSY) { - *requeue = 1; - ret = ocfs2_prepare_cancel_convert(osb, lockres); - spin_unlock_irqrestore(&lockres->l_lock, flags); - if (ret) { - ret = ocfs2_cancel_convert(osb, lockres); - if (ret < 0) - mlog_errno(ret); - } - goto leave; - } - - new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking); - - mlog(0, "l_level=%d, l_blocking=%d, new_level=%d\n", - lockres->l_level, lockres->l_blocking, new_level); - - if (ocfs2_can_downconvert_meta_lock(inode, lockres, new_level)) { - if (lockres->l_level == LKM_EXMODE) - set_lvb = 1; - - /* If the lock hasn't been refreshed yet (rare), then - * our memory inode values are old and we skip - * stuffing the lvb. There's no need to actually clear - * out the lvb here as it's value is still valid. */ - if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) { - if (set_lvb) - __ocfs2_stuff_meta_lvb(inode); - } else - mlog(0, "lockres %s: downconverting stale lock!\n", - lockres->l_name); - - mlog(0, "calling ocfs2_downconvert_lock with l_level=%d, " - "l_blocking=%d, new_level=%d\n", - lockres->l_level, lockres->l_blocking, new_level); - - ocfs2_prepare_downconvert(lockres, new_level); - spin_unlock_irqrestore(&lockres->l_lock, flags); - ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb); - goto leave; - } - if (!ocfs2_inode_fully_checkpointed(inode)) - ocfs2_start_checkpoint(osb); - - *requeue = 1; - spin_unlock_irqrestore(&lockres->l_lock, flags); - ret = 0; -leave: - mlog_exit(ret); - return ret; -} - -static int ocfs2_generic_unblock_lock(struct ocfs2_super *osb, - struct ocfs2_lock_res *lockres, - int *requeue, - ocfs2_convert_worker_t *worker) +static int ocfs2_unblock_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + struct ocfs2_unblock_ctl *ctl) { unsigned long flags; int blocking; int new_level; int ret = 0; + int set_lvb = 0; mlog_entry_void(); @@ -2612,7 +2546,7 @@ static int ocfs2_generic_unblock_lock(struct ocfs2_super *osb, recheck: if (lockres->l_flags & OCFS2_LOCK_BUSY) { - *requeue = 1; + ctl->requeue = 1; ret = ocfs2_prepare_cancel_convert(osb, lockres); spin_unlock_irqrestore(&lockres->l_lock, flags); if (ret) { @@ -2626,27 +2560,33 @@ recheck: /* if we're blocking an exclusive and we have *any* holders, * then requeue. */ if ((lockres->l_blocking == LKM_EXMODE) - && (lockres->l_ex_holders || lockres->l_ro_holders)) { - spin_unlock_irqrestore(&lockres->l_lock, flags); - *requeue = 1; - ret = 0; - goto leave; - } + && (lockres->l_ex_holders || lockres->l_ro_holders)) + goto leave_requeue; /* If it's a PR we're blocking, then only * requeue if we've got any EX holders */ if (lockres->l_blocking == LKM_PRMODE && - lockres->l_ex_holders) { - spin_unlock_irqrestore(&lockres->l_lock, flags); - *requeue = 1; - ret = 0; - goto leave; - } + lockres->l_ex_holders) + goto leave_requeue; + + /* + * Can we get a lock in this state if the holder counts are + * zero? The meta data unblock code used to check this. + */ + if ((lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH) + && (lockres->l_flags & OCFS2_LOCK_REFRESHING)) + goto leave_requeue; + + new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking); + + if (lockres->l_ops->check_downconvert + && !lockres->l_ops->check_downconvert(lockres, new_level)) + goto leave_requeue; /* If we get here, then we know that there are no more * incompatible holders (and anyone asking for an incompatible * lock is blocked). We can now downconvert the lock */ - if (!worker) + if (!lockres->l_ops->downconvert_worker) goto downconvert; /* Some lockres types want to do a bit of work before @@ -2656,7 +2596,10 @@ recheck: blocking = lockres->l_blocking; spin_unlock_irqrestore(&lockres->l_lock, flags); - worker(lockres, blocking); + ctl->unblock_action = lockres->l_ops->downconvert_worker(lockres, blocking); + + if (ctl->unblock_action == UNBLOCK_STOP_POST) + goto leave; spin_lock_irqsave(&lockres->l_lock, flags); if (blocking != lockres->l_blocking) { @@ -2666,25 +2609,43 @@ recheck: } downconvert: - *requeue = 0; - new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking); + ctl->requeue = 0; + + if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) { + if (lockres->l_level == LKM_EXMODE) + set_lvb = 1; + + /* + * We only set the lvb if the lock has been fully + * refreshed - otherwise we risk setting stale + * data. Otherwise, there's no need to actually clear + * out the lvb here as it's value is still valid. + */ + if (set_lvb && !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) + lockres->l_ops->set_lvb(lockres); + } ocfs2_prepare_downconvert(lockres, new_level); spin_unlock_irqrestore(&lockres->l_lock, flags); - ret = ocfs2_downconvert_lock(osb, lockres, new_level, 0); + ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb); leave: mlog_exit(ret); return ret; + +leave_requeue: + spin_unlock_irqrestore(&lockres->l_lock, flags); + ctl->requeue = 1; + + mlog_exit(0); + return 0; } -static void ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres, - int blocking) +static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres, + int blocking) { struct inode *inode; struct address_space *mapping; - mlog_entry_void(); - inode = ocfs2_lock_res_inode(lockres); mapping = inode->i_mapping; @@ -2705,116 +2666,159 @@ static void ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres, filemap_fdatawait(mapping); } - mlog_exit_void(); + return UNBLOCK_CONTINUE; } -int ocfs2_unblock_data(struct ocfs2_lock_res *lockres, - int *requeue) +static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres, + int new_level) { - int status; - struct inode *inode; - struct ocfs2_super *osb; - - mlog_entry_void(); - - inode = ocfs2_lock_res_inode(lockres); - osb = OCFS2_SB(inode->i_sb); - - mlog(0, "unblock inode %llu\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno); + struct inode *inode = ocfs2_lock_res_inode(lockres); + int checkpointed = ocfs2_inode_fully_checkpointed(inode); - status = ocfs2_generic_unblock_lock(osb, - lockres, - requeue, - ocfs2_data_convert_worker); - if (status < 0) - mlog_errno(status); + BUG_ON(new_level != LKM_NLMODE && new_level != LKM_PRMODE); + BUG_ON(lockres->l_level != LKM_EXMODE && !checkpointed); - mlog(0, "inode %llu, requeue = %d\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, *requeue); + if (checkpointed) + return 1; - mlog_exit(status); - return status; + ocfs2_start_checkpoint(OCFS2_SB(inode->i_sb)); + return 0; } -static int ocfs2_unblock_inode_lock(struct ocfs2_lock_res *lockres, - int *requeue) +static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres) { - int status; - struct inode *inode; - - mlog_entry_void(); - - mlog(0, "Unblock lockres %s\n", lockres->l_name); - - inode = ocfs2_lock_res_inode(lockres); + struct inode *inode = ocfs2_lock_res_inode(lockres); - status = ocfs2_generic_unblock_lock(OCFS2_SB(inode->i_sb), - lockres, - requeue, - NULL); - if (status < 0) - mlog_errno(status); - - mlog_exit(status); - return status; + __ocfs2_stuff_meta_lvb(inode); } - -int ocfs2_unblock_meta(struct ocfs2_lock_res *lockres, - int *requeue) +/* + * Does the final reference drop on our dentry lock. Right now this + * happens in the vote thread, but we could choose to simplify the + * dlmglue API and push these off to the ocfs2_wq in the future. + */ +static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres) { - int status; - struct inode *inode; - - mlog_entry_void(); + struct ocfs2_dentry_lock *dl = ocfs2_lock_res_dl(lockres); + ocfs2_dentry_lock_put(osb, dl); +} - inode = ocfs2_lock_res_inode(lockres); +/* + * d_delete() matching dentries before the lock downconvert. + * + * At this point, any process waiting to destroy the + * dentry_lock due to last ref count is stopped by the + * OCFS2_LOCK_QUEUED flag. + * + * We have two potential problems + * + * 1) If we do the last reference drop on our dentry_lock (via dput) + * we'll wind up in ocfs2_release_dentry_lock(), waiting on + * the downconvert to finish. Instead we take an elevated + * reference and push the drop until after we've completed our + * unblock processing. + * + * 2) There might be another process with a final reference, + * waiting on us to finish processing. If this is the case, we + * detect it and exit out - there's no more dentries anyway. + */ +static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres, + int blocking) +{ + struct ocfs2_dentry_lock *dl = ocfs2_lock_res_dl(lockres); + struct ocfs2_inode_info *oi = OCFS2_I(dl->dl_inode); + struct dentry *dentry; + unsigned long flags; + int extra_ref = 0; - mlog(0, "unblock inode %llu\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno); + /* + * This node is blocking another node from getting a read + * lock. This happens when we've renamed within a + * directory. We've forced the other nodes to d_delete(), but + * we never actually dropped our lock because it's still + * valid. The downconvert code will retain a PR for this node, + * so there's no further work to do. + */ + if (blocking == LKM_PRMODE) + return UNBLOCK_CONTINUE; - status = ocfs2_do_unblock_meta(inode, requeue); - if (status < 0) - mlog_errno(status); + /* + * Mark this inode as potentially orphaned. The code in + * ocfs2_delete_inode() will figure out whether it actually + * needs to be freed or not. + */ + spin_lock(&oi->ip_lock); + oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; + spin_unlock(&oi->ip_lock); - mlog(0, "inode %llu, requeue = %d\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, *requeue); + /* + * Yuck. We need to make sure however that the check of + * OCFS2_LOCK_FREEING and the extra reference are atomic with + * respect to a reference decrement or the setting of that + * flag. + */ + spin_lock_irqsave(&lockres->l_lock, flags); + spin_lock(&dentry_attach_lock); + if (!(lockres->l_flags & OCFS2_LOCK_FREEING) + && dl->dl_count) { + dl->dl_count++; + extra_ref = 1; + } + spin_unlock(&dentry_attach_lock); + spin_unlock_irqrestore(&lockres->l_lock, flags); - mlog_exit(status); - return status; -} + mlog(0, "extra_ref = %d\n", extra_ref); -/* Generic unblock function for any lockres whose private data is an - * ocfs2_super pointer. */ -static int ocfs2_unblock_osb_lock(struct ocfs2_lock_res *lockres, - int *requeue) -{ - int status; - struct ocfs2_super *osb; + /* + * We have a process waiting on us in ocfs2_dentry_iput(), + * which means we can't have any more outstanding + * aliases. There's no need to do any more work. + */ + if (!extra_ref) + return UNBLOCK_CONTINUE; + + spin_lock(&dentry_attach_lock); + while (1) { + dentry = ocfs2_find_local_alias(dl->dl_inode, + dl->dl_parent_blkno, 1); + if (!dentry) + break; + spin_unlock(&dentry_attach_lock); - mlog_entry_void(); + mlog(0, "d_delete(%.*s);\n", dentry->d_name.len, + dentry->d_name.name); - mlog(0, "Unblock lockres %s\n", lockres->l_name); + /* + * The following dcache calls may do an + * iput(). Normally we don't want that from the + * downconverting thread, but in this case it's ok + * because the requesting node already has an + * exclusive lock on the inode, so it can't be queued + * for a downconvert. + */ + d_delete(dentry); + dput(dentry); - osb = ocfs2_lock_res_super(lockres); + spin_lock(&dentry_attach_lock); + } + spin_unlock(&dentry_attach_lock); - status = ocfs2_generic_unblock_lock(osb, - lockres, - requeue, - NULL); - if (status < 0) - mlog_errno(status); + /* + * If we are the last holder of this dentry lock, there is no + * reason to downconvert so skip straight to the unlock. + */ + if (dl->dl_count == 1) + return UNBLOCK_STOP_POST; - mlog_exit(status); - return status; + return UNBLOCK_CONTINUE_POST; } void ocfs2_process_blocked_lock(struct ocfs2_super *osb, struct ocfs2_lock_res *lockres) { int status; - int requeue = 0; + struct ocfs2_unblock_ctl ctl = {0, 0,}; unsigned long flags; /* Our reference to the lockres in this function can be @@ -2825,7 +2829,6 @@ void ocfs2_process_blocked_lock(struct ocfs2_super *osb, BUG_ON(!lockres); BUG_ON(!lockres->l_ops); - BUG_ON(!lockres->l_ops->unblock); mlog(0, "lockres %s blocked.\n", lockres->l_name); @@ -2839,21 +2842,25 @@ void ocfs2_process_blocked_lock(struct ocfs2_super *osb, goto unqueue; spin_unlock_irqrestore(&lockres->l_lock, flags); - status = lockres->l_ops->unblock(lockres, &requeue); + status = ocfs2_unblock_lock(osb, lockres, &ctl); if (status < 0) mlog_errno(status); spin_lock_irqsave(&lockres->l_lock, flags); unqueue: - if (lockres->l_flags & OCFS2_LOCK_FREEING || !requeue) { + if (lockres->l_flags & OCFS2_LOCK_FREEING || !ctl.requeue) { lockres_clear_flags(lockres, OCFS2_LOCK_QUEUED); } else ocfs2_schedule_blocked_lock(osb, lockres); mlog(0, "lockres %s, requeue = %s.\n", lockres->l_name, - requeue ? "yes" : "no"); + ctl.requeue ? "yes" : "no"); spin_unlock_irqrestore(&lockres->l_lock, flags); + if (ctl.unblock_action != UNBLOCK_CONTINUE + && lockres->l_ops->post_unlock) + lockres->l_ops->post_unlock(osb, lockres); + mlog_exit_void(); } @@ -2896,8 +2903,9 @@ void ocfs2_dump_meta_lvb_info(u64 level, mlog(level, "LVB information for %s (called from %s:%u):\n", lockres->l_name, function, line); - mlog(level, "version: %u, clusters: %u\n", - be32_to_cpu(lvb->lvb_version), be32_to_cpu(lvb->lvb_iclusters)); + mlog(level, "version: %u, clusters: %u, generation: 0x%x\n", + lvb->lvb_version, be32_to_cpu(lvb->lvb_iclusters), + be32_to_cpu(lvb->lvb_igeneration)); mlog(level, "size: %llu, uid %u, gid %u, mode 0x%x\n", (unsigned long long)be64_to_cpu(lvb->lvb_isize), be32_to_cpu(lvb->lvb_iuid), be32_to_cpu(lvb->lvb_igid), diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index 243ae862ece5..4a2769387229 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h @@ -27,10 +27,14 @@ #ifndef DLMGLUE_H #define DLMGLUE_H -#define OCFS2_LVB_VERSION 3 +#include "dcache.h" + +#define OCFS2_LVB_VERSION 4 struct ocfs2_meta_lvb { - __be32 lvb_version; + __u8 lvb_version; + __u8 lvb_reserved0; + __be16 lvb_reserved1; __be32 lvb_iclusters; __be32 lvb_iuid; __be32 lvb_igid; @@ -41,7 +45,8 @@ struct ocfs2_meta_lvb { __be16 lvb_imode; __be16 lvb_inlink; __be32 lvb_iattr; - __be32 lvb_reserved[2]; + __be32 lvb_igeneration; + __be32 lvb_reserved2; }; /* ocfs2_meta_lock_full() and ocfs2_data_lock_full() 'arg_flags' flags */ @@ -57,9 +62,14 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb); void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res); void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, enum ocfs2_lock_type type, + unsigned int generation, struct inode *inode); +void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl, + u64 parent, struct inode *inode); void ocfs2_lock_res_free(struct ocfs2_lock_res *res); int ocfs2_create_new_inode_locks(struct inode *inode); +int ocfs2_create_new_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, int ex, int local); int ocfs2_drop_inode_locks(struct inode *inode); int ocfs2_data_lock_full(struct inode *inode, int write, @@ -93,7 +103,12 @@ void ocfs2_super_unlock(struct ocfs2_super *osb, int ex); int ocfs2_rename_lock(struct ocfs2_super *osb); void ocfs2_rename_unlock(struct ocfs2_super *osb); +int ocfs2_dentry_lock(struct dentry *dentry, int ex); +void ocfs2_dentry_unlock(struct dentry *dentry, int ex); + void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres); +void ocfs2_simple_drop_lockres(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres); /* for the vote thread */ void ocfs2_process_blocked_lock(struct ocfs2_super *osb, diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index ec55ab3c1214..fb91089a60a7 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -33,6 +33,7 @@ #include "dir.h" #include "dlmglue.h" +#include "dcache.h" #include "export.h" #include "inode.h" @@ -57,7 +58,7 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb, void *vobjp) return ERR_PTR(-ESTALE); } - inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno); + inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno, 0); if (IS_ERR(inode)) { mlog_errno(PTR_ERR(inode)); @@ -77,6 +78,7 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb, void *vobjp) mlog_errno(-ENOMEM); return ERR_PTR(-ENOMEM); } + result->d_op = &ocfs2_dentry_ops; mlog_exit_ptr(result); return result; @@ -113,7 +115,7 @@ static struct dentry *ocfs2_get_parent(struct dentry *child) goto bail_unlock; } - inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno); + inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0); if (IS_ERR(inode)) { mlog(ML_ERROR, "Unable to create inode %llu\n", (unsigned long long)blkno); @@ -127,6 +129,8 @@ static struct dentry *ocfs2_get_parent(struct dentry *child) parent = ERR_PTR(-ENOMEM); } + parent->d_op = &ocfs2_dentry_ops; + bail_unlock: ocfs2_meta_unlock(dir, 0); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 7bcf69154592..16e8e74dc966 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -54,8 +54,6 @@ #include "buffer_head_io.h" -#define OCFS2_FI_FLAG_NOWAIT 0x1 -#define OCFS2_FI_FLAG_DELETE 0x2 struct ocfs2_find_inode_args { u64 fi_blkno; @@ -109,7 +107,7 @@ struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb, return ilookup5(osb->sb, args.fi_ino, ocfs2_find_actor, &args); } -struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno) +struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags) { struct inode *inode = NULL; struct super_block *sb = osb->sb; @@ -127,7 +125,7 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno) } args.fi_blkno = blkno; - args.fi_flags = 0; + args.fi_flags = flags; args.fi_ino = ino_from_blkno(sb, blkno); inode = iget5_locked(sb, args.fi_ino, ocfs2_find_actor, @@ -271,7 +269,6 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, inode->i_mode = le16_to_cpu(fe->i_mode); inode->i_uid = le32_to_cpu(fe->i_uid); inode->i_gid = le32_to_cpu(fe->i_gid); - inode->i_blksize = (u32)osb->s_clustersize; /* Fast symlinks will have i_size but no allocated clusters. */ if (S_ISLNK(inode->i_mode) && !fe->i_clusters) @@ -297,15 +294,11 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, OCFS2_I(inode)->ip_orphaned_slot = OCFS2_INVALID_SLOT; OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr); - if (create_ino) - inode->i_ino = ino_from_blkno(inode->i_sb, - le64_to_cpu(fe->i_blkno)); - - mlog(0, "blkno = %llu, ino = %lu, create_ino = %s\n", - (unsigned long long)fe->i_blkno, inode->i_ino, create_ino ? "true" : "false"); - inode->i_nlink = le16_to_cpu(fe->i_links_count); + if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE; + if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) { OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino); @@ -343,12 +336,28 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, break; } + if (create_ino) { + inode->i_ino = ino_from_blkno(inode->i_sb, + le64_to_cpu(fe->i_blkno)); + + /* + * If we ever want to create system files from kernel, + * the generation argument to + * ocfs2_inode_lock_res_init() will have to change. + */ + BUG_ON(fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)); + + ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres, + OCFS2_LOCK_TYPE_META, 0, inode); + } + ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres, - OCFS2_LOCK_TYPE_RW, inode); - ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres, - OCFS2_LOCK_TYPE_META, inode); + OCFS2_LOCK_TYPE_RW, inode->i_generation, + inode); + ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_data_lockres, - OCFS2_LOCK_TYPE_DATA, inode); + OCFS2_LOCK_TYPE_DATA, inode->i_generation, + inode); ocfs2_set_inode_flags(inode); inode->i_flags |= S_NOATIME; @@ -366,15 +375,15 @@ static int ocfs2_read_locked_inode(struct inode *inode, struct ocfs2_super *osb; struct ocfs2_dinode *fe; struct buffer_head *bh = NULL; - int status; - int sysfile = 0; + int status, can_lock; + u32 generation = 0; mlog_entry("(0x%p, 0x%p)\n", inode, args); status = -EINVAL; if (inode == NULL || inode->i_sb == NULL) { mlog(ML_ERROR, "bad inode\n"); - goto bail; + return status; } sb = inode->i_sb; osb = OCFS2_SB(sb); @@ -382,50 +391,110 @@ static int ocfs2_read_locked_inode(struct inode *inode, if (!args) { mlog(ML_ERROR, "bad inode args\n"); make_bad_inode(inode); - goto bail; + return status; + } + + /* + * To improve performance of cold-cache inode stats, we take + * the cluster lock here if possible. + * + * Generally, OCFS2 never trusts the contents of an inode + * unless it's holding a cluster lock, so taking it here isn't + * a correctness issue as much as it is a performance + * improvement. + * + * There are three times when taking the lock is not a good idea: + * + * 1) During startup, before we have initialized the DLM. + * + * 2) If we are reading certain system files which never get + * cluster locks (local alloc, truncate log). + * + * 3) If the process doing the iget() is responsible for + * orphan dir recovery. We're holding the orphan dir lock and + * can get into a deadlock with another process on another + * node in ->delete_inode(). + * + * #1 and #2 can be simply solved by never taking the lock + * here for system files (which are the only type we read + * during mount). It's a heavier approach, but our main + * concern is user-accesible files anyway. + * + * #3 works itself out because we'll eventually take the + * cluster lock before trusting anything anyway. + */ + can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE) + && !(args->fi_flags & OCFS2_FI_FLAG_NOLOCK); + + /* + * To maintain backwards compatibility with older versions of + * ocfs2-tools, we still store the generation value for system + * files. The only ones that actually matter to userspace are + * the journals, but it's easier and inexpensive to just flag + * all system files similarly. + */ + if (args->fi_flags & OCFS2_FI_FLAG_SYSFILE) + generation = osb->fs_generation; + + ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres, + OCFS2_LOCK_TYPE_META, + generation, inode); + + if (can_lock) { + status = ocfs2_meta_lock(inode, NULL, NULL, 0); + if (status) { + make_bad_inode(inode); + mlog_errno(status); + return status; + } } - /* Read the FE off disk. This is safe because the kernel only - * does one read_inode2 for a new inode, and if it doesn't - * exist yet then nobody can be working on it! */ - status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0, NULL); + status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0, + can_lock ? inode : NULL); if (status < 0) { mlog_errno(status); - make_bad_inode(inode); goto bail; } + status = -EINVAL; fe = (struct ocfs2_dinode *) bh->b_data; if (!OCFS2_IS_VALID_DINODE(fe)) { mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n", (unsigned long long)fe->i_blkno, 7, fe->i_signature); - make_bad_inode(inode); goto bail; } - if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) - sysfile = 1; + /* + * This is a code bug. Right now the caller needs to + * understand whether it is asking for a system file inode or + * not so the proper lock names can be built. + */ + mlog_bug_on_msg(!!(fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) != + !!(args->fi_flags & OCFS2_FI_FLAG_SYSFILE), + "Inode %llu: system file state is ambigous\n", + (unsigned long long)args->fi_blkno); if (S_ISCHR(le16_to_cpu(fe->i_mode)) || S_ISBLK(le16_to_cpu(fe->i_mode))) inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); - status = -EINVAL; if (ocfs2_populate_inode(inode, fe, 0) < 0) { mlog(ML_ERROR, "populate failed! i_blkno=%llu, i_ino=%lu\n", (unsigned long long)fe->i_blkno, inode->i_ino); - make_bad_inode(inode); goto bail; } BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno)); - if (sysfile) - OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE; - status = 0; bail: + if (can_lock) + ocfs2_meta_unlock(inode, 0); + + if (status < 0) + make_bad_inode(inode); + if (args && bh) brelse(bh); @@ -898,9 +967,15 @@ void ocfs2_delete_inode(struct inode *inode) goto bail_unlock_inode; } - /* Mark the inode as successfully deleted. This is important - * for ocfs2_clear_inode as it will check this flag and skip - * any checkpointing work */ + /* + * Mark the inode as successfully deleted. + * + * This is important for ocfs2_clear_inode() as it will check + * this flag and skip any checkpointing work + * + * ocfs2_stuff_meta_lvb() also uses this flag to invalidate + * the LVB for other nodes. + */ OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED; bail_unlock_inode: @@ -1025,12 +1100,10 @@ void ocfs2_drop_inode(struct inode *inode) /* Testing ip_orphaned_slot here wouldn't work because we may * not have gotten a delete_inode vote from any other nodes * yet. */ - if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) { - mlog(0, "Inode was orphaned on another node, clearing nlink.\n"); - inode->i_nlink = 0; - } - - generic_drop_inode(inode); + if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) + generic_delete_inode(inode); + else + generic_drop_inode(inode); mlog_exit_void(); } @@ -1184,8 +1257,6 @@ leave: void ocfs2_refresh_inode(struct inode *inode, struct ocfs2_dinode *fe) { - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - spin_lock(&OCFS2_I(inode)->ip_lock); OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); @@ -1196,7 +1267,6 @@ void ocfs2_refresh_inode(struct inode *inode, inode->i_uid = le32_to_cpu(fe->i_uid); inode->i_gid = le32_to_cpu(fe->i_gid); inode->i_mode = le16_to_cpu(fe->i_mode); - inode->i_blksize = (u32) osb->s_clustersize; if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0) inode->i_blocks = 0; else diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 4d1e53992566..9957810fdf85 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -122,7 +122,13 @@ struct buffer_head *ocfs2_bread(struct inode *inode, int block, void ocfs2_clear_inode(struct inode *inode); void ocfs2_delete_inode(struct inode *inode); void ocfs2_drop_inode(struct inode *inode); -struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff); + +/* Flags for ocfs2_iget() */ +#define OCFS2_FI_FLAG_NOWAIT 0x1 +#define OCFS2_FI_FLAG_DELETE 0x2 +#define OCFS2_FI_FLAG_SYSFILE 0x4 +#define OCFS2_FI_FLAG_NOLOCK 0x8 +struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, int flags); struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb, u64 blkno, int delete_vote); diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index f92bf1dd379a..fd9734def551 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1493,7 +1493,8 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb, if (de->name_len == 2 && !strncmp("..", de->name, 2)) continue; - iter = ocfs2_iget(osb, le64_to_cpu(de->inode)); + iter = ocfs2_iget(osb, le64_to_cpu(de->inode), + OCFS2_FI_FLAG_NOLOCK); if (IS_ERR(iter)) continue; diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 0d3e939b1f56..849c3b4bb94a 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -179,7 +179,7 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, if (status < 0) goto bail_add; - inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno); + inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0); if (IS_ERR(inode)) { mlog(ML_ERROR, "Unable to create inode %llu\n", (unsigned long long)blkno); @@ -199,10 +199,32 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, spin_unlock(&oi->ip_lock); bail_add: - dentry->d_op = &ocfs2_dentry_ops; ret = d_splice_alias(inode, dentry); + if (inode) { + /* + * If d_splice_alias() finds a DCACHE_DISCONNECTED + * dentry, it will d_move() it on top of ourse. The + * return value will indicate this however, so in + * those cases, we switch them around for the locking + * code. + * + * NOTE: This dentry already has ->d_op set from + * ocfs2_get_parent() and ocfs2_get_dentry() + */ + if (ret) + dentry = ret; + + status = ocfs2_dentry_attach_lock(dentry, inode, + OCFS2_I(dir)->ip_blkno); + if (status) { + mlog_errno(status); + ret = ERR_PTR(status); + goto bail_unlock; + } + } + bail_unlock: /* Don't drop the cluster lock until *after* the d_add -- * unlink on another node will message us to remove that @@ -418,6 +440,13 @@ static int ocfs2_mknod(struct inode *dir, goto leave; } + status = ocfs2_dentry_attach_lock(dentry, inode, + OCFS2_I(dir)->ip_blkno); + if (status) { + mlog_errno(status); + goto leave; + } + insert_inode_hash(inode); dentry->d_op = &ocfs2_dentry_ops; d_instantiate(dentry, inode); @@ -725,6 +754,12 @@ static int ocfs2_link(struct dentry *old_dentry, goto bail; } + err = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(dir)->ip_blkno); + if (err) { + mlog_errno(err); + goto bail; + } + atomic_inc(&inode->i_count); dentry->d_op = &ocfs2_dentry_ops; d_instantiate(dentry, inode); @@ -743,6 +778,23 @@ bail: return err; } +/* + * Takes and drops an exclusive lock on the given dentry. This will + * force other nodes to drop it. + */ +static int ocfs2_remote_dentry_delete(struct dentry *dentry) +{ + int ret; + + ret = ocfs2_dentry_lock(dentry, 1); + if (ret) + mlog_errno(ret); + else + ocfs2_dentry_unlock(dentry, 1); + + return ret; +} + static int ocfs2_unlink(struct inode *dir, struct dentry *dentry) { @@ -832,8 +884,7 @@ static int ocfs2_unlink(struct inode *dir, else inode->i_nlink--; - status = ocfs2_request_unlink_vote(inode, dentry, - (unsigned int) inode->i_nlink); + status = ocfs2_remote_dentry_delete(dentry); if (status < 0) { /* This vote should succeed under all normal * circumstances. */ @@ -1019,7 +1070,6 @@ static int ocfs2_rename(struct inode *old_dir, struct buffer_head *old_inode_de_bh = NULL; // if old_dentry is a dir, // this is the 1st dirent bh nlink_t old_dir_nlink = old_dir->i_nlink, new_dir_nlink = new_dir->i_nlink; - unsigned int links_count; /* At some point it might be nice to break this function up a * bit. */ @@ -1093,23 +1143,26 @@ static int ocfs2_rename(struct inode *old_dir, } } - if (S_ISDIR(old_inode->i_mode)) { - /* Directories actually require metadata updates to - * the directory info so we can't get away with not - * doing node locking on it. */ - status = ocfs2_meta_lock(old_inode, handle, NULL, 1); - if (status < 0) { - if (status != -ENOENT) - mlog_errno(status); - goto bail; - } - - status = ocfs2_request_rename_vote(old_inode, old_dentry); - if (status < 0) { + /* + * Though we don't require an inode meta data update if + * old_inode is not a directory, we lock anyway here to ensure + * the vote thread on other nodes won't have to concurrently + * downconvert the inode and the dentry locks. + */ + status = ocfs2_meta_lock(old_inode, handle, NULL, 1); + if (status < 0) { + if (status != -ENOENT) mlog_errno(status); - goto bail; - } + goto bail; + } + + status = ocfs2_remote_dentry_delete(old_dentry); + if (status < 0) { + mlog_errno(status); + goto bail; + } + if (S_ISDIR(old_inode->i_mode)) { status = -EIO; old_inode_de_bh = ocfs2_bread(old_inode, 0, &status, 0); if (!old_inode_de_bh) @@ -1123,14 +1176,6 @@ static int ocfs2_rename(struct inode *old_dir, if (!new_inode && new_dir!=old_dir && new_dir->i_nlink >= OCFS2_LINK_MAX) goto bail; - } else { - /* Ah, the simple case - we're a file so just send a - * message. */ - status = ocfs2_request_rename_vote(old_inode, old_dentry); - if (status < 0) { - mlog_errno(status); - goto bail; - } } status = -ENOENT; @@ -1202,13 +1247,7 @@ static int ocfs2_rename(struct inode *old_dir, goto bail; } - if (S_ISDIR(new_inode->i_mode)) - links_count = 0; - else - links_count = (unsigned int) (new_inode->i_nlink - 1); - - status = ocfs2_request_unlink_vote(new_inode, new_dentry, - links_count); + status = ocfs2_remote_dentry_delete(new_dentry); if (status < 0) { mlog_errno(status); goto bail; @@ -1387,6 +1426,7 @@ static int ocfs2_rename(struct inode *old_dir, } } + ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir); status = 0; bail: if (rename_lock) @@ -1675,6 +1715,12 @@ static int ocfs2_symlink(struct inode *dir, goto bail; } + status = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(dir)->ip_blkno); + if (status) { + mlog_errno(status); + goto bail; + } + insert_inode_hash(inode); dentry->d_op = &ocfs2_dentry_ops; d_instantiate(dentry, inode); diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h index 7dd9e1e705b0..4d5d5655c185 100644 --- a/fs/ocfs2/ocfs2_lockid.h +++ b/fs/ocfs2/ocfs2_lockid.h @@ -35,12 +35,15 @@ #define OCFS2_LOCK_ID_MAX_LEN 32 #define OCFS2_LOCK_ID_PAD "000000" +#define OCFS2_DENTRY_LOCK_INO_START 18 + enum ocfs2_lock_type { OCFS2_LOCK_TYPE_META = 0, OCFS2_LOCK_TYPE_DATA, OCFS2_LOCK_TYPE_SUPER, OCFS2_LOCK_TYPE_RENAME, OCFS2_LOCK_TYPE_RW, + OCFS2_LOCK_TYPE_DENTRY, OCFS2_NUM_LOCK_TYPES }; @@ -63,6 +66,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type) case OCFS2_LOCK_TYPE_RW: c = 'W'; break; + case OCFS2_LOCK_TYPE_DENTRY: + c = 'N'; + break; default: c = '\0'; } @@ -70,4 +76,23 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type) return c; } +static char *ocfs2_lock_type_strings[] = { + [OCFS2_LOCK_TYPE_META] = "Meta", + [OCFS2_LOCK_TYPE_DATA] = "Data", + [OCFS2_LOCK_TYPE_SUPER] = "Super", + [OCFS2_LOCK_TYPE_RENAME] = "Rename", + /* Need to differntiate from [R]ename.. serializing writes is the + * important job it does, anyway. */ + [OCFS2_LOCK_TYPE_RW] = "Write/Read", + [OCFS2_LOCK_TYPE_DENTRY] = "Dentry", +}; + +static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) +{ +#ifdef __KERNEL__ + mlog_bug_on_msg(type >= OCFS2_NUM_LOCK_TYPES, "%d\n", type); +#endif + return ocfs2_lock_type_strings[type]; +} + #endif /* OCFS2_LOCKID_H */ diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index d17e33e66a1e..4c29cd7cc8e6 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -202,7 +202,7 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb) mlog_entry_void(); - new = ocfs2_iget(osb, osb->root_blkno); + new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE); if (IS_ERR(new)) { status = PTR_ERR(new); mlog_errno(status); @@ -210,7 +210,7 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb) } osb->root_inode = new; - new = ocfs2_iget(osb, osb->system_dir_blkno); + new = ocfs2_iget(osb, osb->system_dir_blkno, OCFS2_FI_FLAG_SYSFILE); if (IS_ERR(new)) { status = PTR_ERR(new); mlog_errno(status); @@ -682,7 +682,7 @@ static struct file_system_type ocfs2_fs_type = { .kill_sb = kill_block_super, /* set to the generic one * right now, but do we * need to change that? */ - .fs_flags = FS_REQUIRES_DEV, + .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, .next = NULL }; diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c index fc29cb7a437d..5df6e35d09b1 100644 --- a/fs/ocfs2/sysfile.c +++ b/fs/ocfs2/sysfile.c @@ -28,11 +28,11 @@ #include <linux/slab.h> #include <linux/highmem.h> -#include "ocfs2.h" - #define MLOG_MASK_PREFIX ML_INODE #include <cluster/masklog.h> +#include "ocfs2.h" + #include "alloc.h" #include "dir.h" #include "inode.h" @@ -115,7 +115,7 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb, goto bail; } - inode = ocfs2_iget(osb, blkno); + inode = ocfs2_iget(osb, blkno, OCFS2_FI_FLAG_SYSFILE); if (IS_ERR(inode)) { mlog_errno(PTR_ERR(inode)); inode = NULL; diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c index cf70fe2075b8..5b4dca79990b 100644 --- a/fs/ocfs2/vote.c +++ b/fs/ocfs2/vote.c @@ -74,9 +74,6 @@ struct ocfs2_vote_msg __be32 v_orphaned_slot; /* Used during delete votes */ __be32 v_nlink; /* Used during unlink votes */ } md1; /* Message type dependant 1 */ - __be32 v_unlink_namelen; - __be64 v_unlink_parent; - u8 v_unlink_dirent[OCFS2_VOTE_FILENAME_LEN]; }; /* Responses are given these values to maintain backwards @@ -100,8 +97,6 @@ struct ocfs2_vote_work { enum ocfs2_vote_request { OCFS2_VOTE_REQ_INVALID = 0, OCFS2_VOTE_REQ_DELETE, - OCFS2_VOTE_REQ_UNLINK, - OCFS2_VOTE_REQ_RENAME, OCFS2_VOTE_REQ_MOUNT, OCFS2_VOTE_REQ_UMOUNT, OCFS2_VOTE_REQ_LAST @@ -261,103 +256,13 @@ done: return response; } -static int ocfs2_match_dentry(struct dentry *dentry, - u64 parent_blkno, - unsigned int namelen, - const char *name) -{ - struct inode *parent; - - if (!dentry->d_parent) { - mlog(0, "Detached from parent.\n"); - return 0; - } - - parent = dentry->d_parent->d_inode; - /* Negative parent dentry? */ - if (!parent) - return 0; - - /* Name is in a different directory. */ - if (OCFS2_I(parent)->ip_blkno != parent_blkno) - return 0; - - if (dentry->d_name.len != namelen) - return 0; - - /* comparison above guarantees this is safe. */ - if (memcmp(dentry->d_name.name, name, namelen)) - return 0; - - return 1; -} - -static void ocfs2_process_dentry_request(struct inode *inode, - int rename, - unsigned int new_nlink, - u64 parent_blkno, - unsigned int namelen, - const char *name) -{ - struct dentry *dentry = NULL; - struct list_head *p; - struct ocfs2_inode_info *oi = OCFS2_I(inode); - - mlog(0, "parent %llu, namelen = %u, name = %.*s\n", - (unsigned long long)parent_blkno, namelen, namelen, name); - - spin_lock(&dcache_lock); - - /* Another node is removing this name from the system. It is - * up to us to find the corresponding dentry and if it exists, - * unhash it from the dcache. */ - list_for_each(p, &inode->i_dentry) { - dentry = list_entry(p, struct dentry, d_alias); - - if (ocfs2_match_dentry(dentry, parent_blkno, namelen, name)) { - mlog(0, "dentry found: %.*s\n", - dentry->d_name.len, dentry->d_name.name); - - dget_locked(dentry); - break; - } - - dentry = NULL; - } - - spin_unlock(&dcache_lock); - - if (dentry) { - d_delete(dentry); - dput(dentry); - } - - /* rename votes don't send link counts */ - if (!rename) { - mlog(0, "new_nlink = %u\n", new_nlink); - - /* We don't have the proper locks here to directly - * change i_nlink and besides, the vote is sent - * *before* the operation so it may have failed on the - * other node. This passes a hint to ocfs2_drop_inode - * to force ocfs2_delete_inode, who will take the - * proper cluster locks to sort things out. */ - if (new_nlink == 0) { - spin_lock(&oi->ip_lock); - oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; - spin_unlock(&OCFS2_I(inode)->ip_lock); - } - } -} - static void ocfs2_process_vote(struct ocfs2_super *osb, struct ocfs2_vote_msg *msg) { int net_status, vote_response; int orphaned_slot = 0; - int rename = 0; - unsigned int node_num, generation, new_nlink, namelen; - u64 blkno, parent_blkno; + unsigned int node_num, generation; + u64 blkno; enum ocfs2_vote_request request; struct inode *inode = NULL; struct ocfs2_msg_hdr *hdr = &msg->v_hdr; @@ -437,18 +342,6 @@ static void ocfs2_process_vote(struct ocfs2_super *osb, vote_response = ocfs2_process_delete_request(inode, &orphaned_slot); break; - case OCFS2_VOTE_REQ_RENAME: - rename = 1; - /* fall through */ - case OCFS2_VOTE_REQ_UNLINK: - parent_blkno = be64_to_cpu(msg->v_unlink_parent); - namelen = be32_to_cpu(msg->v_unlink_namelen); - /* new_nlink will be ignored in case of a rename vote */ - new_nlink = be32_to_cpu(msg->md1.v_nlink); - ocfs2_process_dentry_request(inode, rename, new_nlink, - parent_blkno, namelen, - msg->v_unlink_dirent); - break; default: mlog(ML_ERROR, "node %u, invalid request: %u\n", node_num, request); @@ -889,75 +782,6 @@ int ocfs2_request_delete_vote(struct inode *inode) return status; } -static void ocfs2_setup_unlink_vote(struct ocfs2_vote_msg *request, - struct dentry *dentry) -{ - struct inode *parent = dentry->d_parent->d_inode; - - /* We need some values which will uniquely identify a dentry - * on the other nodes so that they can find it and run - * d_delete against it. Parent directory block and full name - * should suffice. */ - - mlog(0, "unlink/rename request: parent: %llu name: %.*s\n", - (unsigned long long)OCFS2_I(parent)->ip_blkno, dentry->d_name.len, - dentry->d_name.name); - - request->v_unlink_parent = cpu_to_be64(OCFS2_I(parent)->ip_blkno); - request->v_unlink_namelen = cpu_to_be32(dentry->d_name.len); - memcpy(request->v_unlink_dirent, dentry->d_name.name, - dentry->d_name.len); -} - -int ocfs2_request_unlink_vote(struct inode *inode, - struct dentry *dentry, - unsigned int nlink) -{ - int status; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct ocfs2_vote_msg *request; - - if (dentry->d_name.len > OCFS2_VOTE_FILENAME_LEN) - return -ENAMETOOLONG; - - status = -ENOMEM; - request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno, - inode->i_generation, - OCFS2_VOTE_REQ_UNLINK, nlink); - if (request) { - ocfs2_setup_unlink_vote(request, dentry); - - status = ocfs2_request_vote(inode, request, NULL); - - kfree(request); - } - return status; -} - -int ocfs2_request_rename_vote(struct inode *inode, - struct dentry *dentry) -{ - int status; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct ocfs2_vote_msg *request; - - if (dentry->d_name.len > OCFS2_VOTE_FILENAME_LEN) - return -ENAMETOOLONG; - - status = -ENOMEM; - request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno, - inode->i_generation, - OCFS2_VOTE_REQ_RENAME, 0); - if (request) { - ocfs2_setup_unlink_vote(request, dentry); - - status = ocfs2_request_vote(inode, request, NULL); - - kfree(request); - } - return status; -} - int ocfs2_request_mount_vote(struct ocfs2_super *osb) { int status; diff --git a/fs/ocfs2/vote.h b/fs/ocfs2/vote.h index 9cce60703466..53ebc1c69e56 100644 --- a/fs/ocfs2/vote.h +++ b/fs/ocfs2/vote.h @@ -39,11 +39,6 @@ static inline void ocfs2_kick_vote_thread(struct ocfs2_super *osb) } int ocfs2_request_delete_vote(struct inode *inode); -int ocfs2_request_unlink_vote(struct inode *inode, - struct dentry *dentry, - unsigned int nlink); -int ocfs2_request_rename_vote(struct inode *inode, - struct dentry *dentry); int ocfs2_request_mount_vote(struct ocfs2_super *osb); int ocfs2_request_umount_vote(struct ocfs2_super *osb); int ocfs2_register_net_handlers(struct ocfs2_super *osb); diff --git a/fs/open.c b/fs/open.c index 303f06d2a7b9..304c1c7814cb 100644 --- a/fs/open.c +++ b/fs/open.c @@ -546,7 +546,8 @@ asmlinkage long sys_chdir(const char __user * filename) struct nameidata nd; int error; - error = __user_walk(filename, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &nd); + error = __user_walk(filename, + LOOKUP_FOLLOW|LOOKUP_DIRECTORY|LOOKUP_CHDIR, &nd); if (error) goto out; @@ -1172,6 +1173,7 @@ asmlinkage long sys_close(unsigned int fd) struct file * filp; struct files_struct *files = current->files; struct fdtable *fdt; + int retval; spin_lock(&files->file_lock); fdt = files_fdtable(files); @@ -1184,7 +1186,16 @@ asmlinkage long sys_close(unsigned int fd) FD_CLR(fd, fdt->close_on_exec); __put_unused_fd(files, fd); spin_unlock(&files->file_lock); - return filp_close(filp, files); + retval = filp_close(filp, files); + + /* can't restart close syscall because file table entry was cleared */ + if (unlikely(retval == -ERESTARTSYS || + retval == -ERESTARTNOINTR || + retval == -ERESTARTNOHAND || + retval == -ERESTART_RESTARTBLOCK)) + retval = -EINTR; + + return retval; out_unlock: spin_unlock(&files->file_lock); diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index 93a56bd4a2b7..592a6402e851 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -8,10 +8,10 @@ #include <linux/types.h> #include <linux/string.h> #include <linux/fs.h> -#include <linux/openprom_fs.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/seq_file.h> +#include <linux/magic.h> #include <asm/openprom.h> #include <asm/oplib.h> diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c index 63730282ad81..1bea610078b3 100644 --- a/fs/partitions/efi.c +++ b/fs/partitions/efi.c @@ -238,10 +238,9 @@ alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt) le32_to_cpu(gpt->sizeof_partition_entry); if (!count) return NULL; - pte = kmalloc(count, GFP_KERNEL); + pte = kzalloc(count, GFP_KERNEL); if (!pte) return NULL; - memset(pte, 0, count); if (read_lba(bdev, le64_to_cpu(gpt->partition_entry_lba), (u8 *) pte, @@ -269,10 +268,9 @@ alloc_read_gpt_header(struct block_device *bdev, u64 lba) if (!bdev) return NULL; - gpt = kmalloc(sizeof (gpt_header), GFP_KERNEL); + gpt = kzalloc(sizeof (gpt_header), GFP_KERNEL); if (!gpt) return NULL; - memset(gpt, 0, sizeof (gpt_header)); if (read_lba(bdev, lba, (u8 *) gpt, sizeof (gpt_header)) < sizeof (gpt_header)) { @@ -526,9 +524,8 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes) lastlba = last_lba(bdev); if (!force_gpt) { /* This will be added to the EFI Spec. per Intel after v1.02. */ - legacymbr = kmalloc(sizeof (*legacymbr), GFP_KERNEL); + legacymbr = kzalloc(sizeof (*legacymbr), GFP_KERNEL); if (legacymbr) { - memset(legacymbr, 0, sizeof (*legacymbr)); read_lba(bdev, 0, (u8 *) legacymbr, sizeof (*legacymbr)); good_pmbr = is_pmbr_valid(legacymbr, lastlba); diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c index 8f12587c3129..4f8df71e49d3 100644 --- a/fs/partitions/msdos.c +++ b/fs/partitions/msdos.c @@ -58,6 +58,31 @@ msdos_magic_present(unsigned char *p) return (p[0] == MSDOS_LABEL_MAGIC1 && p[1] == MSDOS_LABEL_MAGIC2); } +/* Value is EBCDIC 'IBMA' */ +#define AIX_LABEL_MAGIC1 0xC9 +#define AIX_LABEL_MAGIC2 0xC2 +#define AIX_LABEL_MAGIC3 0xD4 +#define AIX_LABEL_MAGIC4 0xC1 +static int aix_magic_present(unsigned char *p, struct block_device *bdev) +{ + Sector sect; + unsigned char *d; + int ret = 0; + + if (p[0] != AIX_LABEL_MAGIC1 && + p[1] != AIX_LABEL_MAGIC2 && + p[2] != AIX_LABEL_MAGIC3 && + p[3] != AIX_LABEL_MAGIC4) + return 0; + d = read_dev_sector(bdev, 7, §); + if (d) { + if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M') + ret = 1; + put_dev_sector(sect); + }; + return ret; +} + /* * Create devices for each logical partition in an extended partition. * The logical partitions form a linked list, with each entry being @@ -393,6 +418,12 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev) return 0; } + if (aix_magic_present(data, bdev)) { + put_dev_sector(sect); + printk( " [AIX]"); + return 0; + } + /* * Now that the 55aa signature is present, this is probably * either the boot sector of a FAT filesystem or a DOS-type diff --git a/fs/pipe.c b/fs/pipe.c index 20352573e025..f3b6f71e9d0b 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -879,7 +879,6 @@ static struct inode * get_pipe_inode(void) inode->i_uid = current->fsuid; inode->i_gid = current->fsgid; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; - inode->i_blksize = PAGE_SIZE; return inode; diff --git a/fs/proc/array.c b/fs/proc/array.c index 0b615d62a159..c0e554971df0 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -347,6 +347,8 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) sigemptyset(&sigign); sigemptyset(&sigcatch); cutime = cstime = utime = stime = cputime_zero; + + mutex_lock(&tty_mutex); read_lock(&tasklist_lock); if (task->sighand) { spin_lock_irq(&task->sighand->siglock); @@ -388,6 +390,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) } ppid = pid_alive(task) ? task->group_leader->real_parent->tgid : 0; read_unlock(&tasklist_lock); + mutex_unlock(&tty_mutex); if (!whole || num_threads<2) wchan = get_wchan(task); diff --git a/fs/proc/base.c b/fs/proc/base.c index fe8d55fb17cc..89c20d9d50bf 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -797,7 +797,7 @@ out_no_task: static ssize_t mem_write(struct file * file, const char * buf, size_t count, loff_t *ppos) { - int copied = 0; + int copied; char *page; struct task_struct *task = get_proc_task(file->f_dentry->d_inode); unsigned long dst = *ppos; @@ -814,6 +814,7 @@ static ssize_t mem_write(struct file * file, const char * buf, if (!page) goto out; + copied = 0; while (count > 0) { int this_len, retval; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 146a434ba944..987c773dbb20 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -28,6 +28,7 @@ do { \ (vmi)->largest_chunk = 0; \ } while(0) +extern int nommu_vma_show(struct seq_file *, struct vm_area_struct *); #endif extern void create_seq_entry(char *name, mode_t mode, const struct file_operations *f); diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 6a984f64edd7..1294eda4acae 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -100,7 +100,7 @@ static int notesize(struct memelfnote *en) int sz; sz = sizeof(struct elf_note); - sz += roundup(strlen(en->name), 4); + sz += roundup((strlen(en->name) + 1), 4); sz += roundup(en->datasz, 4); return sz; @@ -116,7 +116,7 @@ static char *storenote(struct memelfnote *men, char *bufp) #define DUMP_WRITE(addr,nr) do { memcpy(bufp,addr,nr); bufp += nr; } while(0) - en.n_namesz = strlen(men->name); + en.n_namesz = strlen(men->name) + 1; en.n_descsz = men->datasz; en.n_type = men->type; @@ -279,12 +279,11 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) tsz = elf_buflen - *fpos; if (buflen < tsz) tsz = buflen; - elf_buf = kmalloc(elf_buflen, GFP_ATOMIC); + elf_buf = kzalloc(elf_buflen, GFP_ATOMIC); if (!elf_buf) { read_unlock(&kclist_lock); return -ENOMEM; } - memset(elf_buf, 0, elf_buflen); elf_kcore_store_hdr(elf_buf, nphdr, elf_buflen); read_unlock(&kclist_lock); if (copy_to_user(buffer, elf_buf + *fpos, tsz)) { @@ -330,10 +329,9 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) unsigned long curstart = start; unsigned long cursize = tsz; - elf_buf = kmalloc(tsz, GFP_KERNEL); + elf_buf = kzalloc(tsz, GFP_KERNEL); if (!elf_buf) return -ENOMEM; - memset(elf_buf, 0, tsz); read_lock(&vmlist_lock); for (m=vmlist; m && cursize; m=m->next) { diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index cff10ab1af63..d7dbdf9e0f49 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -33,19 +33,15 @@ #include "internal.h" /* - * display a list of all the VMAs the kernel knows about - * - nommu kernals have a single flat list + * display a single VMA to a sequenced file */ -static int nommu_vma_list_show(struct seq_file *m, void *v) +int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) { - struct vm_area_struct *vma; unsigned long ino = 0; struct file *file; dev_t dev = 0; int flags, len; - vma = rb_entry((struct rb_node *) v, struct vm_area_struct, vm_rb); - flags = vma->vm_flags; file = vma->vm_file; @@ -78,6 +74,18 @@ static int nommu_vma_list_show(struct seq_file *m, void *v) return 0; } +/* + * display a list of all the VMAs the kernel knows about + * - nommu kernals have a single flat list + */ +static int nommu_vma_list_show(struct seq_file *m, void *v) +{ + struct vm_area_struct *vma; + + vma = rb_entry((struct rb_node *) v, struct vm_area_struct, vm_rb); + return nommu_vma_show(m, vma); +} + static void *nommu_vma_list_start(struct seq_file *m, loff_t *_pos) { struct rb_node *_rb; diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 942156225447..5bbd60896050 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -157,10 +157,12 @@ static int meminfo_read_proc(char *page, char **start, off_t off, "SwapCached: %8lu kB\n" "Active: %8lu kB\n" "Inactive: %8lu kB\n" +#ifdef CONFIG_HIGHMEM "HighTotal: %8lu kB\n" "HighFree: %8lu kB\n" "LowTotal: %8lu kB\n" "LowFree: %8lu kB\n" +#endif "SwapTotal: %8lu kB\n" "SwapFree: %8lu kB\n" "Dirty: %8lu kB\n" @@ -168,6 +170,8 @@ static int meminfo_read_proc(char *page, char **start, off_t off, "AnonPages: %8lu kB\n" "Mapped: %8lu kB\n" "Slab: %8lu kB\n" + "SReclaimable: %8lu kB\n" + "SUnreclaim: %8lu kB\n" "PageTables: %8lu kB\n" "NFS_Unstable: %8lu kB\n" "Bounce: %8lu kB\n" @@ -183,17 +187,22 @@ static int meminfo_read_proc(char *page, char **start, off_t off, K(total_swapcache_pages), K(active), K(inactive), +#ifdef CONFIG_HIGHMEM K(i.totalhigh), K(i.freehigh), K(i.totalram-i.totalhigh), K(i.freeram-i.freehigh), +#endif K(i.totalswap), K(i.freeswap), K(global_page_state(NR_FILE_DIRTY)), K(global_page_state(NR_WRITEBACK)), K(global_page_state(NR_ANON_PAGES)), K(global_page_state(NR_FILE_MAPPED)), - K(global_page_state(NR_SLAB)), + K(global_page_state(NR_SLAB_RECLAIMABLE) + + global_page_state(NR_SLAB_UNRECLAIMABLE)), + K(global_page_state(NR_SLAB_RECLAIMABLE)), + K(global_page_state(NR_SLAB_UNRECLAIMABLE)), K(global_page_state(NR_PAGETABLE)), K(global_page_state(NR_UNSTABLE_NFS)), K(global_page_state(NR_BOUNCE)), diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 0a163a4f7764..6b769afac55a 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -122,11 +122,6 @@ struct mem_size_stats unsigned long private_dirty; }; -__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma) -{ - return NULL; -} - static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss) { struct proc_maps_private *priv = m->private; diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 4616ed50ffcd..091aa8e48e02 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -138,25 +138,63 @@ out: } /* - * Albert D. Cahalan suggested to fake entries for the traditional - * sections here. This might be worth investigating. + * display mapping lines for a particular process's /proc/pid/maps */ -static int show_map(struct seq_file *m, void *v) +static int show_map(struct seq_file *m, void *_vml) { - return 0; + struct vm_list_struct *vml = _vml; + return nommu_vma_show(m, vml->vma); } + static void *m_start(struct seq_file *m, loff_t *pos) { + struct proc_maps_private *priv = m->private; + struct vm_list_struct *vml; + struct mm_struct *mm; + loff_t n = *pos; + + /* pin the task and mm whilst we play with them */ + priv->task = get_pid_task(priv->pid, PIDTYPE_PID); + if (!priv->task) + return NULL; + + mm = get_task_mm(priv->task); + if (!mm) { + put_task_struct(priv->task); + priv->task = NULL; + return NULL; + } + + down_read(&mm->mmap_sem); + + /* start from the Nth VMA */ + for (vml = mm->context.vmlist; vml; vml = vml->next) + if (n-- == 0) + return vml; return NULL; } -static void m_stop(struct seq_file *m, void *v) + +static void m_stop(struct seq_file *m, void *_vml) { + struct proc_maps_private *priv = m->private; + + if (priv->task) { + struct mm_struct *mm = priv->task->mm; + up_read(&mm->mmap_sem); + mmput(mm); + put_task_struct(priv->task); + } } -static void *m_next(struct seq_file *m, void *v, loff_t *pos) + +static void *m_next(struct seq_file *m, void *_vml, loff_t *pos) { - return NULL; + struct vm_list_struct *vml = _vml; + + (*pos)++; + return vml ? vml->next : NULL; } -static struct seq_operations proc_pid_maps_op = { + +static struct seq_operations proc_pid_maps_ops = { .start = m_start, .next = m_next, .stop = m_stop, @@ -165,11 +203,19 @@ static struct seq_operations proc_pid_maps_op = { static int maps_open(struct inode *inode, struct file *file) { - int ret; - ret = seq_open(file, &proc_pid_maps_op); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = NULL; + struct proc_maps_private *priv; + int ret = -ENOMEM; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (priv) { + priv->pid = proc_pid(inode); + ret = seq_open(file, &proc_pid_maps_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = priv; + } else { + kfree(priv); + } } return ret; } @@ -178,6 +224,6 @@ struct file_operations proc_maps_operations = { .open = maps_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = seq_release_private, }; diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 5a903491e697..5a41db2a218d 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -358,11 +358,10 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent) const char *errmsg; struct qnx4_sb_info *qs; - qs = kmalloc(sizeof(struct qnx4_sb_info), GFP_KERNEL); + qs = kzalloc(sizeof(struct qnx4_sb_info), GFP_KERNEL); if (!qs) return -ENOMEM; s->s_fs_info = qs; - memset(qs, 0, sizeof(struct qnx4_sb_info)); sb_set_blocksize(s, QNX4_BLOCK_SIZE); @@ -497,7 +496,6 @@ static void qnx4_read_inode(struct inode *inode) inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->di_ctime); inode->i_ctime.tv_nsec = 0; inode->i_blocks = le32_to_cpu(raw_inode->di_first_xtnt.xtnt_size); - inode->i_blksize = QNX4_DIR_ENTRY_SIZE; memcpy(qnx4_inode, raw_inode, QNX4_DIR_ENTRY_SIZE); if (S_ISREG(inode->i_mode)) { @@ -557,9 +555,7 @@ static int init_inodecache(void) static void destroy_inodecache(void) { - if (kmem_cache_destroy(qnx4_inode_cachep)) - printk(KERN_INFO - "qnx4_inode_cache: not all structures were freed\n"); + kmem_cache_destroy(qnx4_inode_cachep); } static int qnx4_get_sb(struct file_system_type *fs_type, diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index b9677335cc8d..bc0e51662424 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -58,7 +58,6 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev) inode->i_mode = mode; inode->i_uid = current->fsuid; inode->i_gid = current->fsgid; - inode->i_blksize = PAGE_CACHE_SIZE; inode->i_blocks = 0; inode->i_mapping->a_ops = &ramfs_aops; inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info; diff --git a/fs/reiserfs/Makefile b/fs/reiserfs/Makefile index 3a59309f3ca9..0eb7ac080484 100644 --- a/fs/reiserfs/Makefile +++ b/fs/reiserfs/Makefile @@ -28,7 +28,7 @@ endif # will work around it. If any other architecture displays this behavior, # add it here. ifeq ($(CONFIG_PPC32),y) -EXTRA_CFLAGS := -O1 +EXTRA_CFLAGS := $(call cc-ifversion, -lt, 0400, -O1) endif TAGS: diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 1627edd50810..1cfbe857ba27 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -130,7 +130,7 @@ static int reiserfs_sync_file(struct file *p_s_filp, reiserfs_write_lock(p_s_inode->i_sb); barrier_done = reiserfs_commit_for_inode(p_s_inode); reiserfs_write_unlock(p_s_inode->i_sb); - if (barrier_done != 1) + if (barrier_done != 1 && reiserfs_barrier_flush(p_s_inode->i_sb)) blkdev_issue_flush(p_s_inode->i_sb->s_bdev, NULL); if (barrier_done < 0) return barrier_done; diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 52f1e2136546..7e5a2f5ebeb0 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -17,8 +17,6 @@ #include <linux/writeback.h> #include <linux/quotaops.h> -extern int reiserfs_default_io_size; /* default io size devuned in super.c */ - static int reiserfs_commit_write(struct file *f, struct page *page, unsigned from, unsigned to); static int reiserfs_prepare_write(struct file *f, struct page *page, @@ -1122,7 +1120,6 @@ static void init_inode(struct inode *inode, struct path *path) ih = PATH_PITEM_HEAD(path); copy_key(INODE_PKEY(inode), &(ih->ih_key)); - inode->i_blksize = reiserfs_default_io_size; INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list)); REISERFS_I(inode)->i_flags = 0; @@ -1130,9 +1127,9 @@ static void init_inode(struct inode *inode, struct path *path) REISERFS_I(inode)->i_prealloc_count = 0; REISERFS_I(inode)->i_trans_id = 0; REISERFS_I(inode)->i_jl = NULL; - REISERFS_I(inode)->i_acl_access = NULL; - REISERFS_I(inode)->i_acl_default = NULL; - init_rwsem(&REISERFS_I(inode)->xattr_sem); + reiserfs_init_acl_access(inode); + reiserfs_init_acl_default(inode); + reiserfs_init_xattr_rwsem(inode); if (stat_data_v1(ih)) { struct stat_data_v1 *sd = @@ -1837,9 +1834,9 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, REISERFS_I(inode)->i_attrs = REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK; sd_attrs_to_i_attrs(REISERFS_I(inode)->i_attrs, inode); - REISERFS_I(inode)->i_acl_access = NULL; - REISERFS_I(inode)->i_acl_default = NULL; - init_rwsem(&REISERFS_I(inode)->xattr_sem); + reiserfs_init_acl_access(inode); + reiserfs_init_acl_default(inode); + reiserfs_init_xattr_rwsem(inode); if (old_format_only(sb)) make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET, @@ -1877,7 +1874,6 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, } // these do not go to on-disk stat data inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid); - inode->i_blksize = reiserfs_default_io_size; // store in in-core inode the key of stat data and version all // object items will have (directory items will have old offset @@ -1978,11 +1974,13 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, * iput doesn't deadlock in reiserfs_delete_xattrs. The locking * code really needs to be reworked, but this will take care of it * for now. -jeffm */ +#ifdef CONFIG_REISERFS_FS_POSIX_ACL if (REISERFS_I(dir)->i_acl_default && !IS_ERR(REISERFS_I(dir)->i_acl_default)) { reiserfs_write_unlock_xattrs(dir->i_sb); iput(inode); reiserfs_write_lock_xattrs(dir->i_sb); } else +#endif iput(inode); return err; } diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 9b3672d69367..e6b5ccf23f15 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -1186,6 +1186,21 @@ static struct reiserfs_journal_list *find_newer_jl_for_cn(struct return NULL; } +static int newer_jl_done(struct reiserfs_journal_cnode *cn) +{ + struct super_block *sb = cn->sb; + b_blocknr_t blocknr = cn->blocknr; + + cn = cn->hprev; + while (cn) { + if (cn->sb == sb && cn->blocknr == blocknr && cn->jlist && + atomic_read(&cn->jlist->j_commit_left) != 0) + return 0; + cn = cn->hprev; + } + return 1; +} + static void remove_journal_hash(struct super_block *, struct reiserfs_journal_cnode **, struct reiserfs_journal_list *, unsigned long, @@ -1604,6 +1619,31 @@ static int flush_journal_list(struct super_block *s, return err; } +static int test_transaction(struct super_block *s, + struct reiserfs_journal_list *jl) +{ + struct reiserfs_journal_cnode *cn; + + if (jl->j_len == 0 || atomic_read(&jl->j_nonzerolen) == 0) + return 1; + + cn = jl->j_realblock; + while (cn) { + /* if the blocknr == 0, this has been cleared from the hash, + ** skip it + */ + if (cn->blocknr == 0) { + goto next; + } + if (cn->bh && !newer_jl_done(cn)) + return 0; + next: + cn = cn->next; + cond_resched(); + } + return 0; +} + static int write_one_transaction(struct super_block *s, struct reiserfs_journal_list *jl, struct buffer_chunk *chunk) @@ -3433,16 +3473,6 @@ static void flush_async_commits(void *p) flush_commit_list(p_s_sb, jl, 1); } unlock_kernel(); - /* - * this is a little racey, but there's no harm in missing - * the filemap_fdata_write - */ - if (!atomic_read(&journal->j_async_throttle) - && !reiserfs_is_journal_aborted(journal)) { - atomic_inc(&journal->j_async_throttle); - filemap_fdatawrite(p_s_sb->s_bdev->bd_inode->i_mapping); - atomic_dec(&journal->j_async_throttle); - } } /* @@ -3844,7 +3874,9 @@ static void flush_old_journal_lists(struct super_block *s) entry = journal->j_journal_list.next; jl = JOURNAL_LIST_ENTRY(entry); /* this check should always be run, to send old lists to disk */ - if (jl->j_timestamp < (now - (JOURNAL_MAX_TRANS_AGE * 4))) { + if (jl->j_timestamp < (now - (JOURNAL_MAX_TRANS_AGE * 4)) && + atomic_read(&jl->j_commit_left) == 0 && + test_transaction(s, jl)) { flush_used_journal_lists(s, jl); } else { break; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 5567328f1041..80fc3b32802f 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -510,8 +510,10 @@ static void init_once(void *foo, kmem_cache_t * cachep, unsigned long flags) SLAB_CTOR_CONSTRUCTOR) { INIT_LIST_HEAD(&ei->i_prealloc_list); inode_init_once(&ei->vfs_inode); +#ifdef CONFIG_REISERFS_FS_POSIX_ACL ei->i_acl_access = NULL; ei->i_acl_default = NULL; +#endif } } @@ -530,9 +532,7 @@ static int init_inodecache(void) static void destroy_inodecache(void) { - if (kmem_cache_destroy(reiserfs_inode_cachep)) - reiserfs_warning(NULL, - "reiserfs_inode_cache: not all structures were freed"); + kmem_cache_destroy(reiserfs_inode_cachep); } /* we don't mark inodes dirty, we just log them */ @@ -562,6 +562,7 @@ static void reiserfs_dirty_inode(struct inode *inode) reiserfs_write_unlock(inode->i_sb); } +#ifdef CONFIG_REISERFS_FS_POSIX_ACL static void reiserfs_clear_inode(struct inode *inode) { struct posix_acl *acl; @@ -576,6 +577,9 @@ static void reiserfs_clear_inode(struct inode *inode) posix_acl_release(acl); REISERFS_I(inode)->i_acl_default = NULL; } +#else +#define reiserfs_clear_inode NULL +#endif #ifdef CONFIG_QUOTA static ssize_t reiserfs_quota_write(struct super_block *, int, const char *, @@ -725,12 +729,6 @@ static const arg_desc_t error_actions[] = { {NULL, 0, 0}, }; -int reiserfs_default_io_size = 128 * 1024; /* Default recommended I/O size is 128k. - There might be broken applications that are - confused by this. Use nolargeio mount option - to get usual i/o size = PAGE_SIZE. - */ - /* proceed only one option from a list *cur - string containing of mount options opts - array of options which are accepted opt_arg - if option is found and requires an argument and if it is specifed @@ -959,19 +957,8 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin } if (c == 'w') { - char *p = NULL; - int val = simple_strtoul(arg, &p, 0); - - if (*p != '\0') { - reiserfs_warning(s, - "reiserfs_parse_options: non-numeric value %s for nolargeio option", - arg); - return 0; - } - if (val) - reiserfs_default_io_size = PAGE_SIZE; - else - reiserfs_default_io_size = 128 * 1024; + reiserfs_warning(s, "reiserfs: nolargeio option is no longer supported"); + return 0; } if (c == 'j') { diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c index 22eed61ebf69..ddcd9e1ef282 100644 --- a/fs/romfs/inode.c +++ b/fs/romfs/inode.c @@ -589,8 +589,7 @@ static int init_inodecache(void) static void destroy_inodecache(void) { - if (kmem_cache_destroy(romfs_inode_cachep)) - printk(KERN_INFO "romfs_inode_cache: not all structures were freed\n"); + kmem_cache_destroy(romfs_inode_cachep); } static int romfs_remount(struct super_block *sb, int *flags, char *data) diff --git a/fs/select.c b/fs/select.c index 33b72ba0f86f..dcbc1112b7ec 100644 --- a/fs/select.c +++ b/fs/select.c @@ -658,8 +658,6 @@ int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, s64 *timeout) unsigned int i; struct poll_list *head; struct poll_list *walk; - struct fdtable *fdt; - int max_fdset; /* Allocate small arguments on the stack to save memory and be faster - use long to make sure the buffer is aligned properly on 64 bit archs to avoid unaligned access */ @@ -667,11 +665,7 @@ int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, s64 *timeout) struct poll_list *stack_pp = NULL; /* Do a sanity check on nfds ... */ - rcu_read_lock(); - fdt = files_fdtable(current->files); - max_fdset = fdt->max_fdset; - rcu_read_unlock(); - if (nfds > max_fdset && nfds > OPEN_MAX) + if (nfds > current->signal->rlim[RLIMIT_NOFILE].rlim_cur) return -EINVAL; poll_initwait(&table); diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c index a1ed657c3c84..2c122ee83adb 100644 --- a/fs/smbfs/inode.c +++ b/fs/smbfs/inode.c @@ -89,8 +89,7 @@ static int init_inodecache(void) static void destroy_inodecache(void) { - if (kmem_cache_destroy(smb_inode_cachep)) - printk(KERN_INFO "smb_inode_cache: not all structures were freed\n"); + kmem_cache_destroy(smb_inode_cachep); } static int smb_remount(struct super_block *sb, int *flags, char *data) @@ -167,7 +166,6 @@ smb_get_inode_attr(struct inode *inode, struct smb_fattr *fattr) fattr->f_mtime = inode->i_mtime; fattr->f_ctime = inode->i_ctime; fattr->f_atime = inode->i_atime; - fattr->f_blksize= inode->i_blksize; fattr->f_blocks = inode->i_blocks; fattr->attr = SMB_I(inode)->attr; @@ -201,7 +199,6 @@ smb_set_inode_attr(struct inode *inode, struct smb_fattr *fattr) inode->i_uid = fattr->f_uid; inode->i_gid = fattr->f_gid; inode->i_ctime = fattr->f_ctime; - inode->i_blksize= fattr->f_blksize; inode->i_blocks = fattr->f_blocks; inode->i_size = fattr->f_size; inode->i_mtime = fattr->f_mtime; diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c index c3495059889d..40e174db9872 100644 --- a/fs/smbfs/proc.c +++ b/fs/smbfs/proc.c @@ -1826,7 +1826,6 @@ smb_init_dirent(struct smb_sb_info *server, struct smb_fattr *fattr) fattr->f_nlink = 1; fattr->f_uid = server->mnt->uid; fattr->f_gid = server->mnt->gid; - fattr->f_blksize = SMB_ST_BLKSIZE; fattr->f_unix = 0; } diff --git a/fs/smbfs/request.c b/fs/smbfs/request.c index c8e96195b96e..0fb74697abc4 100644 --- a/fs/smbfs/request.c +++ b/fs/smbfs/request.c @@ -49,8 +49,7 @@ int smb_init_request_cache(void) void smb_destroy_request_cache(void) { - if (kmem_cache_destroy(req_cachep)) - printk(KERN_INFO "smb_destroy_request_cache: not all structures were freed\n"); + kmem_cache_destroy(req_cachep); } /* diff --git a/fs/stat.c b/fs/stat.c index 3a44dcf97da2..60a31d5e5966 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -14,6 +14,7 @@ #include <linux/namei.h> #include <linux/security.h> #include <linux/syscalls.h> +#include <linux/pagemap.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -32,7 +33,7 @@ void generic_fillattr(struct inode *inode, struct kstat *stat) stat->ctime = inode->i_ctime; stat->size = i_size_read(inode); stat->blocks = inode->i_blocks; - stat->blksize = inode->i_blksize; + stat->blksize = (1 << inode->i_blkbits); } EXPORT_SYMBOL(generic_fillattr); diff --git a/fs/super.c b/fs/super.c index 5c4c94d5495e..6987824d0dce 100644 --- a/fs/super.c +++ b/fs/super.c @@ -199,7 +199,7 @@ EXPORT_SYMBOL(deactivate_super); * success, 0 if we had failed (superblock contents was already dead or * dying when grab_super() had been called). */ -static int grab_super(struct super_block *s) +static int grab_super(struct super_block *s) __releases(sb_lock) { s->s_count++; spin_unlock(&sb_lock); diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c index c16a93c353c0..98022e41cda1 100644 --- a/fs/sysfs/bin.c +++ b/fs/sysfs/bin.c @@ -10,6 +10,7 @@ #include <linux/errno.h> #include <linux/fs.h> +#include <linux/kernel.h> #include <linux/kobject.h> #include <linux/module.h> #include <linux/slab.h> @@ -176,7 +177,6 @@ const struct file_operations bin_fops = { * sysfs_create_bin_file - create binary file for object. * @kobj: object. * @attr: attribute descriptor. - * */ int sysfs_create_bin_file(struct kobject * kobj, struct bin_attribute * attr) @@ -191,13 +191,16 @@ int sysfs_create_bin_file(struct kobject * kobj, struct bin_attribute * attr) * sysfs_remove_bin_file - remove binary file for object. * @kobj: object. * @attr: attribute descriptor. - * */ -int sysfs_remove_bin_file(struct kobject * kobj, struct bin_attribute * attr) +void sysfs_remove_bin_file(struct kobject * kobj, struct bin_attribute * attr) { - sysfs_hash_and_remove(kobj->dentry,attr->attr.name); - return 0; + if (sysfs_hash_and_remove(kobj->dentry, attr->attr.name) < 0) { + printk(KERN_ERR "%s: " + "bad dentry or inode or no such file: \"%s\"\n", + __FUNCTION__, attr->attr.name); + dump_stack(); + } } EXPORT_SYMBOL_GPL(sysfs_create_bin_file); diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 61c42430cba3..5f3d725d1125 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -43,7 +43,7 @@ static struct sysfs_dirent * sysfs_new_dirent(struct sysfs_dirent * parent_sd, memset(sd, 0, sizeof(*sd)); atomic_set(&sd->s_count, 1); - atomic_set(&sd->s_event, 0); + atomic_set(&sd->s_event, 1); INIT_LIST_HEAD(&sd->s_children); list_add(&sd->s_sibling, &parent_sd->s_children); sd->s_element = element; diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 9889e54e1f13..e79e38d52c00 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -12,6 +12,7 @@ #include <linux/namei.h> #include <linux/backing-dev.h> #include <linux/capability.h> +#include <linux/errno.h> #include "sysfs.h" extern struct super_block * sysfs_sb; @@ -124,7 +125,6 @@ struct inode * sysfs_new_inode(mode_t mode, struct sysfs_dirent * sd) { struct inode * inode = new_inode(sysfs_sb); if (inode) { - inode->i_blksize = PAGE_CACHE_SIZE; inode->i_blocks = 0; inode->i_mapping->a_ops = &sysfs_aops; inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info; @@ -234,17 +234,18 @@ void sysfs_drop_dentry(struct sysfs_dirent * sd, struct dentry * parent) } } -void sysfs_hash_and_remove(struct dentry * dir, const char * name) +int sysfs_hash_and_remove(struct dentry * dir, const char * name) { struct sysfs_dirent * sd; struct sysfs_dirent * parent_sd; + int found = 0; if (!dir) - return; + return -ENOENT; if (dir->d_inode == NULL) /* no inode means this hasn't been made visible yet */ - return; + return -ENOENT; parent_sd = dir->d_fsdata; mutex_lock(&dir->d_inode->i_mutex); @@ -255,8 +256,11 @@ void sysfs_hash_and_remove(struct dentry * dir, const char * name) list_del_init(&sd->s_sibling); sysfs_drop_dentry(sd, dir); sysfs_put(sd); + found = 1; break; } } mutex_unlock(&dir->d_inode->i_mutex); + + return found ? 0 : -ENOENT; } diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index d2eac3ceed5f..f50e3cc2ded8 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -3,6 +3,7 @@ */ #include <linux/fs.h> +#include <linux/mount.h> #include <linux/module.h> #include <linux/kobject.h> #include <linux/namei.h> @@ -82,10 +83,19 @@ exit1: */ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char * name) { - struct dentry * dentry = kobj->dentry; + struct dentry *dentry = NULL; int error = -EEXIST; - BUG_ON(!kobj || !kobj->dentry || !name); + BUG_ON(!name); + + if (!kobj) { + if (sysfs_mount && sysfs_mount->mnt_sb) + dentry = sysfs_mount->mnt_sb->s_root; + } else + dentry = kobj->dentry; + + if (!dentry) + return -EFAULT; mutex_lock(&dentry->d_inode->i_mutex); if (!sysfs_dirent_exist(dentry->d_fsdata, name)) diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 3651ffb5ec09..6f3d6bd52887 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -10,7 +10,7 @@ extern int sysfs_make_dirent(struct sysfs_dirent *, struct dentry *, void *, umode_t, int); extern int sysfs_add_file(struct dentry *, const struct attribute *, int); -extern void sysfs_hash_and_remove(struct dentry * dir, const char * name); +extern int sysfs_hash_and_remove(struct dentry * dir, const char * name); extern struct sysfs_dirent *sysfs_find(struct sysfs_dirent *dir, const char * name); extern int sysfs_create_subdir(struct kobject *, const char *, struct dentry **); diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c index 9b585d1081c0..115ab0d6f4bc 100644 --- a/fs/sysv/ialloc.c +++ b/fs/sysv/ialloc.c @@ -170,7 +170,7 @@ struct inode * sysv_new_inode(const struct inode * dir, mode_t mode) inode->i_uid = current->fsuid; inode->i_ino = fs16_to_cpu(sbi, ino); inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; - inode->i_blocks = inode->i_blksize = 0; + inode->i_blocks = 0; memset(SYSV_I(inode)->i_data, 0, sizeof(SYSV_I(inode)->i_data)); SYSV_I(inode)->i_dir_start_lookup = 0; insert_inode_hash(inode); diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 58b2d22142ba..d63c5e48b050 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -201,7 +201,7 @@ static void sysv_read_inode(struct inode *inode) inode->i_ctime.tv_nsec = 0; inode->i_atime.tv_nsec = 0; inode->i_mtime.tv_nsec = 0; - inode->i_blocks = inode->i_blksize = 0; + inode->i_blocks = 0; si = SYSV_I(inode); for (block = 0; block < 10+1+1+1; block++) diff --git a/fs/sysv/super.c b/fs/sysv/super.c index 876639b93321..350cba5d6803 100644 --- a/fs/sysv/super.c +++ b/fs/sysv/super.c @@ -369,10 +369,9 @@ static int sysv_fill_super(struct super_block *sb, void *data, int silent) if (64 != sizeof (struct sysv_inode)) panic("sysv fs: bad inode size"); - sbi = kmalloc(sizeof(struct sysv_sb_info), GFP_KERNEL); + sbi = kzalloc(sizeof(struct sysv_sb_info), GFP_KERNEL); if (!sbi) return -ENOMEM; - memset(sbi, 0, sizeof(struct sysv_sb_info)); sbi->s_sb = sb; sbi->s_block_base = 0; @@ -453,10 +452,9 @@ static int v7_fill_super(struct super_block *sb, void *data, int silent) if (64 != sizeof (struct sysv_inode)) panic("sysv fs: bad i-node size"); - sbi = kmalloc(sizeof(struct sysv_sb_info), GFP_KERNEL); + sbi = kzalloc(sizeof(struct sysv_sb_info), GFP_KERNEL); if (!sbi) return -ENOMEM; - memset(sbi, 0, sizeof(struct sysv_sb_info)); sbi->s_sb = sb; sbi->s_block_base = 0; diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index 33323473e3c4..8206983f2ebf 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -121,7 +121,6 @@ struct inode * udf_new_inode (struct inode *dir, int mode, int * err) UDF_I_LOCATION(inode).logicalBlockNum = block; UDF_I_LOCATION(inode).partitionReferenceNum = UDF_I_LOCATION(dir).partitionReferenceNum; inode->i_ino = udf_get_lb_pblock(sb, UDF_I_LOCATION(inode), 0); - inode->i_blksize = PAGE_SIZE; inode->i_blocks = 0; UDF_I_LENEATTR(inode) = 0; UDF_I_LENALLOC(inode) = 0; @@ -130,14 +129,12 @@ struct inode * udf_new_inode (struct inode *dir, int mode, int * err) { UDF_I_EFE(inode) = 1; UDF_UPDATE_UDFREV(inode->i_sb, UDF_VERS_USE_EXTENDED_FE); - UDF_I_DATA(inode) = kmalloc(inode->i_sb->s_blocksize - sizeof(struct extendedFileEntry), GFP_KERNEL); - memset(UDF_I_DATA(inode), 0x00, inode->i_sb->s_blocksize - sizeof(struct extendedFileEntry)); + UDF_I_DATA(inode) = kzalloc(inode->i_sb->s_blocksize - sizeof(struct extendedFileEntry), GFP_KERNEL); } else { UDF_I_EFE(inode) = 0; - UDF_I_DATA(inode) = kmalloc(inode->i_sb->s_blocksize - sizeof(struct fileEntry), GFP_KERNEL); - memset(UDF_I_DATA(inode), 0x00, inode->i_sb->s_blocksize - sizeof(struct fileEntry)); + UDF_I_DATA(inode) = kzalloc(inode->i_sb->s_blocksize - sizeof(struct fileEntry), GFP_KERNEL); } if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_AD_IN_ICB)) UDF_I_ALLOCTYPE(inode) = ICBTAG_FLAG_AD_IN_ICB; diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 605f5111b6d8..b223b32db991 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -916,8 +916,6 @@ __udf_read_inode(struct inode *inode) * i_nlink = 1 * i_op = NULL; */ - inode->i_blksize = PAGE_SIZE; - bh = udf_read_ptagged(inode->i_sb, UDF_I_LOCATION(inode), 0, &ident); if (!bh) diff --git a/fs/udf/super.c b/fs/udf/super.c index fcce1a21a51b..1d3b5d2070e5 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -156,8 +156,7 @@ static int init_inodecache(void) static void destroy_inodecache(void) { - if (kmem_cache_destroy(udf_inode_cachep)) - printk(KERN_INFO "udf_inode_cache: not all structures were freed\n"); + kmem_cache_destroy(udf_inode_cachep); } /* Superblock operations */ @@ -1622,6 +1621,10 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) goto error_out; } + if (UDF_SB_PARTFLAGS(sb, UDF_SB_PARTITION(sb)) & UDF_PART_FLAG_READ_ONLY) + printk("UDF-fs: Partition marked readonly; forcing readonly mount\n"); + sb->s_flags |= MS_RDONLY; + if ( udf_find_fileset(sb, &fileset, &rootdir) ) { printk("UDF-fs: No fileset found\n"); diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index 9501dcd3b213..2ad1259c6eca 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -255,7 +255,6 @@ cg_found: inode->i_gid = current->fsgid; inode->i_ino = cg * uspi->s_ipg + bit; - inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size (for stat), not the fs block size */ inode->i_blocks = 0; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; ufsi->i_flags = UFS_I(dir)->i_flags; diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 30c6e8a9446c..ee1eaa6f4ec2 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -741,7 +741,6 @@ void ufs_read_inode(struct inode * inode) ufs1_read_inode(inode, ufs_inode + ufs_inotofsbo(inode->i_ino)); } - inode->i_blksize = PAGE_SIZE;/*This is the optimal IO size (for stat)*/ inode->i_version++; ufsi->i_lastfrag = (inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift; diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 992ee0b87cc3..ec79e3091d1b 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -611,11 +611,10 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) UFSD("ENTER\n"); - sbi = kmalloc(sizeof(struct ufs_sb_info), GFP_KERNEL); + sbi = kzalloc(sizeof(struct ufs_sb_info), GFP_KERNEL); if (!sbi) goto failed_nomem; sb->s_fs_info = sbi; - memset(sbi, 0, sizeof(struct ufs_sb_info)); UFSD("flag %u\n", (int)(sb->s_flags & MS_RDONLY)); @@ -1245,8 +1244,7 @@ static int init_inodecache(void) static void destroy_inodecache(void) { - if (kmem_cache_destroy(ufs_inode_cachep)) - printk(KERN_INFO "ufs_inode_cache: not all structures were freed\n"); + kmem_cache_destroy(ufs_inode_cachep); } #ifdef CONFIG_QUOTA diff --git a/fs/xfs/Makefile-linux-2.6 b/fs/xfs/Makefile-linux-2.6 index 9e7f85986d0d..291948d5085a 100644 --- a/fs/xfs/Makefile-linux-2.6 +++ b/fs/xfs/Makefile-linux-2.6 @@ -30,7 +30,6 @@ ifeq ($(CONFIG_XFS_TRACE),y) EXTRA_CFLAGS += -DXFS_BLI_TRACE EXTRA_CFLAGS += -DXFS_BMAP_TRACE EXTRA_CFLAGS += -DXFS_BMBT_TRACE - EXTRA_CFLAGS += -DXFS_DIR_TRACE EXTRA_CFLAGS += -DXFS_DIR2_TRACE EXTRA_CFLAGS += -DXFS_DQUOT_TRACE EXTRA_CFLAGS += -DXFS_ILOCK_TRACE diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c index aba7fcf881a2..d59737589815 100644 --- a/fs/xfs/linux-2.6/kmem.c +++ b/fs/xfs/linux-2.6/kmem.c @@ -34,6 +34,14 @@ kmem_alloc(size_t size, unsigned int __nocast flags) gfp_t lflags = kmem_flags_convert(flags); void *ptr; +#ifdef DEBUG + if (unlikely(!(flags & KM_LARGE) && (size > PAGE_SIZE))) { + printk(KERN_WARNING "Large %s attempt, size=%ld\n", + __FUNCTION__, (long)size); + dump_stack(); + } +#endif + do { if (size < MAX_SLAB_SIZE || retries > MAX_VMALLOCS) ptr = kmalloc(size, lflags); @@ -60,6 +68,27 @@ kmem_zalloc(size_t size, unsigned int __nocast flags) return ptr; } +void * +kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize, + unsigned int __nocast flags) +{ + void *ptr; + size_t kmsize = maxsize; + unsigned int kmflags = (flags & ~KM_SLEEP) | KM_NOSLEEP; + + while (!(ptr = kmem_zalloc(kmsize, kmflags))) { + if ((kmsize <= minsize) && (flags & KM_NOSLEEP)) + break; + if ((kmsize >>= 1) <= minsize) { + kmsize = minsize; + kmflags = flags; + } + } + if (ptr) + *size = kmsize; + return ptr; +} + void kmem_free(void *ptr, size_t size) { diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h index 939bd84bc7ee..9ebabdf7829c 100644 --- a/fs/xfs/linux-2.6/kmem.h +++ b/fs/xfs/linux-2.6/kmem.h @@ -30,6 +30,7 @@ #define KM_NOSLEEP 0x0002u #define KM_NOFS 0x0004u #define KM_MAYFAIL 0x0008u +#define KM_LARGE 0x0010u /* * We use a special process flag to avoid recursive callbacks into @@ -41,7 +42,7 @@ kmem_flags_convert(unsigned int __nocast flags) { gfp_t lflags; - BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL)); + BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL|KM_LARGE)); if (flags & KM_NOSLEEP) { lflags = GFP_ATOMIC | __GFP_NOWARN; @@ -54,8 +55,9 @@ kmem_flags_convert(unsigned int __nocast flags) } extern void *kmem_alloc(size_t, unsigned int __nocast); -extern void *kmem_realloc(void *, size_t, size_t, unsigned int __nocast); extern void *kmem_zalloc(size_t, unsigned int __nocast); +extern void *kmem_zalloc_greedy(size_t *, size_t, size_t, unsigned int __nocast); +extern void *kmem_realloc(void *, size_t, size_t, unsigned int __nocast); extern void kmem_free(void *, size_t); /* @@ -91,8 +93,8 @@ kmem_zone_free(kmem_zone_t *zone, void *ptr) static inline void kmem_zone_destroy(kmem_zone_t *zone) { - if (zone && kmem_cache_destroy(zone)) - BUG(); + if (zone) + kmem_cache_destroy(zone); } extern void *kmem_zone_alloc(kmem_zone_t *, unsigned int __nocast); diff --git a/fs/xfs/linux-2.6/sema.h b/fs/xfs/linux-2.6/sema.h index b25090094cca..2009e6d922ce 100644 --- a/fs/xfs/linux-2.6/sema.h +++ b/fs/xfs/linux-2.6/sema.h @@ -29,8 +29,6 @@ typedef struct semaphore sema_t; -#define init_sema(sp, val, c, d) sema_init(sp, val) -#define initsema(sp, val) sema_init(sp, val) #define initnsema(sp, val, name) sema_init(sp, val) #define psema(sp, b) down(sp) #define vsema(sp) up(sp) diff --git a/fs/xfs/linux-2.6/sv.h b/fs/xfs/linux-2.6/sv.h index 9a8ad481b008..351a8f454bd1 100644 --- a/fs/xfs/linux-2.6/sv.h +++ b/fs/xfs/linux-2.6/sv.h @@ -53,8 +53,6 @@ static inline void _sv_wait(sv_t *sv, spinlock_t *lock, int state, remove_wait_queue(&sv->waiters, &wait); } -#define init_sv(sv,type,name,flag) \ - init_waitqueue_head(&(sv)->waiters) #define sv_init(sv,flag,name) \ init_waitqueue_head(&(sv)->waiters) #define sv_destroy(sv) \ diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 34dcb43a7837..09360cf1e1f2 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -71,7 +71,7 @@ xfs_page_trace( int tag, struct inode *inode, struct page *page, - int mask) + unsigned long pgoff) { xfs_inode_t *ip; bhv_vnode_t *vp = vn_from_inode(inode); @@ -91,7 +91,7 @@ xfs_page_trace( (void *)ip, (void *)inode, (void *)page, - (void *)((unsigned long)mask), + (void *)pgoff, (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)), (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)), (void *)((unsigned long)((isize >> 32) & 0xffffffff)), @@ -105,7 +105,7 @@ xfs_page_trace( (void *)NULL); } #else -#define xfs_page_trace(tag, inode, page, mask) +#define xfs_page_trace(tag, inode, page, pgoff) #endif /* @@ -1197,7 +1197,7 @@ xfs_vm_releasepage( .nr_to_write = 1, }; - xfs_page_trace(XFS_RELEASEPAGE_ENTER, inode, page, gfp_mask); + xfs_page_trace(XFS_RELEASEPAGE_ENTER, inode, page, 0); if (!page_has_buffers(page)) return 0; @@ -1356,7 +1356,6 @@ xfs_end_io_direct( ioend->io_size = size; xfs_finish_ioend(ioend); } else { - ASSERT(size >= 0); xfs_destroy_ioend(ioend); } diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 2af528dcfb04..9bbadafdcb00 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2000-2006 Silicon Graphics, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -318,8 +318,12 @@ xfs_buf_free( if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1)) free_address(bp->b_addr - bp->b_offset); - for (i = 0; i < bp->b_page_count; i++) - page_cache_release(bp->b_pages[i]); + for (i = 0; i < bp->b_page_count; i++) { + struct page *page = bp->b_pages[i]; + + ASSERT(!PagePrivate(page)); + page_cache_release(page); + } _xfs_buf_free_pages(bp); } else if (bp->b_flags & _XBF_KMEM_ALLOC) { /* @@ -400,6 +404,7 @@ _xfs_buf_lookup_pages( nbytes = min_t(size_t, size, PAGE_CACHE_SIZE - offset); size -= nbytes; + ASSERT(!PagePrivate(page)); if (!PageUptodate(page)) { page_count--; if (blocksize >= PAGE_CACHE_SIZE) { @@ -768,7 +773,7 @@ xfs_buf_get_noaddr( _xfs_buf_initialize(bp, target, 0, len, 0); try_again: - data = kmem_alloc(malloc_len, KM_SLEEP | KM_MAYFAIL); + data = kmem_alloc(malloc_len, KM_SLEEP | KM_MAYFAIL | KM_LARGE); if (unlikely(data == NULL)) goto fail_free_buf; @@ -1117,10 +1122,10 @@ xfs_buf_bio_end_io( do { struct page *page = bvec->bv_page; + ASSERT(!PagePrivate(page)); if (unlikely(bp->b_error)) { if (bp->b_flags & XBF_READ) ClearPageUptodate(page); - SetPageError(page); } else if (blocksize >= PAGE_CACHE_SIZE) { SetPageUptodate(page); } else if (!PagePrivate(page) && @@ -1156,16 +1161,16 @@ _xfs_buf_ioapply( total_nr_pages = bp->b_page_count; map_i = 0; - if (bp->b_flags & _XBF_RUN_QUEUES) { - bp->b_flags &= ~_XBF_RUN_QUEUES; - rw = (bp->b_flags & XBF_READ) ? READ_SYNC : WRITE_SYNC; - } else { - rw = (bp->b_flags & XBF_READ) ? READ : WRITE; - } - if (bp->b_flags & XBF_ORDERED) { ASSERT(!(bp->b_flags & XBF_READ)); rw = WRITE_BARRIER; + } else if (bp->b_flags & _XBF_RUN_QUEUES) { + ASSERT(!(bp->b_flags & XBF_READ_AHEAD)); + bp->b_flags &= ~_XBF_RUN_QUEUES; + rw = (bp->b_flags & XBF_WRITE) ? WRITE_SYNC : READ_SYNC; + } else { + rw = (bp->b_flags & XBF_WRITE) ? WRITE : + (bp->b_flags & XBF_READ_AHEAD) ? READA : READ; } /* Special code path for reading a sub page size buffer in -- @@ -1681,6 +1686,7 @@ xfsbufd( xfs_buf_t *bp, *n; struct list_head *dwq = &target->bt_delwrite_queue; spinlock_t *dwlk = &target->bt_delwrite_lock; + int count; current->flags |= PF_MEMALLOC; @@ -1696,6 +1702,7 @@ xfsbufd( schedule_timeout_interruptible( xfs_buf_timer_centisecs * msecs_to_jiffies(10)); + count = 0; age = xfs_buf_age_centisecs * msecs_to_jiffies(10); spin_lock(dwlk); list_for_each_entry_safe(bp, n, dwq, b_list) { @@ -1711,9 +1718,11 @@ xfsbufd( break; } - bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q); + bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q| + _XBF_RUN_QUEUES); bp->b_flags |= XBF_WRITE; - list_move(&bp->b_list, &tmp); + list_move_tail(&bp->b_list, &tmp); + count++; } } spin_unlock(dwlk); @@ -1724,12 +1733,12 @@ xfsbufd( list_del_init(&bp->b_list); xfs_buf_iostrategy(bp); - - blk_run_address_space(target->bt_mapping); } if (as_list_len > 0) purge_addresses(); + if (count) + blk_run_address_space(target->bt_mapping); clear_bit(XBT_FORCE_FLUSH, &target->bt_flags); } while (!kthread_should_stop()); @@ -1767,7 +1776,7 @@ xfs_flush_buftarg( continue; } - list_move(&bp->b_list, &tmp); + list_move_tail(&bp->b_list, &tmp); } spin_unlock(dwlk); @@ -1776,7 +1785,7 @@ xfs_flush_buftarg( */ list_for_each_entry_safe(bp, n, &tmp, b_list) { xfs_buf_lock(bp); - bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q); + bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q|_XBF_RUN_QUEUES); bp->b_flags |= XBF_WRITE; if (wait) bp->b_flags &= ~XBF_ASYNC; @@ -1786,6 +1795,9 @@ xfs_flush_buftarg( xfs_buf_iostrategy(bp); } + if (wait) + blk_run_address_space(target->bt_mapping); + /* * Remaining list items must be flushed before returning */ @@ -1797,9 +1809,6 @@ xfs_flush_buftarg( xfs_buf_relse(bp); } - if (wait) - blk_run_address_space(target->bt_mapping); - return pincount; } diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index 7858703ed84c..9dd235cb0107 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -298,11 +298,6 @@ extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *); #define XFS_BUF_UNWRITE(bp) ((bp)->b_flags &= ~XBF_WRITE) #define XFS_BUF_ISWRITE(bp) ((bp)->b_flags & XBF_WRITE) -#define XFS_BUF_ISUNINITIAL(bp) (0) -#define XFS_BUF_UNUNINITIAL(bp) (0) - -#define XFS_BUF_BP_ISMAPPED(bp) (1) - #define XFS_BUF_IODONE_FUNC(bp) ((bp)->b_iodone) #define XFS_BUF_SET_IODONE_FUNC(bp, func) ((bp)->b_iodone = (func)) #define XFS_BUF_CLR_IODONE_FUNC(bp) ((bp)->b_iodone = NULL) @@ -393,8 +388,6 @@ static inline int XFS_bwrite(xfs_buf_t *bp) return error; } -#define XFS_bdwrite(bp) xfs_buf_iostart(bp, XBF_DELWRI | XBF_ASYNC) - static inline int xfs_bdwrite(void *mp, xfs_buf_t *bp) { bp->b_strat = xfs_bdstrat_cb; diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 3d4f6dff2113..41cfcba7ce49 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -370,7 +370,7 @@ xfs_file_readdir( /* Try fairly hard to get memory */ do { - if ((read_buf = (caddr_t)kmalloc(rlen, GFP_KERNEL))) + if ((read_buf = kmalloc(rlen, GFP_KERNEL))) break; rlen >>= 1; } while (rlen >= 1024); diff --git a/fs/xfs/linux-2.6/xfs_globals.c b/fs/xfs/linux-2.6/xfs_globals.c index 6c162c3dde7e..ed3a5e1b4b67 100644 --- a/fs/xfs/linux-2.6/xfs_globals.c +++ b/fs/xfs/linux-2.6/xfs_globals.c @@ -34,7 +34,7 @@ xfs_param_t xfs_params = { .restrict_chown = { 0, 1, 1 }, .sgid_inherit = { 0, 0, 1 }, .symlink_mode = { 0, 0, 1 }, - .panic_mask = { 0, 0, 127 }, + .panic_mask = { 0, 0, 255 }, .error_level = { 0, 3, 11 }, .syncd_timer = { 1*100, 30*100, 7200*100}, .stats_clear = { 0, 0, 1 }, diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index 6e52a5dd38d8..a74f854d91e6 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -653,7 +653,7 @@ xfs_attrmulti_by_handle( STATIC int xfs_ioc_space( bhv_desc_t *bdp, - bhv_vnode_t *vp, + struct inode *inode, struct file *filp, int flags, unsigned int cmd, @@ -735,7 +735,7 @@ xfs_ioctl( !capable(CAP_SYS_ADMIN)) return -EPERM; - return xfs_ioc_space(bdp, vp, filp, ioflags, cmd, arg); + return xfs_ioc_space(bdp, inode, filp, ioflags, cmd, arg); case XFS_IOC_DIOINFO: { struct dioattr da; @@ -763,6 +763,8 @@ xfs_ioctl( return xfs_ioc_fsgeometry(mp, arg); case XFS_IOC_GETVERSION: + return put_user(inode->i_generation, (int __user *)arg); + case XFS_IOC_GETXFLAGS: case XFS_IOC_SETXFLAGS: case XFS_IOC_FSGETXATTR: @@ -957,7 +959,7 @@ xfs_ioctl( STATIC int xfs_ioc_space( bhv_desc_t *bdp, - bhv_vnode_t *vp, + struct inode *inode, struct file *filp, int ioflags, unsigned int cmd, @@ -967,13 +969,13 @@ xfs_ioc_space( int attr_flags = 0; int error; - if (vp->v_inode.i_flags & (S_IMMUTABLE|S_APPEND)) + if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) return -XFS_ERROR(EPERM); if (!(filp->f_mode & FMODE_WRITE)) return -XFS_ERROR(EBADF); - if (!VN_ISREG(vp)) + if (!S_ISREG(inode->i_mode)) return -XFS_ERROR(EINVAL); if (copy_from_user(&bf, arg, sizeof(bf))) @@ -1264,13 +1266,6 @@ xfs_ioc_xattr( break; } - case XFS_IOC_GETVERSION: { - flags = vn_to_inode(vp)->i_generation; - if (copy_to_user(arg, &flags, sizeof(flags))) - error = -EFAULT; - break; - } - default: error = -ENOTTY; break; diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index d9180020de63..3ba814ae3bba 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -553,13 +553,13 @@ xfs_vn_follow_link( ASSERT(dentry); ASSERT(nd); - link = (char *)kmalloc(MAXPATHLEN+1, GFP_KERNEL); + link = kmalloc(MAXPATHLEN+1, GFP_KERNEL); if (!link) { nd_set_link(nd, ERR_PTR(-ENOMEM)); return NULL; } - uio = (uio_t *)kmalloc(sizeof(uio_t), GFP_KERNEL); + uio = kmalloc(sizeof(uio_t), GFP_KERNEL); if (!uio) { kfree(link); nd_set_link(nd, ERR_PTR(-ENOMEM)); @@ -623,12 +623,27 @@ xfs_vn_getattr( { struct inode *inode = dentry->d_inode; bhv_vnode_t *vp = vn_from_inode(inode); - int error = 0; + bhv_vattr_t vattr = { .va_mask = XFS_AT_STAT }; + int error; - if (unlikely(vp->v_flag & VMODIFIED)) - error = vn_revalidate(vp); - if (!error) - generic_fillattr(inode, stat); + error = bhv_vop_getattr(vp, &vattr, ATTR_LAZY, NULL); + if (likely(!error)) { + stat->size = i_size_read(inode); + stat->dev = inode->i_sb->s_dev; + stat->rdev = (vattr.va_rdev == 0) ? 0 : + MKDEV(sysv_major(vattr.va_rdev) & 0x1ff, + sysv_minor(vattr.va_rdev)); + stat->mode = vattr.va_mode; + stat->nlink = vattr.va_nlink; + stat->uid = vattr.va_uid; + stat->gid = vattr.va_gid; + stat->ino = vattr.va_nodeid; + stat->atime = vattr.va_atime; + stat->mtime = vattr.va_mtime; + stat->ctime = vattr.va_ctime; + stat->blocks = vattr.va_nblocks; + stat->blksize = vattr.va_blocksize; + } return -error; } diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index a13f75c1a936..2b0e0018738a 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -148,11 +148,7 @@ BUFFER_FNS(PrivateStart, unwritten); (current->flags = ((current->flags & ~(f)) | (*(sp) & (f)))) #define NBPP PAGE_SIZE -#define DPPSHFT (PAGE_SHIFT - 9) #define NDPP (1 << (PAGE_SHIFT - 9)) -#define dtop(DD) (((DD) + NDPP - 1) >> DPPSHFT) -#define dtopt(DD) ((DD) >> DPPSHFT) -#define dpoff(DD) ((DD) & (NDPP-1)) #define NBBY 8 /* number of bits per byte */ #define NBPC PAGE_SIZE /* Number of bytes per click */ @@ -172,8 +168,6 @@ BUFFER_FNS(PrivateStart, unwritten); #define btoct(x) ((__psunsigned_t)(x)>>BPCSHIFT) #define btoc64(x) (((__uint64_t)(x)+(NBPC-1))>>BPCSHIFT) #define btoct64(x) ((__uint64_t)(x)>>BPCSHIFT) -#define io_btoc(x) (((__psunsigned_t)(x)+(IO_NBPC-1))>>IO_BPCSHIFT) -#define io_btoct(x) ((__psunsigned_t)(x)>>IO_BPCSHIFT) /* off_t bytes to clicks */ #define offtoc(x) (((__uint64_t)(x)+(NBPC-1))>>BPCSHIFT) @@ -186,7 +180,6 @@ BUFFER_FNS(PrivateStart, unwritten); #define ctob(x) ((__psunsigned_t)(x)<<BPCSHIFT) #define btoct(x) ((__psunsigned_t)(x)>>BPCSHIFT) #define ctob64(x) ((__uint64_t)(x)<<BPCSHIFT) -#define io_ctob(x) ((__psunsigned_t)(x)<<IO_BPCSHIFT) /* bytes to clicks */ #define btoc(x) (((__psunsigned_t)(x)+(NBPC-1))>>BPCSHIFT) @@ -339,4 +332,11 @@ static inline __uint64_t roundup_64(__uint64_t x, __uint32_t y) return(x * y); } +static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y) +{ + x += y - 1; + do_div(x, y); + return x; +} + #endif /* __XFS_LINUX__ */ diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index ee788b1cb364..55992b40353c 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -270,12 +270,12 @@ xfs_read( } } - if (unlikely((ioflags & IO_ISDIRECT) && VN_CACHED(vp))) - bhv_vop_flushinval_pages(vp, ctooff(offtoct(*offset)), - -1, FI_REMAPF_LOCKED); - - if (unlikely(ioflags & IO_ISDIRECT)) + if (unlikely(ioflags & IO_ISDIRECT)) { + if (VN_CACHED(vp)) + bhv_vop_flushinval_pages(vp, ctooff(offtoct(*offset)), + -1, FI_REMAPF_LOCKED); mutex_unlock(&inode->i_mutex); + } xfs_rw_enter_trace(XFS_READ_ENTER, &ip->i_iocore, (void *)iovp, segs, *offset, ioflags); diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 4754f342a5d3..38c4d128a8c0 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -171,7 +171,6 @@ xfs_revalidate_inode( break; } - inode->i_blksize = xfs_preferred_iosize(mp); inode->i_generation = ip->i_d.di_gen; i_size_write(inode, ip->i_d.di_size); inode->i_blocks = @@ -228,7 +227,9 @@ xfs_initialize_vnode( xfs_revalidate_inode(XFS_BHVTOM(bdp), vp, ip); xfs_set_inodeops(inode); + spin_lock(&ip->i_flags_lock); ip->i_flags &= ~XFS_INEW; + spin_unlock(&ip->i_flags_lock); barrier(); unlock_new_inode(inode); diff --git a/fs/xfs/linux-2.6/xfs_vfs.h b/fs/xfs/linux-2.6/xfs_vfs.h index 91fc2c4b3353..da255bdf5260 100644 --- a/fs/xfs/linux-2.6/xfs_vfs.h +++ b/fs/xfs/linux-2.6/xfs_vfs.h @@ -79,7 +79,7 @@ typedef enum { #define VFS_RDONLY 0x0001 /* read-only vfs */ #define VFS_GRPID 0x0002 /* group-ID assigned from directory */ #define VFS_DMI 0x0004 /* filesystem has the DMI enabled */ -#define VFS_UMOUNT 0x0008 /* unmount in progress */ +/* ---- VFS_UMOUNT ---- 0x0008 -- unneeded, fixed via kthread APIs */ #define VFS_32BITINODES 0x0010 /* do not use inums above 32 bits */ #define VFS_END 0x0010 /* max flag */ diff --git a/fs/xfs/linux-2.6/xfs_vnode.c b/fs/xfs/linux-2.6/xfs_vnode.c index 6628d96b6fd6..553fa731ade5 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.c +++ b/fs/xfs/linux-2.6/xfs_vnode.c @@ -122,7 +122,6 @@ vn_revalidate_core( inode->i_blocks = vap->va_nblocks; inode->i_mtime = vap->va_mtime; inode->i_ctime = vap->va_ctime; - inode->i_blksize = vap->va_blocksize; if (vap->va_xflags & XFS_XFLAG_IMMUTABLE) inode->i_flags |= S_IMMUTABLE; else diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h index c42b3221b20c..515f5fdea57a 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.h +++ b/fs/xfs/linux-2.6/xfs_vnode.h @@ -85,8 +85,6 @@ typedef enum { #define VN_BHV_HEAD(vp) ((bhv_head_t *)(&((vp)->v_bh))) #define vn_bhv_head_init(bhp,name) bhv_head_init(bhp,name) #define vn_bhv_remove(bhp,bdp) bhv_remove(bhp,bdp) -#define vn_bhv_lookup(bhp,ops) bhv_lookup(bhp,ops) -#define vn_bhv_lookup_unlocked(bhp,ops) bhv_lookup_unlocked(bhp,ops) /* * Vnode to Linux inode mapping. diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c index 5b2dcc58b244..33ad5af386e0 100644 --- a/fs/xfs/quota/xfs_dquot_item.c +++ b/fs/xfs/quota/xfs_dquot_item.c @@ -382,18 +382,6 @@ xfs_qm_dquot_logitem_unlock( /* - * The transaction with the dquot locked has aborted. The dquot - * must not be dirty within the transaction. We simply unlock just - * as if the transaction had been cancelled. - */ -STATIC void -xfs_qm_dquot_logitem_abort( - xfs_dq_logitem_t *ql) -{ - xfs_qm_dquot_logitem_unlock(ql); -} - -/* * this needs to stamp an lsn into the dquot, I think. * rpc's that look at user dquot's would then have to * push on the dependency recorded in the dquot @@ -426,7 +414,6 @@ STATIC struct xfs_item_ops xfs_dquot_item_ops = { .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) xfs_qm_dquot_logitem_committed, .iop_push = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_push, - .iop_abort = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_abort, .iop_pushbuf = (void(*)(xfs_log_item_t*)) xfs_qm_dquot_logitem_pushbuf, .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) @@ -559,17 +546,6 @@ xfs_qm_qoff_logitem_committed(xfs_qoff_logitem_t *qf, xfs_lsn_t lsn) } /* - * The transaction of which this QUOTAOFF is a part has been aborted. - * Just clean up after ourselves. - * Shouldn't this never happen in the case of qoffend logitems? XXX - */ -STATIC void -xfs_qm_qoff_logitem_abort(xfs_qoff_logitem_t *qf) -{ - kmem_free(qf, sizeof(xfs_qoff_logitem_t)); -} - -/* * There isn't much you can do to push on an quotaoff item. It is simply * stuck waiting for the log to be flushed to disk. */ @@ -644,7 +620,6 @@ STATIC struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) xfs_qm_qoffend_logitem_committed, .iop_push = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_push, - .iop_abort = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_abort, .iop_pushbuf = NULL, .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) xfs_qm_qoffend_logitem_committing @@ -667,7 +642,6 @@ STATIC struct xfs_item_ops xfs_qm_qoff_logitem_ops = { .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) xfs_qm_qoff_logitem_committed, .iop_push = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_push, - .iop_abort = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_abort, .iop_pushbuf = NULL, .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) xfs_qm_qoff_logitem_committing diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index e23e45535c48..7c6a3a50379e 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -112,17 +112,17 @@ xfs_Gqm_init(void) { xfs_dqhash_t *udqhash, *gdqhash; xfs_qm_t *xqm; - uint i, hsize, flags = KM_SLEEP | KM_MAYFAIL; + size_t hsize; + uint i; /* * Initialize the dquot hash tables. */ - hsize = XFS_QM_HASHSIZE_HIGH; - while (!(udqhash = kmem_zalloc(hsize * sizeof(xfs_dqhash_t), flags))) { - if ((hsize >>= 1) <= XFS_QM_HASHSIZE_LOW) - flags = KM_SLEEP; - } - gdqhash = kmem_zalloc(hsize * sizeof(xfs_dqhash_t), KM_SLEEP); + udqhash = kmem_zalloc_greedy(&hsize, + XFS_QM_HASHSIZE_LOW, XFS_QM_HASHSIZE_HIGH, + KM_SLEEP | KM_MAYFAIL | KM_LARGE); + gdqhash = kmem_zalloc(hsize, KM_SLEEP | KM_LARGE); + hsize /= sizeof(xfs_dqhash_t); ndquot = hsize << 8; xqm = kmem_zalloc(sizeof(xfs_qm_t), KM_SLEEP); diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h index 4568deb6da86..689407de0a20 100644 --- a/fs/xfs/quota/xfs_qm.h +++ b/fs/xfs/quota/xfs_qm.h @@ -56,12 +56,6 @@ extern kmem_zone_t *qm_dqtrxzone; #define XFS_QM_HASHSIZE_HIGH ((NBPP * 4) / sizeof(xfs_dqhash_t)) /* - * We output a cmn_err when quotachecking a quota file with more than - * this many fsbs. - */ -#define XFS_QM_BIG_QCHECK_NBLKS 500 - -/* * This defines the unit of allocation of dquots. * Currently, it is just one file system block, and a 4K blk contains 30 * (136 * 30 = 4080) dquots. It's probably not worth trying to make diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/quota/xfs_quota_priv.h index b7ddd04aae32..a8b85e2be9d5 100644 --- a/fs/xfs/quota/xfs_quota_priv.h +++ b/fs/xfs/quota/xfs_quota_priv.h @@ -75,7 +75,6 @@ static inline int XQMISLCKD(struct xfs_dqhash *h) #define xfs_qm_freelist_lock(qm) XQMLCK(&((qm)->qm_dqfreelist)) #define xfs_qm_freelist_unlock(qm) XQMUNLCK(&((qm)->qm_dqfreelist)) -#define XFS_QM_IS_FREELIST_LOCKED(qm) XQMISLCKD(&((qm)->qm_dqfreelist)) /* * Hash into a bucket in the dquot hash table, based on <mp, id>. @@ -170,6 +169,5 @@ for ((dqp) = (qlist)->qh_next; (dqp) != (xfs_dquot_t *)(qlist); \ #define DQFLAGTO_TYPESTR(d) (((d)->dq_flags & XFS_DQ_USER) ? "USR" : \ (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \ (((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???"))) -#define DQFLAGTO_DIRTYSTR(d) (XFS_DQ_IS_DIRTY(d) ? "DIRTY" : "NOTDIRTY") #endif /* __XFS_QUOTA_PRIV_H__ */ diff --git a/fs/xfs/support/ktrace.c b/fs/xfs/support/ktrace.c index addf5a7ea06c..5cf2e86caa71 100644 --- a/fs/xfs/support/ktrace.c +++ b/fs/xfs/support/ktrace.c @@ -75,7 +75,7 @@ ktrace_alloc(int nentries, unsigned int __nocast sleep) sleep); } else { ktep = (ktrace_entry_t*)kmem_zalloc((nentries * sizeof(*ktep)), - sleep); + sleep | KM_LARGE); } if (ktep == NULL) { diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index dc2361dd740a..9ece7f87ec5b 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -150,7 +150,7 @@ typedef struct xfs_agi { #define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)XFS_BUF_PTR(bp)) typedef struct xfs_agfl { - xfs_agblock_t agfl_bno[1]; /* actually XFS_AGFL_SIZE(mp) */ + __be32 agfl_bno[1]; /* actually XFS_AGFL_SIZE(mp) */ } xfs_agfl_t; /* diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index d2bbcd882a69..e80dda3437d1 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -1477,8 +1477,10 @@ xfs_alloc_ag_vextent_small( /* * Can't allocate from the freelist for some reason. */ - else + else { + fbno = NULLAGBLOCK; flen = 0; + } /* * Can't do the allocation, give up. */ @@ -2021,7 +2023,7 @@ xfs_alloc_get_freelist( /* * Get the block number and update the data structures. */ - bno = INT_GET(agfl->agfl_bno[be32_to_cpu(agf->agf_flfirst)], ARCH_CONVERT); + bno = be32_to_cpu(agfl->agfl_bno[be32_to_cpu(agf->agf_flfirst)]); be32_add(&agf->agf_flfirst, 1); xfs_trans_brelse(tp, agflbp); if (be32_to_cpu(agf->agf_flfirst) == XFS_AGFL_SIZE(mp)) @@ -2108,7 +2110,7 @@ xfs_alloc_put_freelist( { xfs_agf_t *agf; /* a.g. freespace structure */ xfs_agfl_t *agfl; /* a.g. free block array */ - xfs_agblock_t *blockp;/* pointer to array entry */ + __be32 *blockp;/* pointer to array entry */ int error; #ifdef XFS_ALLOC_TRACE static char fname[] = "xfs_alloc_put_freelist"; @@ -2132,7 +2134,7 @@ xfs_alloc_put_freelist( pag->pagf_flcount++; ASSERT(be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp)); blockp = &agfl->agfl_bno[be32_to_cpu(agf->agf_fllast)]; - INT_SET(*blockp, ARCH_CONVERT, bno); + *blockp = cpu_to_be32(bno); TRACE_MODAGF(NULL, agf, XFS_AGF_FLLAST | XFS_AGF_FLCOUNT); xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLLAST | XFS_AGF_FLCOUNT); xfs_trans_log_buf(tp, agflbp, diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index 7446556e8021..74cadf95d4e8 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -92,6 +92,7 @@ xfs_alloc_delrec( xfs_alloc_key_t *rkp; /* right block key pointer */ xfs_alloc_ptr_t *rpp; /* right block address pointer */ int rrecs=0; /* number of records in right block */ + int numrecs; xfs_alloc_rec_t *rrp; /* right block record pointer */ xfs_btree_cur_t *tcur; /* temporary btree cursor */ @@ -115,7 +116,8 @@ xfs_alloc_delrec( /* * Fail if we're off the end of the block. */ - if (ptr > be16_to_cpu(block->bb_numrecs)) { + numrecs = be16_to_cpu(block->bb_numrecs); + if (ptr > numrecs) { *stat = 0; return 0; } @@ -129,18 +131,18 @@ xfs_alloc_delrec( lkp = XFS_ALLOC_KEY_ADDR(block, 1, cur); lpp = XFS_ALLOC_PTR_ADDR(block, 1, cur); #ifdef DEBUG - for (i = ptr; i < be16_to_cpu(block->bb_numrecs); i++) { + for (i = ptr; i < numrecs; i++) { if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(lpp[i]), level))) return error; } #endif - if (ptr < be16_to_cpu(block->bb_numrecs)) { + if (ptr < numrecs) { memmove(&lkp[ptr - 1], &lkp[ptr], - (be16_to_cpu(block->bb_numrecs) - ptr) * sizeof(*lkp)); + (numrecs - ptr) * sizeof(*lkp)); memmove(&lpp[ptr - 1], &lpp[ptr], - (be16_to_cpu(block->bb_numrecs) - ptr) * sizeof(*lpp)); - xfs_alloc_log_ptrs(cur, bp, ptr, be16_to_cpu(block->bb_numrecs) - 1); - xfs_alloc_log_keys(cur, bp, ptr, be16_to_cpu(block->bb_numrecs) - 1); + (numrecs - ptr) * sizeof(*lpp)); + xfs_alloc_log_ptrs(cur, bp, ptr, numrecs - 1); + xfs_alloc_log_keys(cur, bp, ptr, numrecs - 1); } } /* @@ -149,10 +151,10 @@ xfs_alloc_delrec( */ else { lrp = XFS_ALLOC_REC_ADDR(block, 1, cur); - if (ptr < be16_to_cpu(block->bb_numrecs)) { + if (ptr < numrecs) { memmove(&lrp[ptr - 1], &lrp[ptr], - (be16_to_cpu(block->bb_numrecs) - ptr) * sizeof(*lrp)); - xfs_alloc_log_recs(cur, bp, ptr, be16_to_cpu(block->bb_numrecs) - 1); + (numrecs - ptr) * sizeof(*lrp)); + xfs_alloc_log_recs(cur, bp, ptr, numrecs - 1); } /* * If it's the first record in the block, we'll need a key @@ -167,7 +169,8 @@ xfs_alloc_delrec( /* * Decrement and log the number of entries in the block. */ - be16_add(&block->bb_numrecs, -1); + numrecs--; + block->bb_numrecs = cpu_to_be16(numrecs); xfs_alloc_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS); /* * See if the longest free extent in the allocation group was @@ -181,14 +184,14 @@ xfs_alloc_delrec( if (level == 0 && cur->bc_btnum == XFS_BTNUM_CNT && be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK && - ptr > be16_to_cpu(block->bb_numrecs)) { - ASSERT(ptr == be16_to_cpu(block->bb_numrecs) + 1); + ptr > numrecs) { + ASSERT(ptr == numrecs + 1); /* * There are still records in the block. Grab the size * from the last one. */ - if (be16_to_cpu(block->bb_numrecs)) { - rrp = XFS_ALLOC_REC_ADDR(block, be16_to_cpu(block->bb_numrecs), cur); + if (numrecs) { + rrp = XFS_ALLOC_REC_ADDR(block, numrecs, cur); agf->agf_longest = rrp->ar_blockcount; } /* @@ -211,7 +214,7 @@ xfs_alloc_delrec( * and it's NOT the leaf level, * then we can get rid of this level. */ - if (be16_to_cpu(block->bb_numrecs) == 1 && level > 0) { + if (numrecs == 1 && level > 0) { /* * lpp is still set to the first pointer in the block. * Make it the new root of the btree. @@ -267,7 +270,7 @@ xfs_alloc_delrec( * If the number of records remaining in the block is at least * the minimum, we're done. */ - if (be16_to_cpu(block->bb_numrecs) >= XFS_ALLOC_BLOCK_MINRECS(level, cur)) { + if (numrecs >= XFS_ALLOC_BLOCK_MINRECS(level, cur)) { if (level > 0 && (error = xfs_alloc_decrement(cur, level, &i))) return error; *stat = 1; @@ -419,19 +422,21 @@ xfs_alloc_delrec( * See if we can join with the left neighbor block. */ if (lbno != NULLAGBLOCK && - lrecs + be16_to_cpu(block->bb_numrecs) <= XFS_ALLOC_BLOCK_MAXRECS(level, cur)) { + lrecs + numrecs <= XFS_ALLOC_BLOCK_MAXRECS(level, cur)) { /* * Set "right" to be the starting block, * "left" to be the left neighbor. */ rbno = bno; right = block; + rrecs = be16_to_cpu(right->bb_numrecs); rbp = bp; if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, cur->bc_private.a.agno, lbno, 0, &lbp, XFS_ALLOC_BTREE_REF))) return error; left = XFS_BUF_TO_ALLOC_BLOCK(lbp); + lrecs = be16_to_cpu(left->bb_numrecs); if ((error = xfs_btree_check_sblock(cur, left, level, lbp))) return error; } @@ -439,20 +444,21 @@ xfs_alloc_delrec( * If that won't work, see if we can join with the right neighbor block. */ else if (rbno != NULLAGBLOCK && - rrecs + be16_to_cpu(block->bb_numrecs) <= - XFS_ALLOC_BLOCK_MAXRECS(level, cur)) { + rrecs + numrecs <= XFS_ALLOC_BLOCK_MAXRECS(level, cur)) { /* * Set "left" to be the starting block, * "right" to be the right neighbor. */ lbno = bno; left = block; + lrecs = be16_to_cpu(left->bb_numrecs); lbp = bp; if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, cur->bc_private.a.agno, rbno, 0, &rbp, XFS_ALLOC_BTREE_REF))) return error; right = XFS_BUF_TO_ALLOC_BLOCK(rbp); + rrecs = be16_to_cpu(right->bb_numrecs); if ((error = xfs_btree_check_sblock(cur, right, level, rbp))) return error; } @@ -474,34 +480,28 @@ xfs_alloc_delrec( /* * It's a non-leaf. Move keys and pointers. */ - lkp = XFS_ALLOC_KEY_ADDR(left, be16_to_cpu(left->bb_numrecs) + 1, cur); - lpp = XFS_ALLOC_PTR_ADDR(left, be16_to_cpu(left->bb_numrecs) + 1, cur); + lkp = XFS_ALLOC_KEY_ADDR(left, lrecs + 1, cur); + lpp = XFS_ALLOC_PTR_ADDR(left, lrecs + 1, cur); rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur); rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur); #ifdef DEBUG - for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) { + for (i = 0; i < rrecs; i++) { if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i]), level))) return error; } #endif - memcpy(lkp, rkp, be16_to_cpu(right->bb_numrecs) * sizeof(*lkp)); - memcpy(lpp, rpp, be16_to_cpu(right->bb_numrecs) * sizeof(*lpp)); - xfs_alloc_log_keys(cur, lbp, be16_to_cpu(left->bb_numrecs) + 1, - be16_to_cpu(left->bb_numrecs) + - be16_to_cpu(right->bb_numrecs)); - xfs_alloc_log_ptrs(cur, lbp, be16_to_cpu(left->bb_numrecs) + 1, - be16_to_cpu(left->bb_numrecs) + - be16_to_cpu(right->bb_numrecs)); + memcpy(lkp, rkp, rrecs * sizeof(*lkp)); + memcpy(lpp, rpp, rrecs * sizeof(*lpp)); + xfs_alloc_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs); + xfs_alloc_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs); } else { /* * It's a leaf. Move records. */ - lrp = XFS_ALLOC_REC_ADDR(left, be16_to_cpu(left->bb_numrecs) + 1, cur); + lrp = XFS_ALLOC_REC_ADDR(left, lrecs + 1, cur); rrp = XFS_ALLOC_REC_ADDR(right, 1, cur); - memcpy(lrp, rrp, be16_to_cpu(right->bb_numrecs) * sizeof(*lrp)); - xfs_alloc_log_recs(cur, lbp, be16_to_cpu(left->bb_numrecs) + 1, - be16_to_cpu(left->bb_numrecs) + - be16_to_cpu(right->bb_numrecs)); + memcpy(lrp, rrp, rrecs * sizeof(*lrp)); + xfs_alloc_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs); } /* * If we joined with the left neighbor, set the buffer in the @@ -509,7 +509,7 @@ xfs_alloc_delrec( */ if (bp != lbp) { xfs_btree_setbuf(cur, level, lbp); - cur->bc_ptrs[level] += be16_to_cpu(left->bb_numrecs); + cur->bc_ptrs[level] += lrecs; } /* * If we joined with the right neighbor and there's a level above @@ -521,7 +521,8 @@ xfs_alloc_delrec( /* * Fix up the number of records in the surviving block. */ - be16_add(&left->bb_numrecs, be16_to_cpu(right->bb_numrecs)); + lrecs += rrecs; + left->bb_numrecs = cpu_to_be16(lrecs); /* * Fix up the right block pointer in the surviving block, and log it. */ @@ -608,6 +609,7 @@ xfs_alloc_insrec( xfs_btree_cur_t *ncur; /* new cursor to be used at next lvl */ xfs_alloc_key_t nkey; /* new key value, from split */ xfs_alloc_rec_t nrec; /* new record value, for caller */ + int numrecs; int optr; /* old ptr value */ xfs_alloc_ptr_t *pp; /* pointer to btree addresses */ int ptr; /* index in btree block for this rec */ @@ -653,13 +655,14 @@ xfs_alloc_insrec( */ bp = cur->bc_bufs[level]; block = XFS_BUF_TO_ALLOC_BLOCK(bp); + numrecs = be16_to_cpu(block->bb_numrecs); #ifdef DEBUG if ((error = xfs_btree_check_sblock(cur, block, level, bp))) return error; /* * Check that the new entry is being inserted in the right place. */ - if (ptr <= be16_to_cpu(block->bb_numrecs)) { + if (ptr <= numrecs) { if (level == 0) { rp = XFS_ALLOC_REC_ADDR(block, ptr, cur); xfs_btree_check_rec(cur->bc_btnum, recp, rp); @@ -670,12 +673,12 @@ xfs_alloc_insrec( } #endif nbno = NULLAGBLOCK; - ncur = (xfs_btree_cur_t *)0; + ncur = NULL; /* * If the block is full, we can't insert the new entry until we * make the block un-full. */ - if (be16_to_cpu(block->bb_numrecs) == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) { + if (numrecs == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) { /* * First, try shifting an entry to the right neighbor. */ @@ -729,6 +732,7 @@ xfs_alloc_insrec( * At this point we know there's room for our new entry in the block * we're pointing at. */ + numrecs = be16_to_cpu(block->bb_numrecs); if (level > 0) { /* * It's a non-leaf entry. Make a hole for the new data @@ -737,15 +741,15 @@ xfs_alloc_insrec( kp = XFS_ALLOC_KEY_ADDR(block, 1, cur); pp = XFS_ALLOC_PTR_ADDR(block, 1, cur); #ifdef DEBUG - for (i = be16_to_cpu(block->bb_numrecs); i >= ptr; i--) { + for (i = numrecs; i >= ptr; i--) { if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(pp[i - 1]), level))) return error; } #endif memmove(&kp[ptr], &kp[ptr - 1], - (be16_to_cpu(block->bb_numrecs) - ptr + 1) * sizeof(*kp)); + (numrecs - ptr + 1) * sizeof(*kp)); memmove(&pp[ptr], &pp[ptr - 1], - (be16_to_cpu(block->bb_numrecs) - ptr + 1) * sizeof(*pp)); + (numrecs - ptr + 1) * sizeof(*pp)); #ifdef DEBUG if ((error = xfs_btree_check_sptr(cur, *bnop, level))) return error; @@ -755,11 +759,12 @@ xfs_alloc_insrec( */ kp[ptr - 1] = key; pp[ptr - 1] = cpu_to_be32(*bnop); - be16_add(&block->bb_numrecs, 1); - xfs_alloc_log_keys(cur, bp, ptr, be16_to_cpu(block->bb_numrecs)); - xfs_alloc_log_ptrs(cur, bp, ptr, be16_to_cpu(block->bb_numrecs)); + numrecs++; + block->bb_numrecs = cpu_to_be16(numrecs); + xfs_alloc_log_keys(cur, bp, ptr, numrecs); + xfs_alloc_log_ptrs(cur, bp, ptr, numrecs); #ifdef DEBUG - if (ptr < be16_to_cpu(block->bb_numrecs)) + if (ptr < numrecs) xfs_btree_check_key(cur->bc_btnum, kp + ptr - 1, kp + ptr); #endif @@ -769,16 +774,17 @@ xfs_alloc_insrec( */ rp = XFS_ALLOC_REC_ADDR(block, 1, cur); memmove(&rp[ptr], &rp[ptr - 1], - (be16_to_cpu(block->bb_numrecs) - ptr + 1) * sizeof(*rp)); + (numrecs - ptr + 1) * sizeof(*rp)); /* * Now stuff the new record in, bump numrecs * and log the new data. */ - rp[ptr - 1] = *recp; /* INT_: struct copy */ - be16_add(&block->bb_numrecs, 1); - xfs_alloc_log_recs(cur, bp, ptr, be16_to_cpu(block->bb_numrecs)); + rp[ptr - 1] = *recp; + numrecs++; + block->bb_numrecs = cpu_to_be16(numrecs); + xfs_alloc_log_recs(cur, bp, ptr, numrecs); #ifdef DEBUG - if (ptr < be16_to_cpu(block->bb_numrecs)) + if (ptr < numrecs) xfs_btree_check_rec(cur->bc_btnum, rp + ptr - 1, rp + ptr); #endif @@ -819,8 +825,8 @@ xfs_alloc_insrec( */ *bnop = nbno; if (nbno != NULLAGBLOCK) { - *recp = nrec; /* INT_: struct copy */ - *curp = ncur; /* INT_: struct copy */ + *recp = nrec; + *curp = ncur; } *stat = 1; return 0; @@ -981,7 +987,7 @@ xfs_alloc_lookup( */ bp = cur->bc_bufs[level]; if (bp && XFS_BUF_ADDR(bp) != d) - bp = (xfs_buf_t *)0; + bp = NULL; if (!bp) { /* * Need to get a new buffer. Read it, then @@ -1229,7 +1235,7 @@ xfs_alloc_lshift( if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*rpp), level))) return error; #endif - *lpp = *rpp; /* INT_: copy */ + *lpp = *rpp; xfs_alloc_log_ptrs(cur, lbp, nrec, nrec); xfs_btree_check_key(cur->bc_btnum, lkp - 1, lkp); } @@ -1406,8 +1412,8 @@ xfs_alloc_newroot( kp = XFS_ALLOC_KEY_ADDR(new, 1, cur); if (be16_to_cpu(left->bb_level) > 0) { - kp[0] = *XFS_ALLOC_KEY_ADDR(left, 1, cur); /* INT_: structure copy */ - kp[1] = *XFS_ALLOC_KEY_ADDR(right, 1, cur);/* INT_: structure copy */ + kp[0] = *XFS_ALLOC_KEY_ADDR(left, 1, cur); + kp[1] = *XFS_ALLOC_KEY_ADDR(right, 1, cur); } else { xfs_alloc_rec_t *rp; /* btree record pointer */ @@ -1527,8 +1533,8 @@ xfs_alloc_rshift( if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*lpp), level))) return error; #endif - *rkp = *lkp; /* INT_: copy */ - *rpp = *lpp; /* INT_: copy */ + *rkp = *lkp; + *rpp = *lpp; xfs_alloc_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1); xfs_alloc_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1); xfs_btree_check_key(cur->bc_btnum, rkp, rkp + 1); @@ -2044,7 +2050,7 @@ xfs_alloc_insert( nbno = NULLAGBLOCK; nrec.ar_startblock = cpu_to_be32(cur->bc_rec.a.ar_startblock); nrec.ar_blockcount = cpu_to_be32(cur->bc_rec.a.ar_blockcount); - ncur = (xfs_btree_cur_t *)0; + ncur = NULL; pcur = cur; /* * Loop going up the tree, starting at the leaf level. @@ -2076,7 +2082,7 @@ xfs_alloc_insert( */ if (ncur) { pcur = ncur; - ncur = (xfs_btree_cur_t *)0; + ncur = NULL; } } while (nbno != NULLAGBLOCK); *stat = i; diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 1a2101043275..9ada7bdbae52 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -91,7 +91,6 @@ STATIC int xfs_attr_refillstate(xfs_da_state_t *state); /* * Routines to manipulate out-of-line attribute values. */ -STATIC int xfs_attr_rmtval_get(xfs_da_args_t *args); STATIC int xfs_attr_rmtval_set(xfs_da_args_t *args); STATIC int xfs_attr_rmtval_remove(xfs_da_args_t *args); @@ -180,7 +179,7 @@ xfs_attr_get(bhv_desc_t *bdp, const char *name, char *value, int *valuelenp, return(error); } -STATIC int +int xfs_attr_set_int(xfs_inode_t *dp, const char *name, int namelen, char *value, int valuelen, int flags) { @@ -440,7 +439,7 @@ xfs_attr_set(bhv_desc_t *bdp, const char *name, char *value, int valuelen, int f * Generic handler routine to remove a name from an attribute list. * Transitions attribute list from Btree to shortform as necessary. */ -STATIC int +int xfs_attr_remove_int(xfs_inode_t *dp, const char *name, int namelen, int flags) { xfs_da_args_t args; @@ -591,6 +590,110 @@ xfs_attr_remove(bhv_desc_t *bdp, const char *name, int flags, struct cred *cred) return xfs_attr_remove_int(dp, name, namelen, flags); } +int /* error */ +xfs_attr_list_int(xfs_attr_list_context_t *context) +{ + int error; + xfs_inode_t *dp = context->dp; + + /* + * Decide on what work routines to call based on the inode size. + */ + if (XFS_IFORK_Q(dp) == 0 || + (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && + dp->i_d.di_anextents == 0)) { + error = 0; + } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { + error = xfs_attr_shortform_list(context); + } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { + error = xfs_attr_leaf_list(context); + } else { + error = xfs_attr_node_list(context); + } + return error; +} + +#define ATTR_ENTBASESIZE /* minimum bytes used by an attr */ \ + (((struct attrlist_ent *) 0)->a_name - (char *) 0) +#define ATTR_ENTSIZE(namelen) /* actual bytes used by an attr */ \ + ((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(u_int32_t)-1) \ + & ~(sizeof(u_int32_t)-1)) + +/* + * Format an attribute and copy it out to the user's buffer. + * Take care to check values and protect against them changing later, + * we may be reading them directly out of a user buffer. + */ +/*ARGSUSED*/ +STATIC int +xfs_attr_put_listent(xfs_attr_list_context_t *context, attrnames_t *namesp, + char *name, int namelen, + int valuelen, char *value) +{ + attrlist_ent_t *aep; + int arraytop; + + ASSERT(!(context->flags & ATTR_KERNOVAL)); + ASSERT(context->count >= 0); + ASSERT(context->count < (ATTR_MAX_VALUELEN/8)); + ASSERT(context->firstu >= sizeof(*context->alist)); + ASSERT(context->firstu <= context->bufsize); + + arraytop = sizeof(*context->alist) + + context->count * sizeof(context->alist->al_offset[0]); + context->firstu -= ATTR_ENTSIZE(namelen); + if (context->firstu < arraytop) { + xfs_attr_trace_l_c("buffer full", context); + context->alist->al_more = 1; + context->seen_enough = 1; + return 1; + } + + aep = (attrlist_ent_t *)&(((char *)context->alist)[ context->firstu ]); + aep->a_valuelen = valuelen; + memcpy(aep->a_name, name, namelen); + aep->a_name[ namelen ] = 0; + context->alist->al_offset[ context->count++ ] = context->firstu; + context->alist->al_count = context->count; + xfs_attr_trace_l_c("add", context); + return 0; +} + +STATIC int +xfs_attr_kern_list(xfs_attr_list_context_t *context, attrnames_t *namesp, + char *name, int namelen, + int valuelen, char *value) +{ + char *offset; + int arraytop; + + ASSERT(context->count >= 0); + + arraytop = context->count + namesp->attr_namelen + namelen + 1; + if (arraytop > context->firstu) { + context->count = -1; /* insufficient space */ + return 1; + } + offset = (char *)context->alist + context->count; + strncpy(offset, namesp->attr_name, namesp->attr_namelen); + offset += namesp->attr_namelen; + strncpy(offset, name, namelen); /* real name */ + offset += namelen; + *offset = '\0'; + context->count += namesp->attr_namelen + namelen + 1; + return 0; +} + +/*ARGSUSED*/ +STATIC int +xfs_attr_kern_list_sizes(xfs_attr_list_context_t *context, attrnames_t *namesp, + char *name, int namelen, + int valuelen, char *value) +{ + context->count += namesp->attr_namelen + namelen + 1; + return 0; +} + /* * Generate a list of extended attribute names and optionally * also value lengths. Positive return value follows the XFS @@ -615,13 +718,13 @@ xfs_attr_list(bhv_desc_t *bdp, char *buffer, int bufsize, int flags, return(XFS_ERROR(EINVAL)); if ((cursor->initted == 0) && (cursor->hashval || cursor->blkno || cursor->offset)) - return(XFS_ERROR(EINVAL)); + return XFS_ERROR(EINVAL); /* * Check for a properly aligned buffer. */ if (((long)buffer) & (sizeof(int)-1)) - return(XFS_ERROR(EFAULT)); + return XFS_ERROR(EFAULT); if (flags & ATTR_KERNOVAL) bufsize = 0; @@ -634,53 +737,47 @@ xfs_attr_list(bhv_desc_t *bdp, char *buffer, int bufsize, int flags, context.dupcnt = 0; context.resynch = 1; context.flags = flags; - if (!(flags & ATTR_KERNAMELS)) { + context.seen_enough = 0; + context.alist = (attrlist_t *)buffer; + context.put_value = 0; + + if (flags & ATTR_KERNAMELS) { + context.bufsize = bufsize; + context.firstu = context.bufsize; + if (flags & ATTR_KERNOVAL) + context.put_listent = xfs_attr_kern_list_sizes; + else + context.put_listent = xfs_attr_kern_list; + } else { context.bufsize = (bufsize & ~(sizeof(int)-1)); /* align */ context.firstu = context.bufsize; - context.alist = (attrlist_t *)buffer; context.alist->al_count = 0; context.alist->al_more = 0; context.alist->al_offset[0] = context.bufsize; - } - else { - context.bufsize = bufsize; - context.firstu = context.bufsize; - context.alist = (attrlist_t *)buffer; + context.put_listent = xfs_attr_put_listent; } if (XFS_FORCED_SHUTDOWN(dp->i_mount)) - return (EIO); + return EIO; xfs_ilock(dp, XFS_ILOCK_SHARED); - /* - * Decide on what work routines to call based on the inode size. - */ xfs_attr_trace_l_c("syscall start", &context); - if (XFS_IFORK_Q(dp) == 0 || - (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && - dp->i_d.di_anextents == 0)) { - error = 0; - } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { - error = xfs_attr_shortform_list(&context); - } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { - error = xfs_attr_leaf_list(&context); - } else { - error = xfs_attr_node_list(&context); - } + + error = xfs_attr_list_int(&context); + xfs_iunlock(dp, XFS_ILOCK_SHARED); xfs_attr_trace_l_c("syscall end", &context); - if (!(context.flags & (ATTR_KERNOVAL|ATTR_KERNAMELS))) { - ASSERT(error >= 0); - } - else { /* must return negated buffer size or the error */ + if (context.flags & (ATTR_KERNOVAL|ATTR_KERNAMELS)) { + /* must return negated buffer size or the error */ if (context.count < 0) error = XFS_ERROR(ERANGE); else error = -context.count; - } + } else + ASSERT(error >= 0); - return(error); + return error; } int /* error */ @@ -1122,19 +1219,19 @@ xfs_attr_leaf_list(xfs_attr_list_context_t *context) context->cursor->blkno = 0; error = xfs_da_read_buf(NULL, context->dp, 0, -1, &bp, XFS_ATTR_FORK); if (error) - return(error); + return XFS_ERROR(error); ASSERT(bp != NULL); leaf = bp->data; if (unlikely(be16_to_cpu(leaf->hdr.info.magic) != XFS_ATTR_LEAF_MAGIC)) { XFS_CORRUPTION_ERROR("xfs_attr_leaf_list", XFS_ERRLEVEL_LOW, context->dp->i_mount, leaf); xfs_da_brelse(NULL, bp); - return(XFS_ERROR(EFSCORRUPTED)); + return XFS_ERROR(EFSCORRUPTED); } - (void)xfs_attr_leaf_list_int(bp, context); + error = xfs_attr_leaf_list_int(bp, context); xfs_da_brelse(NULL, bp); - return(0); + return XFS_ERROR(error); } @@ -1858,8 +1955,12 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) return(XFS_ERROR(EFSCORRUPTED)); } error = xfs_attr_leaf_list_int(bp, context); - if (error || !leaf->hdr.info.forw) - break; /* not really an error, buffer full or EOF */ + if (error) { + xfs_da_brelse(NULL, bp); + return error; + } + if (context->seen_enough || leaf->hdr.info.forw == 0) + break; cursor->blkno = be32_to_cpu(leaf->hdr.info.forw); xfs_da_brelse(NULL, bp); error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1, @@ -1886,7 +1987,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) * Read the value associated with an attribute from the out-of-line buffer * that we stored it in. */ -STATIC int +int xfs_attr_rmtval_get(xfs_da_args_t *args) { xfs_bmbt_irec_t map[ATTR_RMTVALUE_MAPSIZE]; diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h index 981633f6c077..783977d3ea71 100644 --- a/fs/xfs/xfs_attr.h +++ b/fs/xfs/xfs_attr.h @@ -37,6 +37,7 @@ struct cred; struct bhv_vnode; +struct xfs_attr_list_context; typedef int (*attrset_t)(struct bhv_vnode *, char *, void *, size_t, int); typedef int (*attrget_t)(struct bhv_vnode *, char *, void *, size_t, int); @@ -160,13 +161,16 @@ struct xfs_da_args; */ int xfs_attr_get(bhv_desc_t *, const char *, char *, int *, int, struct cred *); int xfs_attr_set(bhv_desc_t *, const char *, char *, int, int, struct cred *); +int xfs_attr_set_int(struct xfs_inode *, const char *, int, char *, int, int); int xfs_attr_remove(bhv_desc_t *, const char *, int, struct cred *); -int xfs_attr_list(bhv_desc_t *, char *, int, int, - struct attrlist_cursor_kern *, struct cred *); +int xfs_attr_remove_int(struct xfs_inode *, const char *, int, int); +int xfs_attr_list(bhv_desc_t *, char *, int, int, struct attrlist_cursor_kern *, struct cred *); +int xfs_attr_list_int(struct xfs_attr_list_context *); int xfs_attr_inactive(struct xfs_inode *dp); int xfs_attr_shortform_getvalue(struct xfs_da_args *); int xfs_attr_fetch(struct xfs_inode *, const char *, int, char *, int *, int, struct cred *); +int xfs_attr_rmtval_get(struct xfs_da_args *args); #endif /* __XFS_ATTR_H__ */ diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 9455051f0120..9719bbef122c 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -89,9 +89,46 @@ STATIC void xfs_attr_leaf_moveents(xfs_attr_leafblock_t *src_leaf, int dst_start, int move_count, xfs_mount_t *mp); STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index); -STATIC int xfs_attr_put_listent(xfs_attr_list_context_t *context, - attrnames_t *, char *name, int namelen, - int valuelen); + +/*======================================================================== + * Namespace helper routines + *========================================================================*/ + +STATIC inline attrnames_t * +xfs_attr_flags_namesp(int flags) +{ + return ((flags & XFS_ATTR_SECURE) ? &attr_secure: + ((flags & XFS_ATTR_ROOT) ? &attr_trusted : &attr_user)); +} + +/* + * If namespace bits don't match return 0. + * If all match then return 1. + */ +STATIC inline int +xfs_attr_namesp_match(int arg_flags, int ondisk_flags) +{ + return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags); +} + +/* + * If namespace bits don't match and we don't have an override for it + * then return 0. + * If all match or are overridable then return 1. + */ +STATIC inline int +xfs_attr_namesp_match_overrides(int arg_flags, int ondisk_flags) +{ + if (((arg_flags & ATTR_SECURE) == 0) != + ((ondisk_flags & XFS_ATTR_SECURE) == 0) && + !(arg_flags & ATTR_KERNORMALS)) + return 0; + if (((arg_flags & ATTR_ROOT) == 0) != + ((ondisk_flags & XFS_ATTR_ROOT) == 0) && + !(arg_flags & ATTR_KERNROOTLS)) + return 0; + return 1; +} /*======================================================================== @@ -228,11 +265,7 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff) continue; if (memcmp(args->name, sfe->nameval, args->namelen) != 0) continue; - if (((args->flags & ATTR_SECURE) != 0) != - ((sfe->flags & XFS_ATTR_SECURE) != 0)) - continue; - if (((args->flags & ATTR_ROOT) != 0) != - ((sfe->flags & XFS_ATTR_ROOT) != 0)) + if (!xfs_attr_namesp_match(args->flags, sfe->flags)) continue; ASSERT(0); #endif @@ -246,8 +279,7 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff) sfe->namelen = args->namelen; sfe->valuelen = args->valuelen; - sfe->flags = (args->flags & ATTR_SECURE) ? XFS_ATTR_SECURE : - ((args->flags & ATTR_ROOT) ? XFS_ATTR_ROOT : 0); + sfe->flags = XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags); memcpy(sfe->nameval, args->name, args->namelen); memcpy(&sfe->nameval[args->namelen], args->value, args->valuelen); sf->hdr.count++; @@ -282,11 +314,7 @@ xfs_attr_shortform_remove(xfs_da_args_t *args) continue; if (memcmp(sfe->nameval, args->name, args->namelen) != 0) continue; - if (((args->flags & ATTR_SECURE) != 0) != - ((sfe->flags & XFS_ATTR_SECURE) != 0)) - continue; - if (((args->flags & ATTR_ROOT) != 0) != - ((sfe->flags & XFS_ATTR_ROOT) != 0)) + if (!xfs_attr_namesp_match(args->flags, sfe->flags)) continue; break; } @@ -363,11 +391,7 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args) continue; if (memcmp(args->name, sfe->nameval, args->namelen) != 0) continue; - if (((args->flags & ATTR_SECURE) != 0) != - ((sfe->flags & XFS_ATTR_SECURE) != 0)) - continue; - if (((args->flags & ATTR_ROOT) != 0) != - ((sfe->flags & XFS_ATTR_ROOT) != 0)) + if (!xfs_attr_namesp_match(args->flags, sfe->flags)) continue; return(XFS_ERROR(EEXIST)); } @@ -394,11 +418,7 @@ xfs_attr_shortform_getvalue(xfs_da_args_t *args) continue; if (memcmp(args->name, sfe->nameval, args->namelen) != 0) continue; - if (((args->flags & ATTR_SECURE) != 0) != - ((sfe->flags & XFS_ATTR_SECURE) != 0)) - continue; - if (((args->flags & ATTR_ROOT) != 0) != - ((sfe->flags & XFS_ATTR_ROOT) != 0)) + if (!xfs_attr_namesp_match(args->flags, sfe->flags)) continue; if (args->flags & ATTR_KERNOVAL) { args->valuelen = sfe->valuelen; @@ -485,8 +505,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args) nargs.valuelen = sfe->valuelen; nargs.hashval = xfs_da_hashname((char *)sfe->nameval, sfe->namelen); - nargs.flags = (sfe->flags & XFS_ATTR_SECURE) ? ATTR_SECURE : - ((sfe->flags & XFS_ATTR_ROOT) ? ATTR_ROOT : 0); + nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags); error = xfs_attr_leaf_lookup_int(bp, &nargs); /* set a->index */ ASSERT(error == ENOATTR); error = xfs_attr_leaf_add(bp, &nargs); @@ -520,6 +539,10 @@ xfs_attr_shortform_compare(const void *a, const void *b) } } + +#define XFS_ISRESET_CURSOR(cursor) \ + (!((cursor)->initted) && !((cursor)->hashval) && \ + !((cursor)->blkno) && !((cursor)->offset)) /* * Copy out entries of shortform attribute lists for attr_list(). * Shortform attribute lists are not stored in hashval sorted order. @@ -537,6 +560,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) xfs_attr_sf_entry_t *sfe; xfs_inode_t *dp; int sbsize, nsbuf, count, i; + int error; ASSERT(context != NULL); dp = context->dp; @@ -552,46 +576,51 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) xfs_attr_trace_l_c("sf start", context); /* - * If the buffer is large enough, do not bother with sorting. + * If the buffer is large enough and the cursor is at the start, + * do not bother with sorting since we will return everything in + * one buffer and another call using the cursor won't need to be + * made. * Note the generous fudge factor of 16 overhead bytes per entry. + * If bufsize is zero then put_listent must be a search function + * and can just scan through what we have. */ - if ((dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize) { + if (context->bufsize == 0 || + (XFS_ISRESET_CURSOR(cursor) && + (dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) { for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { attrnames_t *namesp; - if (((context->flags & ATTR_SECURE) != 0) != - ((sfe->flags & XFS_ATTR_SECURE) != 0) && - !(context->flags & ATTR_KERNORMALS)) { - sfe = XFS_ATTR_SF_NEXTENTRY(sfe); - continue; - } - if (((context->flags & ATTR_ROOT) != 0) != - ((sfe->flags & XFS_ATTR_ROOT) != 0) && - !(context->flags & ATTR_KERNROOTLS)) { + if (!xfs_attr_namesp_match_overrides(context->flags, sfe->flags)) { sfe = XFS_ATTR_SF_NEXTENTRY(sfe); continue; } - namesp = (sfe->flags & XFS_ATTR_SECURE) ? &attr_secure: - ((sfe->flags & XFS_ATTR_ROOT) ? &attr_trusted : - &attr_user); - if (context->flags & ATTR_KERNOVAL) { - ASSERT(context->flags & ATTR_KERNAMELS); - context->count += namesp->attr_namelen + - sfe->namelen + 1; - } - else { - if (xfs_attr_put_listent(context, namesp, - (char *)sfe->nameval, - (int)sfe->namelen, - (int)sfe->valuelen)) - break; - } + namesp = xfs_attr_flags_namesp(sfe->flags); + error = context->put_listent(context, + namesp, + (char *)sfe->nameval, + (int)sfe->namelen, + (int)sfe->valuelen, + (char*)&sfe->nameval[sfe->namelen]); + + /* + * Either search callback finished early or + * didn't fit it all in the buffer after all. + */ + if (context->seen_enough) + break; + + if (error) + return error; sfe = XFS_ATTR_SF_NEXTENTRY(sfe); } xfs_attr_trace_l_c("sf big-gulp", context); return(0); } + /* do no more for a search callback */ + if (context->bufsize == 0) + return 0; + /* * It didn't all fit, so we have to sort everything on hashval. */ @@ -614,15 +643,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) kmem_free(sbuf, sbsize); return XFS_ERROR(EFSCORRUPTED); } - if (((context->flags & ATTR_SECURE) != 0) != - ((sfe->flags & XFS_ATTR_SECURE) != 0) && - !(context->flags & ATTR_KERNORMALS)) { - sfe = XFS_ATTR_SF_NEXTENTRY(sfe); - continue; - } - if (((context->flags & ATTR_ROOT) != 0) != - ((sfe->flags & XFS_ATTR_ROOT) != 0) && - !(context->flags & ATTR_KERNROOTLS)) { + if (!xfs_attr_namesp_match_overrides(context->flags, sfe->flags)) { sfe = XFS_ATTR_SF_NEXTENTRY(sfe); continue; } @@ -671,24 +692,22 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) for ( ; i < nsbuf; i++, sbp++) { attrnames_t *namesp; - namesp = (sbp->flags & XFS_ATTR_SECURE) ? &attr_secure : - ((sbp->flags & XFS_ATTR_ROOT) ? &attr_trusted : - &attr_user); + namesp = xfs_attr_flags_namesp(sbp->flags); if (cursor->hashval != sbp->hash) { cursor->hashval = sbp->hash; cursor->offset = 0; } - if (context->flags & ATTR_KERNOVAL) { - ASSERT(context->flags & ATTR_KERNAMELS); - context->count += namesp->attr_namelen + - sbp->namelen + 1; - } else { - if (xfs_attr_put_listent(context, namesp, - sbp->name, sbp->namelen, - sbp->valuelen)) - break; - } + error = context->put_listent(context, + namesp, + sbp->name, + sbp->namelen, + sbp->valuelen, + &sbp->name[sbp->namelen]); + if (error) + return error; + if (context->seen_enough) + break; cursor->offset++; } @@ -810,8 +829,7 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff) nargs.value = (char *)&name_loc->nameval[nargs.namelen]; nargs.valuelen = be16_to_cpu(name_loc->valuelen); nargs.hashval = be32_to_cpu(entry->hashval); - nargs.flags = (entry->flags & XFS_ATTR_SECURE) ? ATTR_SECURE : - ((entry->flags & XFS_ATTR_ROOT) ? ATTR_ROOT : 0); + nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(entry->flags); xfs_attr_shortform_add(&nargs, forkoff); } error = 0; @@ -1098,8 +1116,7 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex) be16_to_cpu(map->size)); entry->hashval = cpu_to_be32(args->hashval); entry->flags = tmp ? XFS_ATTR_LOCAL : 0; - entry->flags |= (args->flags & ATTR_SECURE) ? XFS_ATTR_SECURE : - ((args->flags & ATTR_ROOT) ? XFS_ATTR_ROOT : 0); + entry->flags |= XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags); if (args->rename) { entry->flags |= XFS_ATTR_INCOMPLETE; if ((args->blkno2 == args->blkno) && @@ -1926,7 +1943,7 @@ xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args) else break; } - ASSERT((probe >= 0) && + ASSERT((probe >= 0) && (!leaf->hdr.count || (probe < be16_to_cpu(leaf->hdr.count)))); ASSERT((span <= 4) || (be32_to_cpu(entry->hashval) == hashval)); @@ -1971,14 +1988,9 @@ xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args) name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, probe); if (name_loc->namelen != args->namelen) continue; - if (memcmp(args->name, (char *)name_loc->nameval, - args->namelen) != 0) + if (memcmp(args->name, (char *)name_loc->nameval, args->namelen) != 0) continue; - if (((args->flags & ATTR_SECURE) != 0) != - ((entry->flags & XFS_ATTR_SECURE) != 0)) - continue; - if (((args->flags & ATTR_ROOT) != 0) != - ((entry->flags & XFS_ATTR_ROOT) != 0)) + if (!xfs_attr_namesp_match(args->flags, entry->flags)) continue; args->index = probe; return(XFS_ERROR(EEXIST)); @@ -1989,11 +2001,7 @@ xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args) if (memcmp(args->name, (char *)name_rmt->name, args->namelen) != 0) continue; - if (((args->flags & ATTR_SECURE) != 0) != - ((entry->flags & XFS_ATTR_SECURE) != 0)) - continue; - if (((args->flags & ATTR_ROOT) != 0) != - ((entry->flags & XFS_ATTR_ROOT) != 0)) + if (!xfs_attr_namesp_match(args->flags, entry->flags)) continue; args->index = probe; args->rmtblkno = be32_to_cpu(name_rmt->valueblk); @@ -2312,8 +2320,6 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context) attrlist_cursor_kern_t *cursor; xfs_attr_leafblock_t *leaf; xfs_attr_leaf_entry_t *entry; - xfs_attr_leaf_name_local_t *name_loc; - xfs_attr_leaf_name_remote_t *name_rmt; int retval, i; ASSERT(bp != NULL); @@ -2355,9 +2361,8 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context) * We have found our place, start copying out the new attributes. */ retval = 0; - for ( ; (i < be16_to_cpu(leaf->hdr.count)) - && (retval == 0); entry++, i++) { - attrnames_t *namesp; + for ( ; (i < be16_to_cpu(leaf->hdr.count)); entry++, i++) { + attrnames_t *namesp; if (be32_to_cpu(entry->hashval) != cursor->hashval) { cursor->hashval = be32_to_cpu(entry->hashval); @@ -2366,115 +2371,69 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context) if (entry->flags & XFS_ATTR_INCOMPLETE) continue; /* skip incomplete entries */ - if (((context->flags & ATTR_SECURE) != 0) != - ((entry->flags & XFS_ATTR_SECURE) != 0) && - !(context->flags & ATTR_KERNORMALS)) - continue; /* skip non-matching entries */ - if (((context->flags & ATTR_ROOT) != 0) != - ((entry->flags & XFS_ATTR_ROOT) != 0) && - !(context->flags & ATTR_KERNROOTLS)) - continue; /* skip non-matching entries */ - - namesp = (entry->flags & XFS_ATTR_SECURE) ? &attr_secure : - ((entry->flags & XFS_ATTR_ROOT) ? &attr_trusted : - &attr_user); + if (!xfs_attr_namesp_match_overrides(context->flags, entry->flags)) + continue; + + namesp = xfs_attr_flags_namesp(entry->flags); if (entry->flags & XFS_ATTR_LOCAL) { - name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, i); - if (context->flags & ATTR_KERNOVAL) { - ASSERT(context->flags & ATTR_KERNAMELS); - context->count += namesp->attr_namelen + - (int)name_loc->namelen + 1; - } else { - retval = xfs_attr_put_listent(context, namesp, - (char *)name_loc->nameval, - (int)name_loc->namelen, - be16_to_cpu(name_loc->valuelen)); - } + xfs_attr_leaf_name_local_t *name_loc = + XFS_ATTR_LEAF_NAME_LOCAL(leaf, i); + + retval = context->put_listent(context, + namesp, + (char *)name_loc->nameval, + (int)name_loc->namelen, + be16_to_cpu(name_loc->valuelen), + (char *)&name_loc->nameval[name_loc->namelen]); + if (retval) + return retval; } else { - name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, i); - if (context->flags & ATTR_KERNOVAL) { - ASSERT(context->flags & ATTR_KERNAMELS); - context->count += namesp->attr_namelen + - (int)name_rmt->namelen + 1; - } else { - retval = xfs_attr_put_listent(context, namesp, - (char *)name_rmt->name, - (int)name_rmt->namelen, - be32_to_cpu(name_rmt->valuelen)); + xfs_attr_leaf_name_remote_t *name_rmt = + XFS_ATTR_LEAF_NAME_REMOTE(leaf, i); + + int valuelen = be32_to_cpu(name_rmt->valuelen); + + if (context->put_value) { + xfs_da_args_t args; + + memset((char *)&args, 0, sizeof(args)); + args.dp = context->dp; + args.whichfork = XFS_ATTR_FORK; + args.valuelen = valuelen; + args.value = kmem_alloc(valuelen, KM_SLEEP); + args.rmtblkno = be32_to_cpu(name_rmt->valueblk); + args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen); + retval = xfs_attr_rmtval_get(&args); + if (retval) + return retval; + retval = context->put_listent(context, + namesp, + (char *)name_rmt->name, + (int)name_rmt->namelen, + valuelen, + (char*)args.value); + kmem_free(args.value, valuelen); } + else { + retval = context->put_listent(context, + namesp, + (char *)name_rmt->name, + (int)name_rmt->namelen, + valuelen, + NULL); + } + if (retval) + return retval; } - if (retval == 0) { - cursor->offset++; - } + if (context->seen_enough) + break; + cursor->offset++; } xfs_attr_trace_l_cl("blk end", context, leaf); return(retval); } -#define ATTR_ENTBASESIZE /* minimum bytes used by an attr */ \ - (((struct attrlist_ent *) 0)->a_name - (char *) 0) -#define ATTR_ENTSIZE(namelen) /* actual bytes used by an attr */ \ - ((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(u_int32_t)-1) \ - & ~(sizeof(u_int32_t)-1)) - -/* - * Format an attribute and copy it out to the user's buffer. - * Take care to check values and protect against them changing later, - * we may be reading them directly out of a user buffer. - */ -/*ARGSUSED*/ -STATIC int -xfs_attr_put_listent(xfs_attr_list_context_t *context, - attrnames_t *namesp, char *name, int namelen, int valuelen) -{ - attrlist_ent_t *aep; - int arraytop; - - ASSERT(!(context->flags & ATTR_KERNOVAL)); - if (context->flags & ATTR_KERNAMELS) { - char *offset; - - ASSERT(context->count >= 0); - - arraytop = context->count + namesp->attr_namelen + namelen + 1; - if (arraytop > context->firstu) { - context->count = -1; /* insufficient space */ - return(1); - } - offset = (char *)context->alist + context->count; - strncpy(offset, namesp->attr_name, namesp->attr_namelen); - offset += namesp->attr_namelen; - strncpy(offset, name, namelen); /* real name */ - offset += namelen; - *offset = '\0'; - context->count += namesp->attr_namelen + namelen + 1; - return(0); - } - - ASSERT(context->count >= 0); - ASSERT(context->count < (ATTR_MAX_VALUELEN/8)); - ASSERT(context->firstu >= sizeof(*context->alist)); - ASSERT(context->firstu <= context->bufsize); - - arraytop = sizeof(*context->alist) + - context->count * sizeof(context->alist->al_offset[0]); - context->firstu -= ATTR_ENTSIZE(namelen); - if (context->firstu < arraytop) { - xfs_attr_trace_l_c("buffer full", context); - context->alist->al_more = 1; - return(1); - } - - aep = (attrlist_ent_t *)&(((char *)context->alist)[ context->firstu ]); - aep->a_valuelen = valuelen; - memcpy(aep->a_name, name, namelen); - aep->a_name[ namelen ] = 0; - context->alist->al_offset[ context->count++ ] = context->firstu; - context->alist->al_count = context->count; - xfs_attr_trace_l_c("add", context); - return(0); -} /*======================================================================== * Manage the INCOMPLETE flag in a leaf entry diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h index 51c3ee156b2f..040f732ce1e2 100644 --- a/fs/xfs/xfs_attr_leaf.h +++ b/fs/xfs/xfs_attr_leaf.h @@ -130,6 +130,19 @@ typedef struct xfs_attr_leafblock { #define XFS_ATTR_INCOMPLETE (1 << XFS_ATTR_INCOMPLETE_BIT) /* + * Conversion macros for converting namespace bits from argument flags + * to ondisk flags. + */ +#define XFS_ATTR_NSP_ARGS_MASK (ATTR_ROOT | ATTR_SECURE) +#define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | XFS_ATTR_SECURE) +#define XFS_ATTR_NSP_ONDISK(flags) ((flags) & XFS_ATTR_NSP_ONDISK_MASK) +#define XFS_ATTR_NSP_ARGS(flags) ((flags) & XFS_ATTR_NSP_ARGS_MASK) +#define XFS_ATTR_NSP_ARGS_TO_ONDISK(x) (((x) & ATTR_ROOT ? XFS_ATTR_ROOT : 0) |\ + ((x) & ATTR_SECURE ? XFS_ATTR_SECURE : 0)) +#define XFS_ATTR_NSP_ONDISK_TO_ARGS(x) (((x) & XFS_ATTR_ROOT ? ATTR_ROOT : 0) |\ + ((x) & XFS_ATTR_SECURE ? ATTR_SECURE : 0)) + +/* * Alignment for namelist and valuelist entries (since they are mixed * there can be only one alignment value) */ @@ -196,16 +209,26 @@ static inline int xfs_attr_leaf_entsize_local_max(int bsize) * Structure used to pass context around among the routines. *========================================================================*/ + +struct xfs_attr_list_context; + +typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, struct attrnames *, + char *, int, int, char *); + typedef struct xfs_attr_list_context { - struct xfs_inode *dp; /* inode */ - struct attrlist_cursor_kern *cursor;/* position in list */ - struct attrlist *alist; /* output buffer */ - int count; /* num used entries */ - int dupcnt; /* count dup hashvals seen */ - int bufsize;/* total buffer size */ - int firstu; /* first used byte in buffer */ - int flags; /* from VOP call */ - int resynch;/* T/F: resynch with cursor */ + struct xfs_inode *dp; /* inode */ + struct attrlist_cursor_kern *cursor; /* position in list */ + struct attrlist *alist; /* output buffer */ + int seen_enough; /* T/F: seen enough of list? */ + int count; /* num used entries */ + int dupcnt; /* count dup hashvals seen */ + int bufsize; /* total buffer size */ + int firstu; /* first used byte in buffer */ + int flags; /* from VOP call */ + int resynch; /* T/F: resynch with cursor */ + int put_value; /* T/F: need value for listent */ + put_listent_func_t put_listent; /* list output fmt function */ + int index; /* index into output buffer */ } xfs_attr_list_context_t; /* diff --git a/fs/xfs/xfs_behavior.c b/fs/xfs/xfs_behavior.c index f4fe3715a803..0dc17219d412 100644 --- a/fs/xfs/xfs_behavior.c +++ b/fs/xfs/xfs_behavior.c @@ -110,26 +110,6 @@ bhv_remove_not_first(bhv_head_t *bhp, bhv_desc_t *bdp) } /* - * Look for a specific ops vector on the specified behavior chain. - * Return the associated behavior descriptor. Or NULL, if not found. - */ -bhv_desc_t * -bhv_lookup(bhv_head_t *bhp, void *ops) -{ - bhv_desc_t *curdesc; - - for (curdesc = bhp->bh_first; - curdesc != NULL; - curdesc = curdesc->bd_next) { - - if (curdesc->bd_ops == ops) - return curdesc; - } - - return NULL; -} - -/* * Looks for the first behavior within a specified range of positions. * Return the associated behavior descriptor. Or NULL, if none found. */ diff --git a/fs/xfs/xfs_behavior.h b/fs/xfs/xfs_behavior.h index 6e6e56fb352d..e7ca1fed955a 100644 --- a/fs/xfs/xfs_behavior.h +++ b/fs/xfs/xfs_behavior.h @@ -176,12 +176,10 @@ extern void bhv_insert_initial(bhv_head_t *, bhv_desc_t *); * Behavior module prototypes. */ extern void bhv_remove_not_first(bhv_head_t *bhp, bhv_desc_t *bdp); -extern bhv_desc_t * bhv_lookup(bhv_head_t *bhp, void *ops); extern bhv_desc_t * bhv_lookup_range(bhv_head_t *bhp, int low, int high); extern bhv_desc_t * bhv_base(bhv_head_t *bhp); /* No bhv locking on Linux */ -#define bhv_lookup_unlocked bhv_lookup #define bhv_base_unlocked bhv_base #endif /* __XFS_BEHAVIOR_H__ */ diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index bf46fae303af..5b050c06795f 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2999,7 +2999,7 @@ xfs_bmap_btree_to_extents( int error; /* error return value */ xfs_ifork_t *ifp; /* inode fork data */ xfs_mount_t *mp; /* mount point structure */ - xfs_bmbt_ptr_t *pp; /* ptr to block address */ + __be64 *pp; /* ptr to block address */ xfs_bmbt_block_t *rblock;/* root btree block */ ifp = XFS_IFORK_PTR(ip, whichfork); @@ -3011,12 +3011,12 @@ xfs_bmap_btree_to_extents( ASSERT(XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes) == 1); mp = ip->i_mount; pp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, ifp->if_broot_bytes); + cbno = be64_to_cpu(*pp); *logflagsp = 0; #ifdef DEBUG - if ((error = xfs_btree_check_lptr(cur, INT_GET(*pp, ARCH_CONVERT), 1))) + if ((error = xfs_btree_check_lptr(cur, cbno, 1))) return error; #endif - cbno = INT_GET(*pp, ARCH_CONVERT); if ((error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF))) return error; @@ -3512,9 +3512,9 @@ xfs_bmap_extents_to_btree( */ kp = XFS_BMAP_KEY_IADDR(block, 1, cur); arp = XFS_BMAP_REC_IADDR(ablock, 1, cur); - INT_SET(kp->br_startoff, ARCH_CONVERT, xfs_bmbt_disk_get_startoff(arp)); + kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp)); pp = XFS_BMAP_PTR_IADDR(block, 1, cur); - INT_SET(*pp, ARCH_CONVERT, args.fsbno); + *pp = cpu_to_be64(args.fsbno); /* * Do all this logging at the end so that * the root is at the right level. @@ -3705,7 +3705,7 @@ STATIC xfs_bmbt_rec_t * /* pointer to found extent entry */ xfs_bmap_search_extents( xfs_inode_t *ip, /* incore inode pointer */ xfs_fileoff_t bno, /* block number searched for */ - int whichfork, /* data or attr fork */ + int fork, /* data or attr fork */ int *eofp, /* out: end of file found */ xfs_extnum_t *lastxp, /* out: last extent index */ xfs_bmbt_irec_t *gotp, /* out: extent entry found */ @@ -3713,25 +3713,28 @@ xfs_bmap_search_extents( { xfs_ifork_t *ifp; /* inode fork pointer */ xfs_bmbt_rec_t *ep; /* extent record pointer */ - int rt; /* realtime flag */ XFS_STATS_INC(xs_look_exlist); - ifp = XFS_IFORK_PTR(ip, whichfork); + ifp = XFS_IFORK_PTR(ip, fork); ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp); - rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip); - if (unlikely(!rt && !gotp->br_startblock && (*lastxp != NULLEXTNUM))) { - cmn_err(CE_PANIC,"Access to block zero: fs: <%s> inode: %lld " - "start_block : %llx start_off : %llx blkcnt : %llx " - "extent-state : %x \n", - (ip->i_mount)->m_fsname, (long long)ip->i_ino, + if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) && + !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) { + xfs_cmn_err(XFS_PTAG_FSBLOCK_ZERO, CE_ALERT, ip->i_mount, + "Access to block zero in inode %llu " + "start_block: %llx start_off: %llx " + "blkcnt: %llx extent-state: %x lastx: %x\n", + (unsigned long long)ip->i_ino, (unsigned long long)gotp->br_startblock, (unsigned long long)gotp->br_startoff, (unsigned long long)gotp->br_blockcount, - gotp->br_state); - } - return ep; + gotp->br_state, *lastxp); + *lastxp = NULLEXTNUM; + *eofp = 1; + return NULL; + } + return ep; } @@ -4494,7 +4497,7 @@ xfs_bmap_read_extents( xfs_ifork_t *ifp; /* fork structure */ int level; /* btree level, for checking */ xfs_mount_t *mp; /* file system mount structure */ - xfs_bmbt_ptr_t *pp; /* pointer to block address */ + __be64 *pp; /* pointer to block address */ /* REFERENCED */ xfs_extnum_t room; /* number of entries there's room for */ @@ -4510,10 +4513,10 @@ xfs_bmap_read_extents( level = be16_to_cpu(block->bb_level); ASSERT(level > 0); pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes); - ASSERT(INT_GET(*pp, ARCH_CONVERT) != NULLDFSBNO); - ASSERT(XFS_FSB_TO_AGNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agcount); - ASSERT(XFS_FSB_TO_AGBNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agblocks); - bno = INT_GET(*pp, ARCH_CONVERT); + bno = be64_to_cpu(*pp); + ASSERT(bno != NULLDFSBNO); + ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); + ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); /* * Go down the tree until leaf level is reached, following the first * pointer (leftmost) at each level. @@ -4530,10 +4533,8 @@ xfs_bmap_read_extents( break; pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt, block, 1, mp->m_bmap_dmxr[1]); - XFS_WANT_CORRUPTED_GOTO( - XFS_FSB_SANITY_CHECK(mp, INT_GET(*pp, ARCH_CONVERT)), - error0); - bno = INT_GET(*pp, ARCH_CONVERT); + bno = be64_to_cpu(*pp); + XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0); xfs_trans_brelse(tp, bp); } /* @@ -6141,7 +6142,7 @@ xfs_check_block( short sz) { int i, j, dmxr; - xfs_bmbt_ptr_t *pp, *thispa; /* pointer to block address */ + __be64 *pp, *thispa; /* pointer to block address */ xfs_bmbt_key_t *prevp, *keyp; ASSERT(be16_to_cpu(block->bb_level) > 0); @@ -6179,11 +6180,10 @@ xfs_check_block( thispa = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt, block, j, dmxr); } - if (INT_GET(*thispa, ARCH_CONVERT) == - INT_GET(*pp, ARCH_CONVERT)) { + if (*thispa == *pp) { cmn_err(CE_WARN, "%s: thispa(%d) == pp(%d) %Ld", __FUNCTION__, j, i, - INT_GET(*thispa, ARCH_CONVERT)); + (unsigned long long)be64_to_cpu(*thispa)); panic("%s: ptrs are equal in node\n", __FUNCTION__); } @@ -6210,7 +6210,7 @@ xfs_bmap_check_leaf_extents( xfs_ifork_t *ifp; /* fork structure */ int level; /* btree level, for checking */ xfs_mount_t *mp; /* file system mount structure */ - xfs_bmbt_ptr_t *pp; /* pointer to block address */ + __be64 *pp; /* pointer to block address */ xfs_bmbt_rec_t *ep; /* pointer to current extent */ xfs_bmbt_rec_t *lastp; /* pointer to previous extent */ xfs_bmbt_rec_t *nextp; /* pointer to next extent */ @@ -6231,10 +6231,12 @@ xfs_bmap_check_leaf_extents( ASSERT(level > 0); xfs_check_block(block, mp, 1, ifp->if_broot_bytes); pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes); - ASSERT(INT_GET(*pp, ARCH_CONVERT) != NULLDFSBNO); - ASSERT(XFS_FSB_TO_AGNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agcount); - ASSERT(XFS_FSB_TO_AGBNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agblocks); - bno = INT_GET(*pp, ARCH_CONVERT); + bno = be64_to_cpu(*pp); + + ASSERT(bno != NULLDFSBNO); + ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); + ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); + /* * Go down the tree until leaf level is reached, following the first * pointer (leftmost) at each level. @@ -6265,8 +6267,8 @@ xfs_bmap_check_leaf_extents( xfs_check_block(block, mp, 0, 0); pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt, block, 1, mp->m_bmap_dmxr[1]); - XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, INT_GET(*pp, ARCH_CONVERT)), error0); - bno = INT_GET(*pp, ARCH_CONVERT); + bno = be64_to_cpu(*pp); + XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0); if (bp_release) { bp_release = 0; xfs_trans_brelse(NULL, bp); @@ -6372,7 +6374,7 @@ xfs_bmap_count_blocks( xfs_ifork_t *ifp; /* fork structure */ int level; /* btree level, for checking */ xfs_mount_t *mp; /* file system mount structure */ - xfs_bmbt_ptr_t *pp; /* pointer to block address */ + __be64 *pp; /* pointer to block address */ bno = NULLFSBLOCK; mp = ip->i_mount; @@ -6395,10 +6397,10 @@ xfs_bmap_count_blocks( level = be16_to_cpu(block->bb_level); ASSERT(level > 0); pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes); - ASSERT(INT_GET(*pp, ARCH_CONVERT) != NULLDFSBNO); - ASSERT(XFS_FSB_TO_AGNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agcount); - ASSERT(XFS_FSB_TO_AGBNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agblocks); - bno = INT_GET(*pp, ARCH_CONVERT); + bno = be64_to_cpu(*pp); + ASSERT(bno != NULLDFSBNO); + ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); + ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) { XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW, @@ -6425,7 +6427,7 @@ xfs_bmap_count_tree( int error; xfs_buf_t *bp, *nbp; int level = levelin; - xfs_bmbt_ptr_t *pp; + __be64 *pp; xfs_fsblock_t bno = blockno; xfs_fsblock_t nextbno; xfs_bmbt_block_t *block, *nextblock; @@ -6452,7 +6454,7 @@ xfs_bmap_count_tree( /* Dive to the next level */ pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt, block, 1, mp->m_bmap_dmxr[1]); - bno = INT_GET(*pp, ARCH_CONVERT); + bno = be64_to_cpu(*pp); if (unlikely((error = xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) { xfs_trans_brelse(tp, bp); diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index 18fb7385d719..a7b835bf870a 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -58,7 +58,7 @@ STATIC void xfs_bmbt_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int); STATIC int xfs_bmbt_lshift(xfs_btree_cur_t *, int, int *); STATIC int xfs_bmbt_rshift(xfs_btree_cur_t *, int, int *); STATIC int xfs_bmbt_split(xfs_btree_cur_t *, int, xfs_fsblock_t *, - xfs_bmbt_key_t *, xfs_btree_cur_t **, int *); + __uint64_t *, xfs_btree_cur_t **, int *); STATIC int xfs_bmbt_updkey(xfs_btree_cur_t *, xfs_bmbt_key_t *, int); @@ -192,16 +192,11 @@ xfs_bmbt_trace_argifk( xfs_btree_cur_t *cur, int i, xfs_fsblock_t f, - xfs_bmbt_key_t *k, + xfs_dfiloff_t o, int line) { - xfs_dfsbno_t d; - xfs_dfiloff_t o; - - d = (xfs_dfsbno_t)f; - o = INT_GET(k->br_startoff, ARCH_CONVERT); xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFK, line, - i, d >> 32, (int)d, o >> 32, + i, (xfs_dfsbno_t)f >> 32, (int)f, o >> 32, (int)o, 0, 0, 0, 0, 0, 0); } @@ -248,7 +243,7 @@ xfs_bmbt_trace_argik( { xfs_dfiloff_t o; - o = INT_GET(k->br_startoff, ARCH_CONVERT); + o = be64_to_cpu(k->br_startoff); xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFK, line, i, o >> 32, (int)o, 0, 0, 0, 0, 0, @@ -286,8 +281,8 @@ xfs_bmbt_trace_cursor( xfs_bmbt_trace_argfffi(fname, c, o, b, i, j, __LINE__) #define XFS_BMBT_TRACE_ARGI(c,i) \ xfs_bmbt_trace_argi(fname, c, i, __LINE__) -#define XFS_BMBT_TRACE_ARGIFK(c,i,f,k) \ - xfs_bmbt_trace_argifk(fname, c, i, f, k, __LINE__) +#define XFS_BMBT_TRACE_ARGIFK(c,i,f,s) \ + xfs_bmbt_trace_argifk(fname, c, i, f, s, __LINE__) #define XFS_BMBT_TRACE_ARGIFR(c,i,f,r) \ xfs_bmbt_trace_argifr(fname, c, i, f, r, __LINE__) #define XFS_BMBT_TRACE_ARGIK(c,i,k) \ @@ -299,7 +294,7 @@ xfs_bmbt_trace_cursor( #define XFS_BMBT_TRACE_ARGBII(c,b,i,j) #define XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j) #define XFS_BMBT_TRACE_ARGI(c,i) -#define XFS_BMBT_TRACE_ARGIFK(c,i,f,k) +#define XFS_BMBT_TRACE_ARGIFK(c,i,f,s) #define XFS_BMBT_TRACE_ARGIFR(c,i,f,r) #define XFS_BMBT_TRACE_ARGIK(c,i,k) #define XFS_BMBT_TRACE_CURSOR(c,s) @@ -357,7 +352,7 @@ xfs_bmbt_delrec( XFS_BMBT_TRACE_CURSOR(cur, ENTRY); XFS_BMBT_TRACE_ARGI(cur, level); ptr = cur->bc_ptrs[level]; - tcur = (xfs_btree_cur_t *)0; + tcur = NULL; if (ptr == 0) { XFS_BMBT_TRACE_CURSOR(cur, EXIT); *stat = 0; @@ -382,7 +377,7 @@ xfs_bmbt_delrec( pp = XFS_BMAP_PTR_IADDR(block, 1, cur); #ifdef DEBUG for (i = ptr; i < numrecs; i++) { - if ((error = xfs_btree_check_lptr(cur, INT_GET(pp[i], ARCH_CONVERT), level))) { + if ((error = xfs_btree_check_lptr_disk(cur, pp[i], level))) { XFS_BMBT_TRACE_CURSOR(cur, ERROR); goto error0; } @@ -404,7 +399,8 @@ xfs_bmbt_delrec( xfs_bmbt_log_recs(cur, bp, ptr, numrecs - 1); } if (ptr == 1) { - INT_SET(key.br_startoff, ARCH_CONVERT, xfs_bmbt_disk_get_startoff(rp)); + key.br_startoff = + cpu_to_be64(xfs_bmbt_disk_get_startoff(rp)); kp = &key; } } @@ -621,7 +617,7 @@ xfs_bmbt_delrec( rpp = XFS_BMAP_PTR_IADDR(right, 1, cur); #ifdef DEBUG for (i = 0; i < numrrecs; i++) { - if ((error = xfs_btree_check_lptr(cur, INT_GET(rpp[i], ARCH_CONVERT), level))) { + if ((error = xfs_btree_check_lptr_disk(cur, rpp[i], level))) { XFS_BMBT_TRACE_CURSOR(cur, ERROR); goto error0; } @@ -748,7 +744,7 @@ xfs_bmbt_insrec( int logflags; /* inode logging flags */ xfs_fsblock_t nbno; /* new block number */ struct xfs_btree_cur *ncur; /* new btree cursor */ - xfs_bmbt_key_t nkey; /* new btree key value */ + __uint64_t startoff; /* new btree key value */ xfs_bmbt_rec_t nrec; /* new record count */ int optr; /* old key/record index */ xfs_bmbt_ptr_t *pp; /* pointer to bmap block addr */ @@ -759,9 +755,8 @@ xfs_bmbt_insrec( ASSERT(level < cur->bc_nlevels); XFS_BMBT_TRACE_CURSOR(cur, ENTRY); XFS_BMBT_TRACE_ARGIFR(cur, level, *bnop, recp); - ncur = (xfs_btree_cur_t *)0; - INT_SET(key.br_startoff, ARCH_CONVERT, - xfs_bmbt_disk_get_startoff(recp)); + ncur = NULL; + key.br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(recp)); optr = ptr = cur->bc_ptrs[level]; if (ptr == 0) { XFS_BMBT_TRACE_CURSOR(cur, EXIT); @@ -820,7 +815,7 @@ xfs_bmbt_insrec( optr = ptr = cur->bc_ptrs[level]; } else { if ((error = xfs_bmbt_split(cur, level, - &nbno, &nkey, &ncur, + &nbno, &startoff, &ncur, &i))) { XFS_BMBT_TRACE_CURSOR(cur, ERROR); @@ -840,7 +835,7 @@ xfs_bmbt_insrec( #endif ptr = cur->bc_ptrs[level]; xfs_bmbt_disk_set_allf(&nrec, - nkey.br_startoff, 0, 0, + startoff, 0, 0, XFS_EXT_NORM); } else { XFS_BMBT_TRACE_CURSOR(cur, @@ -858,7 +853,7 @@ xfs_bmbt_insrec( pp = XFS_BMAP_PTR_IADDR(block, 1, cur); #ifdef DEBUG for (i = numrecs; i >= ptr; i--) { - if ((error = xfs_btree_check_lptr(cur, INT_GET(pp[i - 1], ARCH_CONVERT), + if ((error = xfs_btree_check_lptr_disk(cur, pp[i - 1], level))) { XFS_BMBT_TRACE_CURSOR(cur, ERROR); return error; @@ -870,14 +865,13 @@ xfs_bmbt_insrec( memmove(&pp[ptr], &pp[ptr - 1], /* INT_: direct copy */ (numrecs - ptr + 1) * sizeof(*pp)); #ifdef DEBUG - if ((error = xfs_btree_check_lptr(cur, (xfs_bmbt_ptr_t)*bnop, - level))) { + if ((error = xfs_btree_check_lptr(cur, *bnop, level))) { XFS_BMBT_TRACE_CURSOR(cur, ERROR); return error; } #endif kp[ptr - 1] = key; - INT_SET(pp[ptr - 1], ARCH_CONVERT, *bnop); + pp[ptr - 1] = cpu_to_be64(*bnop); numrecs++; block->bb_numrecs = cpu_to_be16(numrecs); xfs_bmbt_log_keys(cur, bp, ptr, numrecs); @@ -988,7 +982,7 @@ xfs_bmbt_killroot( cpp = XFS_BMAP_PTR_IADDR(cblock, 1, cur); #ifdef DEBUG for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) { - if ((error = xfs_btree_check_lptr(cur, INT_GET(cpp[i], ARCH_CONVERT), level - 1))) { + if ((error = xfs_btree_check_lptr_disk(cur, cpp[i], level - 1))) { XFS_BMBT_TRACE_CURSOR(cur, ERROR); return error; } @@ -1132,7 +1126,7 @@ xfs_bmbt_lookup( d = XFS_FSB_TO_DADDR(mp, fsbno); bp = cur->bc_bufs[level]; if (bp && XFS_BUF_ADDR(bp) != d) - bp = (xfs_buf_t *)0; + bp = NULL; if (!bp) { if ((error = xfs_btree_read_bufl(mp, tp, fsbno, 0, &bp, XFS_BMAP_BTREE_REF))) { @@ -1170,7 +1164,7 @@ xfs_bmbt_lookup( keyno = (low + high) >> 1; if (level > 0) { kkp = kkbase + keyno - 1; - startoff = INT_GET(kkp->br_startoff, ARCH_CONVERT); + startoff = be64_to_cpu(kkp->br_startoff); } else { krp = krbase + keyno - 1; startoff = xfs_bmbt_disk_get_startoff(krp); @@ -1189,13 +1183,13 @@ xfs_bmbt_lookup( if (diff > 0 && --keyno < 1) keyno = 1; pp = XFS_BMAP_PTR_IADDR(block, keyno, cur); + fsbno = be64_to_cpu(*pp); #ifdef DEBUG - if ((error = xfs_btree_check_lptr(cur, INT_GET(*pp, ARCH_CONVERT), level))) { + if ((error = xfs_btree_check_lptr(cur, fsbno, level))) { XFS_BMBT_TRACE_CURSOR(cur, ERROR); return error; } #endif - fsbno = INT_GET(*pp, ARCH_CONVERT); cur->bc_ptrs[level] = keyno; } } @@ -1313,7 +1307,7 @@ xfs_bmbt_lshift( lpp = XFS_BMAP_PTR_IADDR(left, lrecs, cur); rpp = XFS_BMAP_PTR_IADDR(right, 1, cur); #ifdef DEBUG - if ((error = xfs_btree_check_lptr(cur, INT_GET(*rpp, ARCH_CONVERT), level))) { + if ((error = xfs_btree_check_lptr_disk(cur, *rpp, level))) { XFS_BMBT_TRACE_CURSOR(cur, ERROR); return error; } @@ -1340,7 +1334,7 @@ xfs_bmbt_lshift( if (level > 0) { #ifdef DEBUG for (i = 0; i < rrecs; i++) { - if ((error = xfs_btree_check_lptr(cur, INT_GET(rpp[i + 1], ARCH_CONVERT), + if ((error = xfs_btree_check_lptr_disk(cur, rpp[i + 1], level))) { XFS_BMBT_TRACE_CURSOR(cur, ERROR); return error; @@ -1354,8 +1348,7 @@ xfs_bmbt_lshift( } else { memmove(rrp, rrp + 1, rrecs * sizeof(*rrp)); xfs_bmbt_log_recs(cur, rbp, 1, rrecs); - INT_SET(key.br_startoff, ARCH_CONVERT, - xfs_bmbt_disk_get_startoff(rrp)); + key.br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(rrp)); rkp = &key; } if ((error = xfs_bmbt_updkey(cur, rkp, level + 1))) { @@ -1445,7 +1438,7 @@ xfs_bmbt_rshift( rpp = XFS_BMAP_PTR_IADDR(right, 1, cur); #ifdef DEBUG for (i = be16_to_cpu(right->bb_numrecs) - 1; i >= 0; i--) { - if ((error = xfs_btree_check_lptr(cur, INT_GET(rpp[i], ARCH_CONVERT), level))) { + if ((error = xfs_btree_check_lptr_disk(cur, rpp[i], level))) { XFS_BMBT_TRACE_CURSOR(cur, ERROR); return error; } @@ -1454,7 +1447,7 @@ xfs_bmbt_rshift( memmove(rkp + 1, rkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp)); memmove(rpp + 1, rpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp)); #ifdef DEBUG - if ((error = xfs_btree_check_lptr(cur, INT_GET(*lpp, ARCH_CONVERT), level))) { + if ((error = xfs_btree_check_lptr_disk(cur, *lpp, level))) { XFS_BMBT_TRACE_CURSOR(cur, ERROR); return error; } @@ -1469,8 +1462,7 @@ xfs_bmbt_rshift( memmove(rrp + 1, rrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp)); *rrp = *lrp; xfs_bmbt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1); - INT_SET(key.br_startoff, ARCH_CONVERT, - xfs_bmbt_disk_get_startoff(rrp)); + key.br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(rrp)); rkp = &key; } be16_add(&left->bb_numrecs, -1); @@ -1535,7 +1527,7 @@ xfs_bmbt_split( xfs_btree_cur_t *cur, int level, xfs_fsblock_t *bnop, - xfs_bmbt_key_t *keyp, + __uint64_t *startoff, xfs_btree_cur_t **curp, int *stat) /* success/failure */ { @@ -1560,7 +1552,7 @@ xfs_bmbt_split( xfs_bmbt_rec_t *rrp; /* right record pointer */ XFS_BMBT_TRACE_CURSOR(cur, ENTRY); - XFS_BMBT_TRACE_ARGIFK(cur, level, *bnop, keyp); + XFS_BMBT_TRACE_ARGIFK(cur, level, *bnop, *startoff); args.tp = cur->bc_tp; args.mp = cur->bc_mp; lbp = cur->bc_bufs[level]; @@ -1619,7 +1611,7 @@ xfs_bmbt_split( rpp = XFS_BMAP_PTR_IADDR(right, 1, cur); #ifdef DEBUG for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) { - if ((error = xfs_btree_check_lptr(cur, INT_GET(lpp[i], ARCH_CONVERT), level))) { + if ((error = xfs_btree_check_lptr_disk(cur, lpp[i], level))) { XFS_BMBT_TRACE_CURSOR(cur, ERROR); return error; } @@ -1629,13 +1621,13 @@ xfs_bmbt_split( memcpy(rpp, lpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp)); xfs_bmbt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs)); xfs_bmbt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs)); - keyp->br_startoff = INT_GET(rkp->br_startoff, ARCH_CONVERT); + *startoff = be64_to_cpu(rkp->br_startoff); } else { lrp = XFS_BMAP_REC_IADDR(left, i, cur); rrp = XFS_BMAP_REC_IADDR(right, 1, cur); memcpy(rrp, lrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp)); xfs_bmbt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs)); - keyp->br_startoff = xfs_bmbt_disk_get_startoff(rrp); + *startoff = xfs_bmbt_disk_get_startoff(rrp); } be16_add(&left->bb_numrecs, -(be16_to_cpu(right->bb_numrecs))); right->bb_rightsib = left->bb_rightsib; @@ -1728,9 +1720,9 @@ xfs_bmdr_to_bmbt( { int dmxr; xfs_bmbt_key_t *fkp; - xfs_bmbt_ptr_t *fpp; + __be64 *fpp; xfs_bmbt_key_t *tkp; - xfs_bmbt_ptr_t *tpp; + __be64 *tpp; rblock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); rblock->bb_level = dblock->bb_level; @@ -1745,7 +1737,7 @@ xfs_bmdr_to_bmbt( tpp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, rblocklen); dmxr = be16_to_cpu(dblock->bb_numrecs); memcpy(tkp, fkp, sizeof(*fkp) * dmxr); - memcpy(tpp, fpp, sizeof(*fpp) * dmxr); /* INT_: direct copy */ + memcpy(tpp, fpp, sizeof(*fpp) * dmxr); } /* @@ -1805,7 +1797,7 @@ xfs_bmbt_decrement( tp = cur->bc_tp; mp = cur->bc_mp; for (block = xfs_bmbt_get_block(cur, lev, &bp); lev > level; ) { - fsbno = INT_GET(*XFS_BMAP_PTR_IADDR(block, cur->bc_ptrs[lev], cur), ARCH_CONVERT); + fsbno = be64_to_cpu(*XFS_BMAP_PTR_IADDR(block, cur->bc_ptrs[lev], cur)); if ((error = xfs_btree_read_bufl(mp, tp, fsbno, 0, &bp, XFS_BMAP_BTREE_REF))) { XFS_BMBT_TRACE_CURSOR(cur, ERROR); @@ -2135,7 +2127,7 @@ xfs_bmbt_increment( tp = cur->bc_tp; mp = cur->bc_mp; for (block = xfs_bmbt_get_block(cur, lev, &bp); lev > level; ) { - fsbno = INT_GET(*XFS_BMAP_PTR_IADDR(block, cur->bc_ptrs[lev], cur), ARCH_CONVERT); + fsbno = be64_to_cpu(*XFS_BMAP_PTR_IADDR(block, cur->bc_ptrs[lev], cur)); if ((error = xfs_btree_read_bufl(mp, tp, fsbno, 0, &bp, XFS_BMAP_BTREE_REF))) { XFS_BMBT_TRACE_CURSOR(cur, ERROR); @@ -2178,7 +2170,7 @@ xfs_bmbt_insert( level = 0; nbno = NULLFSBLOCK; xfs_bmbt_disk_set_all(&nrec, &cur->bc_rec.b); - ncur = (xfs_btree_cur_t *)0; + ncur = NULL; pcur = cur; do { if ((error = xfs_bmbt_insrec(pcur, level++, &nbno, &nrec, &ncur, @@ -2205,7 +2197,7 @@ xfs_bmbt_insert( } if (ncur) { pcur = ncur; - ncur = (xfs_btree_cur_t *)0; + ncur = NULL; } } while (nbno != NULLFSBLOCK); XFS_BMBT_TRACE_CURSOR(cur, EXIT); @@ -2356,12 +2348,12 @@ xfs_bmbt_newroot( args.firstblock = args.fsbno; if (args.fsbno == NULLFSBLOCK) { #ifdef DEBUG - if ((error = xfs_btree_check_lptr(cur, INT_GET(*pp, ARCH_CONVERT), level))) { + if ((error = xfs_btree_check_lptr_disk(cur, *pp, level))) { XFS_BMBT_TRACE_CURSOR(cur, ERROR); return error; } #endif - args.fsbno = INT_GET(*pp, ARCH_CONVERT); + args.fsbno = be64_to_cpu(*pp); args.type = XFS_ALLOCTYPE_START_BNO; } else args.type = XFS_ALLOCTYPE_NEAR_BNO; @@ -2393,7 +2385,7 @@ xfs_bmbt_newroot( cpp = XFS_BMAP_PTR_IADDR(cblock, 1, cur); #ifdef DEBUG for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) { - if ((error = xfs_btree_check_lptr(cur, INT_GET(pp[i], ARCH_CONVERT), level))) { + if ((error = xfs_btree_check_lptr_disk(cur, pp[i], level))) { XFS_BMBT_TRACE_CURSOR(cur, ERROR); return error; } @@ -2401,13 +2393,12 @@ xfs_bmbt_newroot( #endif memcpy(cpp, pp, be16_to_cpu(cblock->bb_numrecs) * sizeof(*pp)); #ifdef DEBUG - if ((error = xfs_btree_check_lptr(cur, (xfs_bmbt_ptr_t)args.fsbno, - level))) { + if ((error = xfs_btree_check_lptr(cur, args.fsbno, level))) { XFS_BMBT_TRACE_CURSOR(cur, ERROR); return error; } #endif - INT_SET(*pp, ARCH_CONVERT, args.fsbno); + *pp = cpu_to_be64(args.fsbno); xfs_iroot_realloc(cur->bc_private.b.ip, 1 - be16_to_cpu(cblock->bb_numrecs), cur->bc_private.b.whichfork); xfs_btree_setbuf(cur, level, bp); @@ -2681,9 +2672,9 @@ xfs_bmbt_to_bmdr( { int dmxr; xfs_bmbt_key_t *fkp; - xfs_bmbt_ptr_t *fpp; + __be64 *fpp; xfs_bmbt_key_t *tkp; - xfs_bmbt_ptr_t *tpp; + __be64 *tpp; ASSERT(be32_to_cpu(rblock->bb_magic) == XFS_BMAP_MAGIC); ASSERT(be64_to_cpu(rblock->bb_leftsib) == NULLDFSBNO); @@ -2698,7 +2689,7 @@ xfs_bmbt_to_bmdr( tpp = XFS_BTREE_PTR_ADDR(dblocklen, xfs_bmdr, dblock, 1, dmxr); dmxr = be16_to_cpu(dblock->bb_numrecs); memcpy(tkp, fkp, sizeof(*fkp) * dmxr); - memcpy(tpp, fpp, sizeof(*fpp) * dmxr); /* INT_: direct copy */ + memcpy(tpp, fpp, sizeof(*fpp) * dmxr); } /* @@ -2740,7 +2731,7 @@ xfs_bmbt_update( XFS_BMBT_TRACE_CURSOR(cur, EXIT); return 0; } - INT_SET(key.br_startoff, ARCH_CONVERT, off); + key.br_startoff = cpu_to_be64(off); if ((error = xfs_bmbt_updkey(cur, &key, 1))) { XFS_BMBT_TRACE_CURSOR(cur, ERROR); return error; diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h index 6478cfa0e539..49539de9525b 100644 --- a/fs/xfs/xfs_bmap_btree.h +++ b/fs/xfs/xfs_bmap_btree.h @@ -163,13 +163,14 @@ typedef struct xfs_bmbt_irec /* * Key structure for non-leaf levels of the tree. */ -typedef struct xfs_bmbt_key -{ - xfs_dfiloff_t br_startoff; /* starting file offset */ +typedef struct xfs_bmbt_key { + __be64 br_startoff; /* starting file offset */ } xfs_bmbt_key_t, xfs_bmdr_key_t; -typedef xfs_dfsbno_t xfs_bmbt_ptr_t, xfs_bmdr_ptr_t; /* btree pointer type */ - /* btree block header type */ +/* btree pointer type */ +typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t; + +/* btree block header type */ typedef struct xfs_btree_lblock xfs_bmbt_block_t; #define XFS_BUF_TO_BMBT_BLOCK(bp) ((xfs_bmbt_block_t *)XFS_BUF_PTR(bp)) diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index ee2255bd6562..aeb87ca69fcc 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -161,7 +161,7 @@ xfs_btree_check_key( k1 = ak1; k2 = ak2; - ASSERT(INT_GET(k1->br_startoff, ARCH_CONVERT) < INT_GET(k2->br_startoff, ARCH_CONVERT)); + ASSERT(be64_to_cpu(k1->br_startoff) < be64_to_cpu(k2->br_startoff)); break; } case XFS_BTNUM_INO: { @@ -170,7 +170,7 @@ xfs_btree_check_key( k1 = ak1; k2 = ak2; - ASSERT(INT_GET(k1->ir_startino, ARCH_CONVERT) < INT_GET(k2->ir_startino, ARCH_CONVERT)); + ASSERT(be32_to_cpu(k1->ir_startino) < be32_to_cpu(k2->ir_startino)); break; } default: @@ -285,8 +285,8 @@ xfs_btree_check_rec( r1 = ar1; r2 = ar2; - ASSERT(INT_GET(r1->ir_startino, ARCH_CONVERT) + XFS_INODES_PER_CHUNK <= - INT_GET(r2->ir_startino, ARCH_CONVERT)); + ASSERT(be32_to_cpu(r1->ir_startino) + XFS_INODES_PER_CHUNK <= + be32_to_cpu(r2->ir_startino)); break; } default: diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h index 44f1bd98064a..892b06c54263 100644 --- a/fs/xfs/xfs_btree.h +++ b/fs/xfs/xfs_btree.h @@ -145,7 +145,7 @@ typedef struct xfs_btree_cur union { xfs_alloc_rec_incore_t a; xfs_bmbt_irec_t b; - xfs_inobt_rec_t i; + xfs_inobt_rec_incore_t i; } bc_rec; /* current insert/search record value */ struct xfs_buf *bc_bufs[XFS_BTREE_MAXLEVELS]; /* buf ptr per level */ int bc_ptrs[XFS_BTREE_MAXLEVELS]; /* key/record # */ @@ -243,6 +243,9 @@ xfs_btree_check_lptr( xfs_dfsbno_t ptr, /* btree block disk address */ int level); /* btree block level */ +#define xfs_btree_check_lptr_disk(cur, ptr, level) \ + xfs_btree_check_lptr(cur, be64_to_cpu(ptr), level) + /* * Checking routine: check that short form block header is ok. */ diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index a4aa53974f76..7a55c248ea70 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -234,7 +234,6 @@ xfs_buf_item_format( ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || (bip->bli_flags & XFS_BLI_STALE)); bp = bip->bli_buf; - ASSERT(XFS_BUF_BP_ISMAPPED(bp)); vecp = log_vector; /* @@ -628,25 +627,6 @@ xfs_buf_item_committed( } /* - * This is called when the transaction holding the buffer is aborted. - * Just behave as if the transaction had been cancelled. If we're shutting down - * and have aborted this transaction, we'll trap this buffer when it tries to - * get written out. - */ -STATIC void -xfs_buf_item_abort( - xfs_buf_log_item_t *bip) -{ - xfs_buf_t *bp; - - bp = bip->bli_buf; - xfs_buftrace("XFS_ABORT", bp); - XFS_BUF_SUPER_STALE(bp); - xfs_buf_item_unlock(bip); - return; -} - -/* * This is called to asynchronously write the buffer associated with this * buf log item out to disk. The buffer will already have been locked by * a successful call to xfs_buf_item_trylock(). If the buffer still has @@ -693,7 +673,6 @@ STATIC struct xfs_item_ops xfs_buf_item_ops = { .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) xfs_buf_item_committed, .iop_push = (void(*)(xfs_log_item_t*))xfs_buf_item_push, - .iop_abort = (void(*)(xfs_log_item_t*))xfs_buf_item_abort, .iop_pushbuf = NULL, .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) xfs_buf_item_committing @@ -901,7 +880,6 @@ xfs_buf_item_relse( XFS_BUF_SET_FSPRIVATE(bp, bip->bli_item.li_bio_list); if ((XFS_BUF_FSPRIVATE(bp, void *) == NULL) && (XFS_BUF_IODONE_FUNC(bp) != NULL)) { - ASSERT((XFS_BUF_ISUNINITIAL(bp)) == 0); XFS_BUF_CLR_IODONE_FUNC(bp); } diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 32ab61d17ace..a68bc1f1a313 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -1054,7 +1054,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) xfs_da_node_entry_t *btree; xfs_dablk_t blkno; int probe, span, max, error, retval; - xfs_dahash_t hashval; + xfs_dahash_t hashval, btreehashval; xfs_da_args_t *args; args = state->args; @@ -1079,30 +1079,32 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) return(error); } curr = blk->bp->data; - ASSERT(be16_to_cpu(curr->magic) == XFS_DA_NODE_MAGIC || - be16_to_cpu(curr->magic) == XFS_DIR2_LEAFN_MAGIC || - be16_to_cpu(curr->magic) == XFS_ATTR_LEAF_MAGIC); + blk->magic = be16_to_cpu(curr->magic); + ASSERT(blk->magic == XFS_DA_NODE_MAGIC || + blk->magic == XFS_DIR2_LEAFN_MAGIC || + blk->magic == XFS_ATTR_LEAF_MAGIC); /* * Search an intermediate node for a match. */ - blk->magic = be16_to_cpu(curr->magic); if (blk->magic == XFS_DA_NODE_MAGIC) { node = blk->bp->data; - blk->hashval = be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1].hashval); + max = be16_to_cpu(node->hdr.count); + btreehashval = node->btree[max-1].hashval; + blk->hashval = be32_to_cpu(btreehashval); /* * Binary search. (note: small blocks will skip loop) */ - max = be16_to_cpu(node->hdr.count); probe = span = max / 2; hashval = args->hashval; for (btree = &node->btree[probe]; span > 4; btree = &node->btree[probe]) { span /= 2; - if (be32_to_cpu(btree->hashval) < hashval) + btreehashval = be32_to_cpu(btree->hashval); + if (btreehashval < hashval) probe += span; - else if (be32_to_cpu(btree->hashval) > hashval) + else if (btreehashval > hashval) probe -= span; else break; @@ -1133,10 +1135,10 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) blk->index = probe; blkno = be32_to_cpu(btree->before); } - } else if (be16_to_cpu(curr->magic) == XFS_ATTR_LEAF_MAGIC) { + } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) { blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL); break; - } else if (be16_to_cpu(curr->magic) == XFS_DIR2_LEAFN_MAGIC) { + } else if (blk->magic == XFS_DIR2_LEAFN_MAGIC) { blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, NULL); break; } @@ -1152,11 +1154,13 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) if (blk->magic == XFS_DIR2_LEAFN_MAGIC) { retval = xfs_dir2_leafn_lookup_int(blk->bp, args, &blk->index, state); - } - else if (blk->magic == XFS_ATTR_LEAF_MAGIC) { + } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) { retval = xfs_attr_leaf_lookup_int(blk->bp, args); blk->index = args->index; args->blkno = blk->blkno; + } else { + ASSERT(0); + return XFS_ERROR(EFSCORRUPTED); } if (((retval == ENOENT) || (retval == ENOATTR)) && (blk->hashval == args->hashval)) { @@ -1166,8 +1170,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) return(error); if (retval == 0) { continue; - } - else if (blk->magic == XFS_ATTR_LEAF_MAGIC) { + } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) { /* path_shift() gives ENOENT */ retval = XFS_ERROR(ENOATTR); } diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index bc43163456ef..0893e16b7d83 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -18,14 +18,6 @@ #ifndef __XFS_ERROR_H__ #define __XFS_ERROR_H__ -#define XFS_ERECOVER 1 /* Failure to recover log */ -#define XFS_ELOGSTAT 2 /* Failure to stat log in user space */ -#define XFS_ENOLOGSPACE 3 /* Reservation too large */ -#define XFS_ENOTSUP 4 /* Operation not supported */ -#define XFS_ENOLSN 5 /* Can't find the lsn you asked for */ -#define XFS_ENOTFOUND 6 -#define XFS_ENOTXFS 7 /* Not XFS filesystem */ - #ifdef DEBUG #define XFS_ERROR_NTRAP 10 extern int xfs_etrap[XFS_ERROR_NTRAP]; @@ -175,6 +167,7 @@ extern int xfs_errortag_clearall_umount(int64_t fsid, char *fsname, int loud); #define XFS_PTAG_SHUTDOWN_CORRUPT 0x00000010 #define XFS_PTAG_SHUTDOWN_IOERROR 0x00000020 #define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040 +#define XFS_PTAG_FSBLOCK_ZERO 0x00000080 struct xfs_mount; /* PRINTFLIKE4 */ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 6cf6d8769b97..6dba78199faf 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -33,9 +33,6 @@ kmem_zone_t *xfs_efi_zone; kmem_zone_t *xfs_efd_zone; STATIC void xfs_efi_item_unlock(xfs_efi_log_item_t *); -STATIC void xfs_efi_item_abort(xfs_efi_log_item_t *); -STATIC void xfs_efd_item_abort(xfs_efd_log_item_t *); - void xfs_efi_item_free(xfs_efi_log_item_t *efip) @@ -184,7 +181,7 @@ STATIC void xfs_efi_item_unlock(xfs_efi_log_item_t *efip) { if (efip->efi_item.li_flags & XFS_LI_ABORTED) - xfs_efi_item_abort(efip); + xfs_efi_item_free(efip); return; } @@ -202,18 +199,6 @@ xfs_efi_item_committed(xfs_efi_log_item_t *efip, xfs_lsn_t lsn) } /* - * This is called when the transaction logging the EFI is aborted. - * Free up the EFI and return. No need to clean up the slot for - * the item in the transaction. That was done by the unpin code - * which is called prior to this routine in the abort/fs-shutdown path. - */ -STATIC void -xfs_efi_item_abort(xfs_efi_log_item_t *efip) -{ - xfs_efi_item_free(efip); -} - -/* * There isn't much you can do to push on an efi item. It is simply * stuck waiting for all of its corresponding efd items to be * committed to disk. @@ -255,7 +240,6 @@ STATIC struct xfs_item_ops xfs_efi_item_ops = { .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) xfs_efi_item_committed, .iop_push = (void(*)(xfs_log_item_t*))xfs_efi_item_push, - .iop_abort = (void(*)(xfs_log_item_t*))xfs_efi_item_abort, .iop_pushbuf = NULL, .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) xfs_efi_item_committing @@ -386,33 +370,6 @@ xfs_efi_release(xfs_efi_log_item_t *efip, } } -/* - * This is called when the transaction that should be committing the - * EFD corresponding to the given EFI is aborted. The committed and - * canceled flags are used to coordinate the freeing of the EFI and - * the references by the transaction that committed it. - */ -STATIC void -xfs_efi_cancel( - xfs_efi_log_item_t *efip) -{ - xfs_mount_t *mp; - SPLDECL(s); - - mp = efip->efi_item.li_mountp; - AIL_LOCK(mp, s); - if (efip->efi_flags & XFS_EFI_COMMITTED) { - /* - * xfs_trans_delete_ail() drops the AIL lock. - */ - xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip, s); - xfs_efi_item_free(efip); - } else { - efip->efi_flags |= XFS_EFI_CANCELED; - AIL_UNLOCK(mp, s); - } -} - STATIC void xfs_efd_item_free(xfs_efd_log_item_t *efdp) { @@ -514,7 +471,7 @@ STATIC void xfs_efd_item_unlock(xfs_efd_log_item_t *efdp) { if (efdp->efd_item.li_flags & XFS_LI_ABORTED) - xfs_efd_item_abort(efdp); + xfs_efd_item_free(efdp); return; } @@ -541,27 +498,6 @@ xfs_efd_item_committed(xfs_efd_log_item_t *efdp, xfs_lsn_t lsn) } /* - * The transaction of which this EFD is a part has been aborted. - * Inform its companion EFI of this fact and then clean up after - * ourselves. No need to clean up the slot for the item in the - * transaction. That was done by the unpin code which is called - * prior to this routine in the abort/fs-shutdown path. - */ -STATIC void -xfs_efd_item_abort(xfs_efd_log_item_t *efdp) -{ - /* - * If we got a log I/O error, it's always the case that the LR with the - * EFI got unpinned and freed before the EFD got aborted. So don't - * reference the EFI at all in that case. - */ - if ((efdp->efd_item.li_flags & XFS_LI_ABORTED) == 0) - xfs_efi_cancel(efdp->efd_efip); - - xfs_efd_item_free(efdp); -} - -/* * There isn't much you can do to push on an efd item. It is simply * stuck waiting for the log to be flushed to disk. */ @@ -602,7 +538,6 @@ STATIC struct xfs_item_ops xfs_efd_item_ops = { .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) xfs_efd_item_committed, .iop_push = (void(*)(xfs_log_item_t*))xfs_efd_item_push, - .iop_abort = (void(*)(xfs_log_item_t*))xfs_efd_item_abort, .iop_pushbuf = NULL, .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) xfs_efd_item_committing diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h index 0ea45edaab03..2f049f63e85f 100644 --- a/fs/xfs/xfs_extfree_item.h +++ b/fs/xfs/xfs_extfree_item.h @@ -33,14 +33,16 @@ typedef struct xfs_extent { * conversion routine. */ +#ifndef HAVE_FORMAT32 typedef struct xfs_extent_32 { - xfs_dfsbno_t ext_start; - xfs_extlen_t ext_len; + __uint64_t ext_start; + __uint32_t ext_len; } __attribute__((packed)) xfs_extent_32_t; +#endif typedef struct xfs_extent_64 { - xfs_dfsbno_t ext_start; - xfs_extlen_t ext_len; + __uint64_t ext_start; + __uint32_t ext_len; __uint32_t ext_pad; } xfs_extent_64_t; @@ -50,25 +52,27 @@ typedef struct xfs_extent_64 { * size is given by efi_nextents. */ typedef struct xfs_efi_log_format { - unsigned short efi_type; /* efi log item type */ - unsigned short efi_size; /* size of this item */ - uint efi_nextents; /* # extents to free */ + __uint16_t efi_type; /* efi log item type */ + __uint16_t efi_size; /* size of this item */ + __uint32_t efi_nextents; /* # extents to free */ __uint64_t efi_id; /* efi identifier */ xfs_extent_t efi_extents[1]; /* array of extents to free */ } xfs_efi_log_format_t; +#ifndef HAVE_FORMAT32 typedef struct xfs_efi_log_format_32 { - unsigned short efi_type; /* efi log item type */ - unsigned short efi_size; /* size of this item */ - uint efi_nextents; /* # extents to free */ + __uint16_t efi_type; /* efi log item type */ + __uint16_t efi_size; /* size of this item */ + __uint32_t efi_nextents; /* # extents to free */ __uint64_t efi_id; /* efi identifier */ xfs_extent_32_t efi_extents[1]; /* array of extents to free */ } __attribute__((packed)) xfs_efi_log_format_32_t; +#endif typedef struct xfs_efi_log_format_64 { - unsigned short efi_type; /* efi log item type */ - unsigned short efi_size; /* size of this item */ - uint efi_nextents; /* # extents to free */ + __uint16_t efi_type; /* efi log item type */ + __uint16_t efi_size; /* size of this item */ + __uint32_t efi_nextents; /* # extents to free */ __uint64_t efi_id; /* efi identifier */ xfs_extent_64_t efi_extents[1]; /* array of extents to free */ } xfs_efi_log_format_64_t; @@ -79,25 +83,27 @@ typedef struct xfs_efi_log_format_64 { * size is given by efd_nextents; */ typedef struct xfs_efd_log_format { - unsigned short efd_type; /* efd log item type */ - unsigned short efd_size; /* size of this item */ - uint efd_nextents; /* # of extents freed */ + __uint16_t efd_type; /* efd log item type */ + __uint16_t efd_size; /* size of this item */ + __uint32_t efd_nextents; /* # of extents freed */ __uint64_t efd_efi_id; /* id of corresponding efi */ xfs_extent_t efd_extents[1]; /* array of extents freed */ } xfs_efd_log_format_t; +#ifndef HAVE_FORMAT32 typedef struct xfs_efd_log_format_32 { - unsigned short efd_type; /* efd log item type */ - unsigned short efd_size; /* size of this item */ - uint efd_nextents; /* # of extents freed */ + __uint16_t efd_type; /* efd log item type */ + __uint16_t efd_size; /* size of this item */ + __uint32_t efd_nextents; /* # of extents freed */ __uint64_t efd_efi_id; /* id of corresponding efi */ xfs_extent_32_t efd_extents[1]; /* array of extents freed */ } __attribute__((packed)) xfs_efd_log_format_32_t; +#endif typedef struct xfs_efd_log_format_64 { - unsigned short efd_type; /* efd log item type */ - unsigned short efd_size; /* size of this item */ - uint efd_nextents; /* # of extents freed */ + __uint16_t efd_type; /* efd log item type */ + __uint16_t efd_size; /* size of this item */ + __uint32_t efd_nextents; /* # of extents freed */ __uint64_t efd_efi_id; /* id of corresponding efi */ xfs_extent_64_t efd_extents[1]; /* array of extents freed */ } xfs_efd_log_format_64_t; diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index 0f0ad1535951..1335449841cd 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -22,8 +22,6 @@ * SGI's XFS filesystem's major stuff (constants, structures) */ -#define XFS_NAME "xfs" - /* * Direct I/O attribute record used with XFS_IOC_DIOINFO * d_miniosz is the min xfer size, xfer size multiple and file seek offset @@ -426,11 +424,7 @@ typedef struct xfs_handle { - (char *) &(handle)) \ + (handle).ha_fid.xfs_fid_len) -#define XFS_HANDLE_CMP(h1, h2) memcmp(h1, h2, sizeof(xfs_handle_t)) - -#define FSHSIZE sizeof(fsid_t) - -/* +/* * Flags for going down operation */ #define XFS_FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */ diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 33164a85aa9d..a446e5a115c6 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -458,7 +458,7 @@ nextag: */ if (XFS_FORCED_SHUTDOWN(mp)) { up_read(&mp->m_peraglock); - return (xfs_buf_t *)0; + return NULL; } agno++; if (agno >= agcount) @@ -466,7 +466,7 @@ nextag: if (agno == pagno) { if (flags == 0) { up_read(&mp->m_peraglock); - return (xfs_buf_t *)0; + return NULL; } flags = 0; } @@ -529,10 +529,10 @@ xfs_dialloc( int offset; /* index of inode in chunk */ xfs_agino_t pagino; /* parent's a.g. relative inode # */ xfs_agnumber_t pagno; /* parent's allocation group number */ - xfs_inobt_rec_t rec; /* inode allocation record */ + xfs_inobt_rec_incore_t rec; /* inode allocation record */ xfs_agnumber_t tagno; /* testing allocation group number */ xfs_btree_cur_t *tcur; /* temp cursor */ - xfs_inobt_rec_t trec; /* temp inode allocation record */ + xfs_inobt_rec_incore_t trec; /* temp inode allocation record */ if (*IO_agbp == NULL) { @@ -945,7 +945,7 @@ xfs_difree( int ilen; /* inodes in an inode cluster */ xfs_mount_t *mp; /* mount structure for filesystem */ int off; /* offset of inode in inode chunk */ - xfs_inobt_rec_t rec; /* btree record */ + xfs_inobt_rec_incore_t rec; /* btree record */ mp = tp->t_mountp; @@ -1195,6 +1195,7 @@ xfs_dilocate( "(0x%llx)", ino, XFS_AGINO_TO_INO(mp, agno, agino)); } + xfs_stack_trace(); #endif /* DEBUG */ return XFS_ERROR(EINVAL); } diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c index 616eeeb6953e..8cdeeaf8632b 100644 --- a/fs/xfs/xfs_ialloc_btree.c +++ b/fs/xfs/xfs_ialloc_btree.c @@ -568,7 +568,7 @@ xfs_inobt_insrec( /* * Make a key out of the record data to be inserted, and save it. */ - key.ir_startino = recp->ir_startino; /* INT_: direct copy */ + key.ir_startino = recp->ir_startino; optr = ptr = cur->bc_ptrs[level]; /* * If we're off the left edge, return failure. @@ -600,7 +600,7 @@ xfs_inobt_insrec( } #endif nbno = NULLAGBLOCK; - ncur = (xfs_btree_cur_t *)0; + ncur = NULL; /* * If the block is full, we can't insert the new entry until we * make the block un-full. @@ -641,7 +641,7 @@ xfs_inobt_insrec( return error; #endif ptr = cur->bc_ptrs[level]; - nrec.ir_startino = nkey.ir_startino; /* INT_: direct copy */ + nrec.ir_startino = nkey.ir_startino; } else { /* * Otherwise the insert fails. @@ -681,7 +681,7 @@ xfs_inobt_insrec( if ((error = xfs_btree_check_sptr(cur, *bnop, level))) return error; #endif - kp[ptr - 1] = key; /* INT_: struct copy */ + kp[ptr - 1] = key; pp[ptr - 1] = cpu_to_be32(*bnop); numrecs++; block->bb_numrecs = cpu_to_be16(numrecs); @@ -698,7 +698,7 @@ xfs_inobt_insrec( * Now stuff the new record in, bump numrecs * and log the new data. */ - rp[ptr - 1] = *recp; /* INT_: struct copy */ + rp[ptr - 1] = *recp; numrecs++; block->bb_numrecs = cpu_to_be16(numrecs); xfs_inobt_log_recs(cur, bp, ptr, numrecs); @@ -731,7 +731,7 @@ xfs_inobt_insrec( */ *bnop = nbno; if (nbno != NULLAGBLOCK) { - *recp = nrec; /* INT_: struct copy */ + *recp = nrec; *curp = ncur; } *stat = 1; @@ -878,7 +878,7 @@ xfs_inobt_lookup( */ bp = cur->bc_bufs[level]; if (bp && XFS_BUF_ADDR(bp) != d) - bp = (xfs_buf_t *)0; + bp = NULL; if (!bp) { /* * Need to get a new buffer. Read it, then @@ -950,12 +950,12 @@ xfs_inobt_lookup( xfs_inobt_key_t *kkp; kkp = kkbase + keyno - 1; - startino = INT_GET(kkp->ir_startino, ARCH_CONVERT); + startino = be32_to_cpu(kkp->ir_startino); } else { xfs_inobt_rec_t *krp; krp = krbase + keyno - 1; - startino = INT_GET(krp->ir_startino, ARCH_CONVERT); + startino = be32_to_cpu(krp->ir_startino); } /* * Compute difference to get next direction. @@ -1117,7 +1117,7 @@ xfs_inobt_lshift( if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*rpp), level))) return error; #endif - *lpp = *rpp; /* INT_: no-change copy */ + *lpp = *rpp; xfs_inobt_log_ptrs(cur, lbp, nrec, nrec); } /* @@ -1160,7 +1160,7 @@ xfs_inobt_lshift( } else { memmove(rrp, rrp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp)); xfs_inobt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs)); - key.ir_startino = rrp->ir_startino; /* INT_: direct copy */ + key.ir_startino = rrp->ir_startino; rkp = &key; } /* @@ -1297,13 +1297,13 @@ xfs_inobt_newroot( */ kp = XFS_INOBT_KEY_ADDR(new, 1, cur); if (be16_to_cpu(left->bb_level) > 0) { - kp[0] = *XFS_INOBT_KEY_ADDR(left, 1, cur); /* INT_: struct copy */ - kp[1] = *XFS_INOBT_KEY_ADDR(right, 1, cur); /* INT_: struct copy */ + kp[0] = *XFS_INOBT_KEY_ADDR(left, 1, cur); + kp[1] = *XFS_INOBT_KEY_ADDR(right, 1, cur); } else { rp = XFS_INOBT_REC_ADDR(left, 1, cur); - INT_COPY(kp[0].ir_startino, rp->ir_startino, ARCH_CONVERT); + kp[0].ir_startino = rp->ir_startino; rp = XFS_INOBT_REC_ADDR(right, 1, cur); - INT_COPY(kp[1].ir_startino, rp->ir_startino, ARCH_CONVERT); + kp[1].ir_startino = rp->ir_startino; } xfs_inobt_log_keys(cur, nbp, 1, 2); /* @@ -1410,8 +1410,8 @@ xfs_inobt_rshift( if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*lpp), level))) return error; #endif - *rkp = *lkp; /* INT_: no change copy */ - *rpp = *lpp; /* INT_: no change copy */ + *rkp = *lkp; + *rpp = *lpp; xfs_inobt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1); xfs_inobt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1); } else { @@ -1420,7 +1420,7 @@ xfs_inobt_rshift( memmove(rrp + 1, rrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp)); *rrp = *lrp; xfs_inobt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1); - key.ir_startino = rrp->ir_startino; /* INT_: direct copy */ + key.ir_startino = rrp->ir_startino; rkp = &key; } /* @@ -1559,7 +1559,7 @@ xfs_inobt_split( rrp = XFS_INOBT_REC_ADDR(right, 1, cur); memcpy(rrp, lrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp)); xfs_inobt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs)); - keyp->ir_startino = rrp->ir_startino; /* INT_: direct copy */ + keyp->ir_startino = rrp->ir_startino; } /* * Find the left block number by looking in the buffer. @@ -1813,9 +1813,9 @@ xfs_inobt_get_rec( * Point to the record and extract its data. */ rec = XFS_INOBT_REC_ADDR(block, ptr, cur); - *ino = INT_GET(rec->ir_startino, ARCH_CONVERT); - *fcnt = INT_GET(rec->ir_freecount, ARCH_CONVERT); - *free = INT_GET(rec->ir_free, ARCH_CONVERT); + *ino = be32_to_cpu(rec->ir_startino); + *fcnt = be32_to_cpu(rec->ir_freecount); + *free = be64_to_cpu(rec->ir_free); *stat = 1; return 0; } @@ -1930,10 +1930,10 @@ xfs_inobt_insert( level = 0; nbno = NULLAGBLOCK; - INT_SET(nrec.ir_startino, ARCH_CONVERT, cur->bc_rec.i.ir_startino); - INT_SET(nrec.ir_freecount, ARCH_CONVERT, cur->bc_rec.i.ir_freecount); - INT_SET(nrec.ir_free, ARCH_CONVERT, cur->bc_rec.i.ir_free); - ncur = (xfs_btree_cur_t *)0; + nrec.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino); + nrec.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount); + nrec.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free); + ncur = NULL; pcur = cur; /* * Loop going up the tree, starting at the leaf level. @@ -1965,7 +1965,7 @@ xfs_inobt_insert( */ if (ncur) { pcur = ncur; - ncur = (xfs_btree_cur_t *)0; + ncur = NULL; } } while (nbno != NULLAGBLOCK); *stat = i; @@ -2060,9 +2060,9 @@ xfs_inobt_update( /* * Fill in the new contents and log them. */ - INT_SET(rp->ir_startino, ARCH_CONVERT, ino); - INT_SET(rp->ir_freecount, ARCH_CONVERT, fcnt); - INT_SET(rp->ir_free, ARCH_CONVERT, free); + rp->ir_startino = cpu_to_be32(ino); + rp->ir_freecount = cpu_to_be32(fcnt); + rp->ir_free = cpu_to_be64(free); xfs_inobt_log_recs(cur, bp, ptr, ptr); /* * Updating first record in leaf. Pass new key value up to our parent. @@ -2070,7 +2070,7 @@ xfs_inobt_update( if (ptr == 1) { xfs_inobt_key_t key; /* key containing [ino] */ - INT_SET(key.ir_startino, ARCH_CONVERT, ino); + key.ir_startino = cpu_to_be32(ino); if ((error = xfs_inobt_updkey(cur, &key, 1))) return error; } diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h index ae3904cb1ee8..2c0e49893ff7 100644 --- a/fs/xfs/xfs_ialloc_btree.h +++ b/fs/xfs/xfs_ialloc_btree.h @@ -47,19 +47,24 @@ static inline xfs_inofree_t xfs_inobt_maskn(int i, int n) /* * Data record structure */ -typedef struct xfs_inobt_rec -{ +typedef struct xfs_inobt_rec { + __be32 ir_startino; /* starting inode number */ + __be32 ir_freecount; /* count of free inodes (set bits) */ + __be64 ir_free; /* free inode mask */ +} xfs_inobt_rec_t; + +typedef struct xfs_inobt_rec_incore { xfs_agino_t ir_startino; /* starting inode number */ __int32_t ir_freecount; /* count of free inodes (set bits) */ xfs_inofree_t ir_free; /* free inode mask */ -} xfs_inobt_rec_t; +} xfs_inobt_rec_incore_t; + /* * Key structure */ -typedef struct xfs_inobt_key -{ - xfs_agino_t ir_startino; /* starting inode number */ +typedef struct xfs_inobt_key { + __be32 ir_startino; /* starting inode number */ } xfs_inobt_key_t; /* btree pointer type */ @@ -77,7 +82,7 @@ typedef struct xfs_btree_sblock xfs_inobt_block_t; #define XFS_INOBT_IS_FREE(rp,i) \ (((rp)->ir_free & XFS_INOBT_MASK(i)) != 0) #define XFS_INOBT_IS_FREE_DISK(rp,i) \ - ((INT_GET((rp)->ir_free,ARCH_CONVERT) & XFS_INOBT_MASK(i)) != 0) + ((be64_to_cpu((rp)->ir_free) & XFS_INOBT_MASK(i)) != 0) #define XFS_INOBT_SET_FREE(rp,i) ((rp)->ir_free |= XFS_INOBT_MASK(i)) #define XFS_INOBT_CLR_FREE(rp,i) ((rp)->ir_free &= ~XFS_INOBT_MASK(i)) diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 0724df7fabb7..b73d216ecaf9 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -50,7 +50,7 @@ void xfs_ihash_init(xfs_mount_t *mp) { __uint64_t icount; - uint i, flags = KM_SLEEP | KM_MAYFAIL; + uint i; if (!mp->m_ihsize) { icount = mp->m_maxicount ? mp->m_maxicount : @@ -61,14 +61,13 @@ xfs_ihash_init(xfs_mount_t *mp) (64 * NBPP) / sizeof(xfs_ihash_t)); } - while (!(mp->m_ihash = (xfs_ihash_t *)kmem_zalloc(mp->m_ihsize * - sizeof(xfs_ihash_t), flags))) { - if ((mp->m_ihsize >>= 1) <= NBPP) - flags = KM_SLEEP; - } - for (i = 0; i < mp->m_ihsize; i++) { + mp->m_ihash = kmem_zalloc_greedy(&mp->m_ihsize, + NBPC * sizeof(xfs_ihash_t), + mp->m_ihsize * sizeof(xfs_ihash_t), + KM_SLEEP | KM_MAYFAIL | KM_LARGE); + mp->m_ihsize /= sizeof(xfs_ihash_t); + for (i = 0; i < mp->m_ihsize; i++) rwlock_init(&(mp->m_ihash[i].ih_lock)); - } } /* @@ -77,7 +76,7 @@ xfs_ihash_init(xfs_mount_t *mp) void xfs_ihash_free(xfs_mount_t *mp) { - kmem_free(mp->m_ihash, mp->m_ihsize*sizeof(xfs_ihash_t)); + kmem_free(mp->m_ihash, mp->m_ihsize * sizeof(xfs_ihash_t)); mp->m_ihash = NULL; } @@ -95,7 +94,7 @@ xfs_chash_init(xfs_mount_t *mp) mp->m_chsize = min_t(uint, mp->m_chsize, mp->m_ihsize); mp->m_chash = (xfs_chash_t *)kmem_zalloc(mp->m_chsize * sizeof(xfs_chash_t), - KM_SLEEP); + KM_SLEEP | KM_LARGE); for (i = 0; i < mp->m_chsize; i++) { spinlock_init(&mp->m_chash[i].ch_lock,"xfshash"); } @@ -244,7 +243,9 @@ again: XFS_STATS_INC(xs_ig_found); + spin_lock(&ip->i_flags_lock); ip->i_flags &= ~XFS_IRECLAIMABLE; + spin_unlock(&ip->i_flags_lock); version = ih->ih_version; read_unlock(&ih->ih_lock); xfs_ihash_promote(ih, ip, version); @@ -290,15 +291,17 @@ again: finish_inode: if (ip->i_d.di_mode == 0) { - if (!(flags & IGET_CREATE)) + if (!(flags & XFS_IGET_CREATE)) return ENOENT; xfs_iocore_inode_reinit(ip); } - + if (lock_flags != 0) xfs_ilock(ip, lock_flags); + spin_lock(&ip->i_flags_lock); ip->i_flags &= ~XFS_ISTALE; + spin_unlock(&ip->i_flags_lock); vn_trace_exit(vp, "xfs_iget.found", (inst_t *)__return_address); @@ -320,21 +323,20 @@ finish_inode: * Read the disk inode attributes into a new inode structure and get * a new vnode for it. This should also initialize i_ino and i_mount. */ - error = xfs_iread(mp, tp, ino, &ip, bno); - if (error) { + error = xfs_iread(mp, tp, ino, &ip, bno, + (flags & XFS_IGET_BULKSTAT) ? XFS_IMAP_BULKSTAT : 0); + if (error) return error; - } vn_trace_exit(vp, "xfs_iget.alloc", (inst_t *)__return_address); xfs_inode_lock_init(ip, vp); xfs_iocore_inode_init(ip); - if (lock_flags != 0) { + if (lock_flags) xfs_ilock(ip, lock_flags); - } - - if ((ip->i_d.di_mode == 0) && !(flags & IGET_CREATE)) { + + if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { xfs_idestroy(ip); return ENOENT; } @@ -369,7 +371,9 @@ finish_inode: ih->ih_next = ip; ip->i_udquot = ip->i_gdquot = NULL; ih->ih_version++; + spin_lock(&ip->i_flags_lock); ip->i_flags |= XFS_INEW; + spin_unlock(&ip->i_flags_lock); write_unlock(&ih->ih_lock); @@ -548,7 +552,7 @@ xfs_inode_lock_init( mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", vp->v_number); init_waitqueue_head(&ip->i_ipin_wait); atomic_set(&ip->i_pincount, 0); - init_sema(&ip->i_flock, 1, "xfsfino", vp->v_number); + initnsema(&ip->i_flock, 1, "xfsfino"); } /* diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 1f8ecff8553a..c27d7d495aa0 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -854,7 +854,8 @@ xfs_iread( xfs_trans_t *tp, xfs_ino_t ino, xfs_inode_t **ipp, - xfs_daddr_t bno) + xfs_daddr_t bno, + uint imap_flags) { xfs_buf_t *bp; xfs_dinode_t *dip; @@ -866,6 +867,7 @@ xfs_iread( ip = kmem_zone_zalloc(xfs_inode_zone, KM_SLEEP); ip->i_ino = ino; ip->i_mount = mp; + spin_lock_init(&ip->i_flags_lock); /* * Get pointer's to the on-disk inode and the buffer containing it. @@ -874,7 +876,7 @@ xfs_iread( * return NULL as well. Set i_blkno to 0 so that xfs_itobp() will * know that this is a new incore inode. */ - error = xfs_itobp(mp, tp, ip, &dip, &bp, bno, 0); + error = xfs_itobp(mp, tp, ip, &dip, &bp, bno, imap_flags); if (error) { kmem_zone_free(xfs_inode_zone, ip); return error; @@ -1113,7 +1115,7 @@ xfs_ialloc( * to prevent others from looking at until we're done. */ error = xfs_trans_iget(tp->t_mountp, tp, ino, - IGET_CREATE, XFS_ILOCK_EXCL, &ip); + XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip); if (error != 0) { return error; } @@ -2213,7 +2215,9 @@ xfs_ifree_cluster( if (ip == free_ip) { if (xfs_iflock_nowait(ip)) { + spin_lock(&ip->i_flags_lock); ip->i_flags |= XFS_ISTALE; + spin_unlock(&ip->i_flags_lock); if (xfs_inode_clean(ip)) { xfs_ifunlock(ip); @@ -2227,7 +2231,9 @@ xfs_ifree_cluster( if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { if (xfs_iflock_nowait(ip)) { + spin_lock(&ip->i_flags_lock); ip->i_flags |= XFS_ISTALE; + spin_unlock(&ip->i_flags_lock); if (xfs_inode_clean(ip)) { xfs_ifunlock(ip); @@ -2257,7 +2263,9 @@ xfs_ifree_cluster( AIL_LOCK(mp,s); iip->ili_flush_lsn = iip->ili_item.li_lsn; AIL_UNLOCK(mp, s); + spin_lock(&iip->ili_inode->i_flags_lock); iip->ili_inode->i_flags |= XFS_ISTALE; + spin_unlock(&iip->ili_inode->i_flags_lock); pre_flushed++; } lip = lip->li_bio_list; @@ -2753,19 +2761,29 @@ xfs_iunpin( * call as the inode reclaim may be blocked waiting for * the inode to become unpinned. */ + struct inode *inode = NULL; + + spin_lock(&ip->i_flags_lock); if (!(ip->i_flags & (XFS_IRECLAIM|XFS_IRECLAIMABLE))) { bhv_vnode_t *vp = XFS_ITOV_NULL(ip); /* make sync come back and flush this inode */ if (vp) { - struct inode *inode = vn_to_inode(vp); + inode = vn_to_inode(vp); if (!(inode->i_state & - (I_NEW|I_FREEING|I_CLEAR))) - mark_inode_dirty_sync(inode); + (I_NEW|I_FREEING|I_CLEAR))) { + inode = igrab(inode); + if (inode) + mark_inode_dirty_sync(inode); + } else + inode = NULL; } } + spin_unlock(&ip->i_flags_lock); wake_up(&ip->i_ipin_wait); + if (inode) + iput(inode); } } diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index d10b76ed1e5b..e96eb0835fe6 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -267,6 +267,7 @@ typedef struct xfs_inode { sema_t i_flock; /* inode flush lock */ atomic_t i_pincount; /* inode pin count */ wait_queue_head_t i_ipin_wait; /* inode pinning wait queue */ + spinlock_t i_flags_lock; /* inode i_flags lock */ #ifdef HAVE_REFCACHE struct xfs_inode **i_refcache; /* ptr to entry in ref cache */ struct xfs_inode *i_release; /* inode to unref */ @@ -389,11 +390,14 @@ typedef struct xfs_inode { (((vfsp)->vfs_flag & VFS_GRPID) || ((pip)->i_d.di_mode & S_ISGID)) /* - * xfs_iget.c prototypes. + * Flags for xfs_iget() */ +#define XFS_IGET_CREATE 0x1 +#define XFS_IGET_BULKSTAT 0x2 -#define IGET_CREATE 1 - +/* + * xfs_iget.c prototypes. + */ void xfs_ihash_init(struct xfs_mount *); void xfs_ihash_free(struct xfs_mount *); void xfs_chash_init(struct xfs_mount *); @@ -425,7 +429,7 @@ int xfs_itobp(struct xfs_mount *, struct xfs_trans *, xfs_inode_t *, xfs_dinode_t **, struct xfs_buf **, xfs_daddr_t, uint); int xfs_iread(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, - xfs_inode_t **, xfs_daddr_t); + xfs_inode_t **, xfs_daddr_t, uint); int xfs_iread_extents(struct xfs_trans *, xfs_inode_t *, int); int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t, xfs_nlink_t, xfs_dev_t, struct cred *, xfs_prid_t, diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index f8e80d8e7237..a7a92251eb56 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -743,21 +743,6 @@ xfs_inode_item_committed( } /* - * The transaction with the inode locked has aborted. The inode - * must not be dirty within the transaction (unless we're forcibly - * shutting down). We simply unlock just as if the transaction - * had been cancelled. - */ -STATIC void -xfs_inode_item_abort( - xfs_inode_log_item_t *iip) -{ - xfs_inode_item_unlock(iip); - return; -} - - -/* * This gets called by xfs_trans_push_ail(), when IOP_TRYLOCK * failed to get the inode flush lock but did get the inode locked SHARED. * Here we're trying to see if the inode buffer is incore, and if so whether it's @@ -915,7 +900,6 @@ STATIC struct xfs_item_ops xfs_inode_item_ops = { .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) xfs_inode_item_committed, .iop_push = (void(*)(xfs_log_item_t*))xfs_inode_item_push, - .iop_abort = (void(*)(xfs_log_item_t*))xfs_inode_item_abort, .iop_pushbuf = (void(*)(xfs_log_item_t*))xfs_inode_item_pushbuf, .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) xfs_inode_item_committing diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 5db6cd1b4cf3..bfe92ea17952 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -25,52 +25,54 @@ * must be added on to the end. */ typedef struct xfs_inode_log_format { - unsigned short ilf_type; /* inode log item type */ - unsigned short ilf_size; /* size of this item */ - uint ilf_fields; /* flags for fields logged */ - ushort ilf_asize; /* size of attr d/ext/root */ - ushort ilf_dsize; /* size of data/ext/root */ - xfs_ino_t ilf_ino; /* inode number */ + __uint16_t ilf_type; /* inode log item type */ + __uint16_t ilf_size; /* size of this item */ + __uint32_t ilf_fields; /* flags for fields logged */ + __uint16_t ilf_asize; /* size of attr d/ext/root */ + __uint16_t ilf_dsize; /* size of data/ext/root */ + __uint64_t ilf_ino; /* inode number */ union { - xfs_dev_t ilfu_rdev; /* rdev value for dev inode*/ + __uint32_t ilfu_rdev; /* rdev value for dev inode*/ uuid_t ilfu_uuid; /* mount point value */ } ilf_u; __int64_t ilf_blkno; /* blkno of inode buffer */ - int ilf_len; /* len of inode buffer */ - int ilf_boffset; /* off of inode in buffer */ + __int32_t ilf_len; /* len of inode buffer */ + __int32_t ilf_boffset; /* off of inode in buffer */ } xfs_inode_log_format_t; +#ifndef HAVE_FORMAT32 typedef struct xfs_inode_log_format_32 { - unsigned short ilf_type; /* 16: inode log item type */ - unsigned short ilf_size; /* 16: size of this item */ - uint ilf_fields; /* 32: flags for fields logged */ - ushort ilf_asize; /* 32: size of attr d/ext/root */ - ushort ilf_dsize; /* 32: size of data/ext/root */ - xfs_ino_t ilf_ino; /* 64: inode number */ + __uint16_t ilf_type; /* inode log item type */ + __uint16_t ilf_size; /* size of this item */ + __uint32_t ilf_fields; /* flags for fields logged */ + __uint16_t ilf_asize; /* size of attr d/ext/root */ + __uint16_t ilf_dsize; /* size of data/ext/root */ + __uint64_t ilf_ino; /* inode number */ union { - xfs_dev_t ilfu_rdev; /* 32: rdev value for dev inode*/ - uuid_t ilfu_uuid; /* 128: mount point value */ + __uint32_t ilfu_rdev; /* rdev value for dev inode*/ + uuid_t ilfu_uuid; /* mount point value */ } ilf_u; - __int64_t ilf_blkno; /* 64: blkno of inode buffer */ - int ilf_len; /* 32: len of inode buffer */ - int ilf_boffset; /* 32: off of inode in buffer */ + __int64_t ilf_blkno; /* blkno of inode buffer */ + __int32_t ilf_len; /* len of inode buffer */ + __int32_t ilf_boffset; /* off of inode in buffer */ } __attribute__((packed)) xfs_inode_log_format_32_t; +#endif typedef struct xfs_inode_log_format_64 { - unsigned short ilf_type; /* 16: inode log item type */ - unsigned short ilf_size; /* 16: size of this item */ - uint ilf_fields; /* 32: flags for fields logged */ - ushort ilf_asize; /* 32: size of attr d/ext/root */ - ushort ilf_dsize; /* 32: size of data/ext/root */ - __uint32_t ilf_pad; /* 32: pad for 64 bit boundary */ - xfs_ino_t ilf_ino; /* 64: inode number */ + __uint16_t ilf_type; /* inode log item type */ + __uint16_t ilf_size; /* size of this item */ + __uint32_t ilf_fields; /* flags for fields logged */ + __uint16_t ilf_asize; /* size of attr d/ext/root */ + __uint16_t ilf_dsize; /* size of data/ext/root */ + __uint32_t ilf_pad; /* pad for 64 bit boundary */ + __uint64_t ilf_ino; /* inode number */ union { - xfs_dev_t ilfu_rdev; /* 32: rdev value for dev inode*/ - uuid_t ilfu_uuid; /* 128: mount point value */ + __uint32_t ilfu_rdev; /* rdev value for dev inode*/ + uuid_t ilfu_uuid; /* mount point value */ } ilf_u; - __int64_t ilf_blkno; /* 64: blkno of inode buffer */ - int ilf_len; /* 32: len of inode buffer */ - int ilf_boffset; /* 32: off of inode in buffer */ + __int64_t ilf_blkno; /* blkno of inode buffer */ + __int32_t ilf_len; /* len of inode buffer */ + __int32_t ilf_boffset; /* off of inode in buffer */ } xfs_inode_log_format_64_t; /* diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index f1949c16df15..19655124da78 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -398,6 +398,23 @@ xfs_flush_space( return 1; } +STATIC int +xfs_cmn_err_fsblock_zero( + xfs_inode_t *ip, + xfs_bmbt_irec_t *imap) +{ + xfs_cmn_err(XFS_PTAG_FSBLOCK_ZERO, CE_ALERT, ip->i_mount, + "Access to block zero in inode %llu " + "start_block: %llx start_off: %llx " + "blkcnt: %llx extent-state: %x\n", + (unsigned long long)ip->i_ino, + (unsigned long long)imap->br_startblock, + (unsigned long long)imap->br_startoff, + (unsigned long long)imap->br_blockcount, + imap->br_state); + return EFSCORRUPTED; +} + int xfs_iomap_write_direct( xfs_inode_t *ip, @@ -536,23 +553,17 @@ xfs_iomap_write_direct( * Copy any maps to caller's array and return any error. */ if (nimaps == 0) { - error = (ENOSPC); + error = ENOSPC; + goto error_out; + } + + if (unlikely(!imap.br_startblock && !(io->io_flags & XFS_IOCORE_RT))) { + error = xfs_cmn_err_fsblock_zero(ip, &imap); goto error_out; } *ret_imap = imap; *nmaps = 1; - if ( !(io->io_flags & XFS_IOCORE_RT) && !ret_imap->br_startblock) { - cmn_err(CE_PANIC,"Access to block zero: fs <%s> inode: %lld " - "start_block : %llx start_off : %llx blkcnt : %llx " - "extent-state : %x \n", - (ip->i_mount)->m_fsname, - (long long)ip->i_ino, - (unsigned long long)ret_imap->br_startblock, - (unsigned long long)ret_imap->br_startoff, - (unsigned long long)ret_imap->br_blockcount, - ret_imap->br_state); - } return 0; error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ @@ -715,17 +726,8 @@ retry: goto retry; } - if (!(io->io_flags & XFS_IOCORE_RT) && !ret_imap->br_startblock) { - cmn_err(CE_PANIC,"Access to block zero: fs <%s> inode: %lld " - "start_block : %llx start_off : %llx blkcnt : %llx " - "extent-state : %x \n", - (ip->i_mount)->m_fsname, - (long long)ip->i_ino, - (unsigned long long)ret_imap->br_startblock, - (unsigned long long)ret_imap->br_startoff, - (unsigned long long)ret_imap->br_blockcount, - ret_imap->br_state); - } + if (unlikely(!imap[0].br_startblock && !(io->io_flags & XFS_IOCORE_RT))) + return xfs_cmn_err_fsblock_zero(ip, &imap[0]); *ret_imap = imap[0]; *nmaps = 1; @@ -853,24 +855,10 @@ xfs_iomap_write_allocate( * See if we were able to allocate an extent that * covers at least part of the callers request */ - for (i = 0; i < nimaps; i++) { - if (!(io->io_flags & XFS_IOCORE_RT) && - !imap[i].br_startblock) { - cmn_err(CE_PANIC,"Access to block zero: " - "fs <%s> inode: %lld " - "start_block : %llx start_off : %llx " - "blkcnt : %llx extent-state : %x \n", - (ip->i_mount)->m_fsname, - (long long)ip->i_ino, - (unsigned long long) - imap[i].br_startblock, - (unsigned long long) - imap[i].br_startoff, - (unsigned long long) - imap[i].br_blockcount, - imap[i].br_state); - } + if (unlikely(!imap[i].br_startblock && + !(io->io_flags & XFS_IOCORE_RT))) + return xfs_cmn_err_fsblock_zero(ip, &imap[i]); if ((offset_fsb >= imap[i].br_startoff) && (offset_fsb < (imap[i].br_startoff + imap[i].br_blockcount))) { @@ -941,7 +929,7 @@ xfs_iomap_write_unwritten( XFS_WRITE_LOG_COUNT); if (error) { xfs_trans_cancel(tp, 0); - goto error0; + return XFS_ERROR(error); } xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -967,19 +955,11 @@ xfs_iomap_write_unwritten( error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); xfs_iunlock(ip, XFS_ILOCK_EXCL); if (error) - goto error0; - - if ( !(io->io_flags & XFS_IOCORE_RT) && !imap.br_startblock) { - cmn_err(CE_PANIC,"Access to block zero: fs <%s> " - "inode: %lld start_block : %llx start_off : " - "%llx blkcnt : %llx extent-state : %x \n", - (ip->i_mount)->m_fsname, - (long long)ip->i_ino, - (unsigned long long)imap.br_startblock, - (unsigned long long)imap.br_startoff, - (unsigned long long)imap.br_blockcount, - imap.br_state); - } + return XFS_ERROR(error); + + if (unlikely(!imap.br_startblock && + !(io->io_flags & XFS_IOCORE_RT))) + return xfs_cmn_err_fsblock_zero(ip, &imap); if ((numblks_fsb = imap.br_blockcount) == 0) { /* @@ -999,6 +979,5 @@ error_on_bmapi_transaction: xfs_bmap_cancel(&free_list); xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT)); xfs_iunlock(ip, XFS_ILOCK_EXCL); -error0: return XFS_ERROR(error); } diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 46249e4d1fea..7775ddc0b3c6 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -39,6 +39,16 @@ #include "xfs_error.h" #include "xfs_btree.h" +int +xfs_internal_inum( + xfs_mount_t *mp, + xfs_ino_t ino) +{ + return (ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino || + (XFS_SB_VERSION_HASQUOTA(&mp->m_sb) && + (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino))); +} + STATIC int xfs_bulkstat_one_iget( xfs_mount_t *mp, /* mount point for filesystem */ @@ -52,7 +62,8 @@ xfs_bulkstat_one_iget( bhv_vnode_t *vp; int error; - error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, bno); + error = xfs_iget(mp, NULL, ino, + XFS_IGET_BULKSTAT, XFS_ILOCK_SHARED, &ip, bno); if (error) { *stat = BULKSTAT_RV_NOTHING; return error; @@ -212,17 +223,12 @@ xfs_bulkstat_one( xfs_dinode_t *dip; /* dinode inode pointer */ dip = (xfs_dinode_t *)dibuff; + *stat = BULKSTAT_RV_NOTHING; - if (!buffer || ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino || - (XFS_SB_VERSION_HASQUOTA(&mp->m_sb) && - (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino))) { - *stat = BULKSTAT_RV_NOTHING; + if (!buffer || xfs_internal_inum(mp, ino)) return XFS_ERROR(EINVAL); - } - if (ubsize < sizeof(*buf)) { - *stat = BULKSTAT_RV_NOTHING; + if (ubsize < sizeof(*buf)) return XFS_ERROR(ENOMEM); - } buf = kmem_alloc(sizeof(*buf), KM_SLEEP); @@ -238,8 +244,7 @@ xfs_bulkstat_one( } if (copy_to_user(buffer, buf, sizeof(*buf))) { - *stat = BULKSTAT_RV_NOTHING; - error = EFAULT; + error = EFAULT; goto out_free; } @@ -253,6 +258,46 @@ xfs_bulkstat_one( } /* + * Test to see whether we can use the ondisk inode directly, based + * on the given bulkstat flags, filling in dipp accordingly. + * Returns zero if the inode is dodgey. + */ +STATIC int +xfs_bulkstat_use_dinode( + xfs_mount_t *mp, + int flags, + xfs_buf_t *bp, + int clustidx, + xfs_dinode_t **dipp) +{ + xfs_dinode_t *dip; + unsigned int aformat; + + *dipp = NULL; + if (!bp || (flags & BULKSTAT_FG_IGET)) + return 1; + dip = (xfs_dinode_t *) + xfs_buf_offset(bp, clustidx << mp->m_sb.sb_inodelog); + if (INT_GET(dip->di_core.di_magic, ARCH_CONVERT) != XFS_DINODE_MAGIC || + !XFS_DINODE_GOOD_VERSION( + INT_GET(dip->di_core.di_version, ARCH_CONVERT))) + return 0; + if (flags & BULKSTAT_FG_QUICK) { + *dipp = dip; + return 1; + } + /* BULKSTAT_FG_INLINE: if attr fork is local, or not there, use it */ + aformat = INT_GET(dip->di_core.di_aformat, ARCH_CONVERT); + if ((XFS_CFORK_Q(&dip->di_core) == 0) || + (aformat == XFS_DINODE_FMT_LOCAL) || + (aformat == XFS_DINODE_FMT_EXTENTS && !dip->di_core.di_anextents)) { + *dipp = dip; + return 1; + } + return 1; +} + +/* * Return stat information in bulk (by-inode) for the filesystem. */ int /* error status */ @@ -284,10 +329,11 @@ xfs_bulkstat( xfs_agino_t gino; /* current btree rec's start inode */ int i; /* loop index */ int icount; /* count of inodes good in irbuf */ + size_t irbsize; /* size of irec buffer in bytes */ xfs_ino_t ino; /* inode number (filesystem) */ - xfs_inobt_rec_t *irbp; /* current irec buffer pointer */ - xfs_inobt_rec_t *irbuf; /* start of irec buffer */ - xfs_inobt_rec_t *irbufend; /* end of good irec buffer entries */ + xfs_inobt_rec_incore_t *irbp; /* current irec buffer pointer */ + xfs_inobt_rec_incore_t *irbuf; /* start of irec buffer */ + xfs_inobt_rec_incore_t *irbufend; /* end of good irec buffer entries */ xfs_ino_t lastino=0; /* last inode number returned */ int nbcluster; /* # of blocks in a cluster */ int nicluster; /* # of inodes in a cluster */ @@ -328,13 +374,10 @@ xfs_bulkstat( (XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog); nimask = ~(nicluster - 1); nbcluster = nicluster >> mp->m_sb.sb_inopblog; - /* - * Allocate a page-sized buffer for inode btree records. - * We could try allocating something smaller, but for normal - * calls we'll always (potentially) need the whole page. - */ - irbuf = kmem_alloc(NBPC, KM_SLEEP); - nirbuf = NBPC / sizeof(*irbuf); + irbuf = kmem_zalloc_greedy(&irbsize, NBPC, NBPC * 4, + KM_SLEEP | KM_MAYFAIL | KM_LARGE); + nirbuf = irbsize / sizeof(*irbuf); + /* * Loop over the allocation groups, starting from the last * inode returned; 0 means start of the allocation group. @@ -358,7 +401,7 @@ xfs_bulkstat( * Allocate and initialize a btree cursor for ialloc btree. */ cur = xfs_btree_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_INO, - (xfs_inode_t *)0, 0); + (xfs_inode_t *)0, 0); irbp = irbuf; irbufend = irbuf + nirbuf; end_of_ag = 0; @@ -395,9 +438,9 @@ xfs_bulkstat( gcnt++; } gfree |= XFS_INOBT_MASKN(0, chunkidx); - INT_SET(irbp->ir_startino, ARCH_CONVERT, gino); - INT_SET(irbp->ir_freecount, ARCH_CONVERT, gcnt); - INT_SET(irbp->ir_free, ARCH_CONVERT, gfree); + irbp->ir_startino = gino; + irbp->ir_freecount = gcnt; + irbp->ir_free = gfree; irbp++; agino = gino + XFS_INODES_PER_CHUNK; icount = XFS_INODES_PER_CHUNK - gcnt; @@ -451,11 +494,27 @@ xfs_bulkstat( } /* * If this chunk has any allocated inodes, save it. + * Also start read-ahead now for this chunk. */ if (gcnt < XFS_INODES_PER_CHUNK) { - INT_SET(irbp->ir_startino, ARCH_CONVERT, gino); - INT_SET(irbp->ir_freecount, ARCH_CONVERT, gcnt); - INT_SET(irbp->ir_free, ARCH_CONVERT, gfree); + /* + * Loop over all clusters in the next chunk. + * Do a readahead if there are any allocated + * inodes in that cluster. + */ + for (agbno = XFS_AGINO_TO_AGBNO(mp, gino), + chunkidx = 0; + chunkidx < XFS_INODES_PER_CHUNK; + chunkidx += nicluster, + agbno += nbcluster) { + if (XFS_INOBT_MASKN(chunkidx, + nicluster) & ~gfree) + xfs_btree_reada_bufs(mp, agno, + agbno, nbcluster); + } + irbp->ir_startino = gino; + irbp->ir_freecount = gcnt; + irbp->ir_free = gfree; irbp++; icount += XFS_INODES_PER_CHUNK - gcnt; } @@ -479,33 +538,11 @@ xfs_bulkstat( for (irbp = irbuf; irbp < irbufend && ubleft >= statstruct_size; irbp++) { /* - * Read-ahead the next chunk's worth of inodes. - */ - if (&irbp[1] < irbufend) { - /* - * Loop over all clusters in the next chunk. - * Do a readahead if there are any allocated - * inodes in that cluster. - */ - for (agbno = XFS_AGINO_TO_AGBNO(mp, - INT_GET(irbp[1].ir_startino, ARCH_CONVERT)), - chunkidx = 0; - chunkidx < XFS_INODES_PER_CHUNK; - chunkidx += nicluster, - agbno += nbcluster) { - if (XFS_INOBT_MASKN(chunkidx, - nicluster) & - ~(INT_GET(irbp[1].ir_free, ARCH_CONVERT))) - xfs_btree_reada_bufs(mp, agno, - agbno, nbcluster); - } - } - /* * Now process this chunk of inodes. */ - for (agino = INT_GET(irbp->ir_startino, ARCH_CONVERT), chunkidx = 0, clustidx = 0; + for (agino = irbp->ir_startino, chunkidx = clustidx = 0; ubleft > 0 && - INT_GET(irbp->ir_freecount, ARCH_CONVERT) < XFS_INODES_PER_CHUNK; + irbp->ir_freecount < XFS_INODES_PER_CHUNK; chunkidx++, clustidx++, agino++) { ASSERT(chunkidx < XFS_INODES_PER_CHUNK); /* @@ -525,11 +562,12 @@ xfs_bulkstat( */ if ((chunkidx & (nicluster - 1)) == 0) { agbno = XFS_AGINO_TO_AGBNO(mp, - INT_GET(irbp->ir_startino, ARCH_CONVERT)) + + irbp->ir_startino) + ((chunkidx & nimask) >> mp->m_sb.sb_inopblog); - if (flags & BULKSTAT_FG_QUICK) { + if (flags & (BULKSTAT_FG_QUICK | + BULKSTAT_FG_INLINE)) { ino = XFS_AGINO_TO_INO(mp, agno, agino); bno = XFS_AGB_TO_DADDR(mp, agno, @@ -543,6 +581,7 @@ xfs_bulkstat( KM_SLEEP); ip->i_ino = ino; ip->i_mount = mp; + spin_lock_init(&ip->i_flags_lock); if (bp) xfs_buf_relse(bp); error = xfs_itobp(mp, NULL, ip, @@ -564,30 +603,34 @@ xfs_bulkstat( /* * Skip if this inode is free. */ - if (XFS_INOBT_MASK(chunkidx) & INT_GET(irbp->ir_free, ARCH_CONVERT)) + if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free) continue; /* * Count used inodes as free so we can tell * when the chunk is used up. */ - INT_MOD(irbp->ir_freecount, ARCH_CONVERT, +1); + irbp->ir_freecount++; ino = XFS_AGINO_TO_INO(mp, agno, agino); bno = XFS_AGB_TO_DADDR(mp, agno, agbno); - if (flags & BULKSTAT_FG_QUICK) { - dip = (xfs_dinode_t *)xfs_buf_offset(bp, - (clustidx << mp->m_sb.sb_inodelog)); - - if (INT_GET(dip->di_core.di_magic, ARCH_CONVERT) - != XFS_DINODE_MAGIC - || !XFS_DINODE_GOOD_VERSION( - INT_GET(dip->di_core.di_version, ARCH_CONVERT))) - continue; + if (!xfs_bulkstat_use_dinode(mp, flags, bp, + clustidx, &dip)) + continue; + /* + * If we need to do an iget, cannot hold bp. + * Drop it, until starting the next cluster. + */ + if ((flags & BULKSTAT_FG_INLINE) && !dip) { + if (bp) + xfs_buf_relse(bp); + bp = NULL; } /* * Get the inode and fill in a single buffer. * BULKSTAT_FG_QUICK uses dip to fill it in. * BULKSTAT_FG_IGET uses igets. + * BULKSTAT_FG_INLINE uses dip if we have an + * inline attr fork, else igets. * See: xfs_bulkstat_one & xfs_dm_bulkstat_one. * This is also used to count inodes/blks, etc * in xfs_qm_quotacheck. @@ -597,8 +640,15 @@ xfs_bulkstat( ubleft, private_data, bno, &ubused, dip, &fmterror); if (fmterror == BULKSTAT_RV_NOTHING) { - if (error == ENOMEM) + if (error == EFAULT) { + ubleft = 0; + rval = error; + break; + } + else if (error == ENOMEM) ubleft = 0; + else + lastino = ino; continue; } if (fmterror == BULKSTAT_RV_GIVEUP) { @@ -633,7 +683,7 @@ xfs_bulkstat( /* * Done, we're either out of filesystem or space to put the data. */ - kmem_free(irbuf, NBPC); + kmem_free(irbuf, irbsize); *ubcountp = ubelem; if (agno >= mp->m_sb.sb_agcount) { /* diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h index be5f12e07d22..f25a28862a17 100644 --- a/fs/xfs/xfs_itable.h +++ b/fs/xfs/xfs_itable.h @@ -36,15 +36,16 @@ typedef int (*bulkstat_one_pf)(struct xfs_mount *mp, /* * Values for stat return value. */ -#define BULKSTAT_RV_NOTHING 0 -#define BULKSTAT_RV_DIDONE 1 -#define BULKSTAT_RV_GIVEUP 2 +#define BULKSTAT_RV_NOTHING 0 +#define BULKSTAT_RV_DIDONE 1 +#define BULKSTAT_RV_GIVEUP 2 /* * Values for bulkstat flag argument. */ -#define BULKSTAT_FG_IGET 0x1 /* Go through the buffer cache */ -#define BULKSTAT_FG_QUICK 0x2 /* No iget, walk the dinode cluster */ +#define BULKSTAT_FG_IGET 0x1 /* Go through the buffer cache */ +#define BULKSTAT_FG_QUICK 0x2 /* No iget, walk the dinode cluster */ +#define BULKSTAT_FG_INLINE 0x4 /* No iget if inline attrs */ /* * Return stat information in bulk (by-inode) for the filesystem. @@ -80,6 +81,11 @@ xfs_bulkstat_one( void *dibuff, int *stat); +int +xfs_internal_inum( + xfs_mount_t *mp, + xfs_ino_t ino); + int /* error status */ xfs_inumbers( xfs_mount_t *mp, /* mount point for filesystem */ diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 21ac1a67e3e0..c48bf61f17bd 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -617,7 +617,8 @@ xfs_log_unmount_write(xfs_mount_t *mp) reg[0].i_len = sizeof(magic); XLOG_VEC_SET_TYPE(®[0], XLOG_REG_TYPE_UNMOUNT); - error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0, 0); + error = xfs_log_reserve(mp, 600, 1, &tic, + XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE); if (!error) { /* remove inited flag */ ((xlog_ticket_t *)tic)->t_flags = 0; @@ -655,8 +656,11 @@ xfs_log_unmount_write(xfs_mount_t *mp) } else { LOG_UNLOCK(log, s); } - if (tic) + if (tic) { + xlog_trace_loggrant(log, tic, "unmount rec"); + xlog_ungrant_log_space(log, tic); xlog_state_put_ticket(log, tic); + } } else { /* * We're already in forced_shutdown mode, couldn't @@ -1196,7 +1200,7 @@ xlog_alloc_log(xfs_mount_t *mp, kmem_zalloc(sizeof(xlog_in_core_t), KM_SLEEP); iclog = *iclogp; iclog->hic_data = (xlog_in_core_2_t *) - kmem_zalloc(iclogsize, KM_SLEEP); + kmem_zalloc(iclogsize, KM_SLEEP | KM_LARGE); iclog->ic_prev = prev_iclog; prev_iclog = iclog; @@ -2212,9 +2216,13 @@ xlog_state_do_callback( iclog = iclog->ic_next; } while (first_iclog != iclog); - if (repeats && (repeats % 10) == 0) { + + if (repeats > 5000) { + flushcnt += repeats; + repeats = 0; xfs_fs_cmn_err(CE_WARN, log->l_mp, - "xlog_state_do_callback: looping %d", repeats); + "%s: possible infinite loop (%d iterations)", + __FUNCTION__, flushcnt); } } while (!ioerrors && loopdidcallbacks); @@ -2246,6 +2254,7 @@ xlog_state_do_callback( } #endif + flushcnt = 0; if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR)) { flushcnt = log->l_flushcnt; log->l_flushcnt = 0; diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index eacb3d4987f2..ebbe93f4f97b 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -48,16 +48,10 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2) */ /* - * Flags to xfs_log_mount - */ -#define XFS_LOG_RECOVER 0x1 - -/* * Flags to xfs_log_done() */ #define XFS_LOG_REL_PERM_RESERV 0x1 - /* * Flags to xfs_log_reserve() * @@ -70,8 +64,6 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2) #define XFS_LOG_SLEEP 0x0 #define XFS_LOG_NOSLEEP 0x1 #define XFS_LOG_PERM_RESERV 0x2 -#define XFS_LOG_RESV_ALL (XFS_LOG_NOSLEEP|XFS_LOG_PERM_RESERV) - /* * Flags to xfs_log_force() diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 34bcbf50789c..9bd3cdf11a87 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -32,7 +32,6 @@ struct xfs_mount; #define XLOG_MIN_ICLOGS 2 #define XLOG_MED_ICLOGS 4 #define XLOG_MAX_ICLOGS 8 -#define XLOG_CALLBACK_SIZE 10 #define XLOG_HEADER_MAGIC_NUM 0xFEEDbabe /* Invalid cycle number */ #define XLOG_VERSION_1 1 #define XLOG_VERSION_2 2 /* Large IClogs, Log sunit */ @@ -149,9 +148,6 @@ struct xfs_mount; #define XLOG_WAS_CONT_TRANS 0x08 /* Cont this trans into new region */ #define XLOG_END_TRANS 0x10 /* End a continued transaction */ #define XLOG_UNMOUNT_TRANS 0x20 /* Unmount a filesystem transaction */ -#define XLOG_SKIP_TRANS (XLOG_COMMIT_TRANS | XLOG_CONTINUE_TRANS | \ - XLOG_WAS_CONT_TRANS | XLOG_END_TRANS | \ - XLOG_UNMOUNT_TRANS) #ifdef __KERNEL__ /* @@ -506,6 +502,12 @@ extern int xlog_bread(xlog_t *, xfs_daddr_t, int, struct xfs_buf *); #define XLOG_TRACE_SLEEP_FLUSH 3 #define XLOG_TRACE_WAKE_FLUSH 4 +/* + * Unmount record type is used as a pseudo transaction type for the ticket. + * It's value must be outside the range of XFS_TRANS_* values. + */ +#define XLOG_UNMOUNT_REC_TYPE (-1U) + #endif /* __KERNEL__ */ #endif /* __XFS_LOG_PRIV_H__ */ diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index b2bd4be4200a..e5f396ff9a3d 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -331,7 +331,7 @@ typedef struct xfs_mount { xfs_agnumber_t m_agirotor; /* last ag dir inode alloced */ lock_t m_agirotor_lock;/* .. and lock protecting it */ xfs_agnumber_t m_maxagi; /* highest inode alloc group */ - uint m_ihsize; /* size of next field */ + size_t m_ihsize; /* size of next field */ struct xfs_ihash *m_ihash; /* fs private inode hash table*/ struct xfs_inode *m_inodes; /* active inode list */ struct list_head m_del_inodes; /* inodes to reclaim */ @@ -541,7 +541,8 @@ static inline xfs_mount_t *xfs_bhvtom(bhv_desc_t *bdp) #define XFS_VFSTOM(vfs) xfs_vfstom(vfs) static inline xfs_mount_t *xfs_vfstom(bhv_vfs_t *vfs) { - return XFS_BHVTOM(bhv_lookup(VFS_BHVHEAD(vfs), &xfs_vfsops)); + return XFS_BHVTOM(bhv_lookup_range(VFS_BHVHEAD(vfs), + VFS_POSITION_XFS, VFS_POSITION_XFS)); } #define XFS_DADDR_TO_AGNO(mp,d) xfs_daddr_to_agno(mp,d) diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index acb853b33ebb..9dcb32aa4e2e 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -281,8 +281,6 @@ typedef struct xfs_qoff_logformat { XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\ XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD|\ XFS_GQUOTA_ACCT) -#define XFS_MOUNT_QUOTA_MASK (XFS_MOUNT_QUOTA_ALL | XFS_UQUOTA_ACTIVE | \ - XFS_GQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE) /* diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 5a0b678956e0..880c73271c05 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1948,7 +1948,7 @@ xfs_growfs_rt( */ nrextents = nrblocks; do_div(nrextents, in->extsize); - nrbmblocks = roundup_64(nrextents, NBBY * sbp->sb_blocksize); + nrbmblocks = howmany_64(nrextents, NBBY * sbp->sb_blocksize); nrextslog = xfs_highbit32(nrextents); nrsumlevels = nrextslog + 1; nrsumsize = (uint)sizeof(xfs_suminfo_t) * nrsumlevels * nrbmblocks; @@ -1976,7 +1976,10 @@ xfs_growfs_rt( if ((error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks, mp->m_sb.sb_rsumino))) return error; - nmp = NULL; + /* + * Allocate a new (fake) mount/sb. + */ + nmp = kmem_alloc(sizeof(*nmp), KM_SLEEP); /* * Loop over the bitmap blocks. * We will do everything one bitmap block at a time. @@ -1987,10 +1990,6 @@ xfs_growfs_rt( ((sbp->sb_rextents & ((1 << mp->m_blkbit_log) - 1)) != 0); bmbno < nrbmblocks; bmbno++) { - /* - * Allocate a new (fake) mount/sb. - */ - nmp = kmem_alloc(sizeof(*nmp), KM_SLEEP); *nmp = *mp; nsbp = &nmp->m_sb; /* @@ -2018,13 +2017,13 @@ xfs_growfs_rt( cancelflags = 0; if ((error = xfs_trans_reserve(tp, 0, XFS_GROWRTFREE_LOG_RES(nmp), 0, 0, 0))) - goto error_exit; + break; /* * Lock out other callers by grabbing the bitmap inode lock. */ if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, XFS_ILOCK_EXCL, &ip))) - goto error_exit; + break; ASSERT(ip == mp->m_rbmip); /* * Update the bitmap inode's size. @@ -2038,7 +2037,7 @@ xfs_growfs_rt( */ if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rsumino, 0, XFS_ILOCK_EXCL, &ip))) - goto error_exit; + break; ASSERT(ip == mp->m_rsumip); /* * Update the summary inode's size. @@ -2053,7 +2052,7 @@ xfs_growfs_rt( mp->m_rsumlevels != nmp->m_rsumlevels) { error = xfs_rtcopy_summary(mp, nmp, tp); if (error) - goto error_exit; + break; } /* * Update superblock fields. @@ -2080,18 +2079,13 @@ xfs_growfs_rt( error = xfs_rtfree_range(nmp, tp, sbp->sb_rextents, nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno); if (error) - goto error_exit; + break; /* * Mark more blocks free in the superblock. */ xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, nsbp->sb_rextents - sbp->sb_rextents); /* - * Free the fake mp structure. - */ - kmem_free(nmp, sizeof(*nmp)); - nmp = NULL; - /* * Update mp values into the real mp structure. */ mp->m_rsumlevels = nrsumlevels; @@ -2101,15 +2095,15 @@ xfs_growfs_rt( */ xfs_trans_commit(tp, 0, NULL); } - return 0; + + if (error) + xfs_trans_cancel(tp, cancelflags); /* - * Error paths come here. + * Free the fake mp structure. */ -error_exit: - if (nmp) - kmem_free(nmp, sizeof(*nmp)); - xfs_trans_cancel(tp, cancelflags); + kmem_free(nmp, sizeof(*nmp)); + return error; } diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h index bf168a91ddb8..467854b45c8f 100644 --- a/fs/xfs/xfs_sb.h +++ b/fs/xfs/xfs_sb.h @@ -60,10 +60,6 @@ struct xfs_mount; XFS_SB_VERSION_LOGV2BIT | \ XFS_SB_VERSION_SECTORBIT | \ XFS_SB_VERSION_MOREBITSBIT) -#define XFS_SB_VERSION_OKSASHBITS \ - (XFS_SB_VERSION_NUMBITS | \ - XFS_SB_VERSION_REALFBITS | \ - XFS_SB_VERSION_OKSASHFBITS) #define XFS_SB_VERSION_OKREALBITS \ (XFS_SB_VERSION_NUMBITS | \ XFS_SB_VERSION_OKREALFBITS | \ @@ -81,9 +77,6 @@ struct xfs_mount; #define XFS_SB_VERSION2_RESERVED2BIT 0x00000002 #define XFS_SB_VERSION2_RESERVED4BIT 0x00000004 #define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */ -#define XFS_SB_VERSION2_SASHFBITS 0xff000000 /* Mask: features that - require changing - PROM and SASH */ #define XFS_SB_VERSION2_OKREALFBITS \ (XFS_SB_VERSION2_ATTR2BIT) @@ -238,12 +231,6 @@ static inline int xfs_sb_good_version(xfs_sb_t *sbp) } #endif /* __KERNEL__ */ -#define XFS_SB_GOOD_SASH_VERSION(sbp) \ - ((((sbp)->sb_versionnum >= XFS_SB_VERSION_1) && \ - ((sbp)->sb_versionnum <= XFS_SB_VERSION_3)) || \ - ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ - !((sbp)->sb_versionnum & ~XFS_SB_VERSION_OKSASHBITS))) - #define XFS_SB_VERSION_TONEW(v) xfs_sb_version_tonew(v) static inline unsigned xfs_sb_version_tonew(unsigned v) { @@ -461,15 +448,6 @@ static inline void xfs_sb_version_addattr2(xfs_sb_t *sbp) * File system sector to basic block conversions. */ #define XFS_FSS_TO_BB(mp,sec) ((sec) << (mp)->m_sectbb_log) -#define XFS_BB_TO_FSS(mp,bb) \ - (((bb) + (XFS_FSS_TO_BB(mp,1) - 1)) >> (mp)->m_sectbb_log) -#define XFS_BB_TO_FSST(mp,bb) ((bb) >> (mp)->m_sectbb_log) - -/* - * File system sector to byte conversions. - */ -#define XFS_FSS_TO_B(mp,sectno) ((xfs_fsize_t)(sectno) << (mp)->m_sb.sb_sectlog) -#define XFS_B_TO_FSST(mp,b) (((__uint64_t)(b)) >> (mp)->m_sb.sb_sectlog) /* * File system block to basic block conversions. diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 9dc88b380608..c68e00105d23 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -149,7 +149,6 @@ typedef struct xfs_item_ops { void (*iop_unlock)(xfs_log_item_t *); xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t); void (*iop_push)(xfs_log_item_t *); - void (*iop_abort)(xfs_log_item_t *); void (*iop_pushbuf)(xfs_log_item_t *); void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); } xfs_item_ops_t; @@ -163,7 +162,6 @@ typedef struct xfs_item_ops { #define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip) #define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn) #define IOP_PUSH(ip) (*(ip)->li_ops->iop_push)(ip) -#define IOP_ABORT(ip) (*(ip)->li_ops->iop_abort)(ip) #define IOP_PUSHBUF(ip) (*(ip)->li_ops->iop_pushbuf)(ip) #define IOP_COMMITTING(ip, lsn) (*(ip)->li_ops->iop_committing)(ip, lsn) diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 558c87ff0c41..fc39b166d403 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -276,7 +276,7 @@ xfs_trans_update_ail( xfs_mount_t *mp, xfs_log_item_t *lip, xfs_lsn_t lsn, - unsigned long s) + unsigned long s) __releases(mp->m_ail_lock) { xfs_ail_entry_t *ailp; xfs_log_item_t *dlip=NULL; @@ -328,7 +328,7 @@ void xfs_trans_delete_ail( xfs_mount_t *mp, xfs_log_item_t *lip, - unsigned long s) + unsigned long s) __releases(mp->m_ail_lock) { xfs_ail_entry_t *ailp; xfs_log_item_t *dlip; diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 13edab8a9e94..447ac4308c91 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -46,11 +46,13 @@ xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp, /* * From xfs_trans_ail.c */ -void xfs_trans_update_ail(struct xfs_mount *, - struct xfs_log_item *, xfs_lsn_t, - unsigned long); -void xfs_trans_delete_ail(struct xfs_mount *, - struct xfs_log_item *, unsigned long); +void xfs_trans_update_ail(struct xfs_mount *mp, + struct xfs_log_item *lip, xfs_lsn_t lsn, + unsigned long s) + __releases(mp->m_ail_lock); +void xfs_trans_delete_ail(struct xfs_mount *mp, + struct xfs_log_item *lip, unsigned long s) + __releases(mp->m_ail_lock); struct xfs_log_item *xfs_trans_first_ail(struct xfs_mount *, int *); struct xfs_log_item *xfs_trans_next_ail(struct xfs_mount *, struct xfs_log_item *, int *, int *); diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c index a34796e57afb..62336a4cc5a4 100644 --- a/fs/xfs/xfs_vfsops.c +++ b/fs/xfs/xfs_vfsops.c @@ -1922,7 +1922,7 @@ xfs_showargs( } if (mp->m_flags & XFS_MOUNT_IHASHSIZE) - seq_printf(m, "," MNTOPT_IHASHSIZE "=%d", mp->m_ihsize); + seq_printf(m, "," MNTOPT_IHASHSIZE "=%d", (int)mp->m_ihsize); if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) seq_printf(m, "," MNTOPT_ALLOCSIZE "=%dk", diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 23cfa5837728..061e2ffdd1de 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -2366,10 +2366,15 @@ xfs_remove( namelen = VNAMELEN(dentry); + if (!xfs_get_dir_entry(dentry, &ip)) { + dm_di_mode = ip->i_d.di_mode; + IRELE(ip); + } + if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_REMOVE)) { error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dir_vp, DM_RIGHT_NULL, NULL, DM_RIGHT_NULL, - name, NULL, 0, 0, 0); + name, NULL, dm_di_mode, 0, 0); if (error) return error; } @@ -2995,7 +3000,7 @@ xfs_rmdir( int cancel_flags; int committed; bhv_vnode_t *dir_vp; - int dm_di_mode = 0; + int dm_di_mode = S_IFDIR; int last_cdp_link; int namelen; uint resblks; @@ -3010,11 +3015,16 @@ xfs_rmdir( return XFS_ERROR(EIO); namelen = VNAMELEN(dentry); + if (!xfs_get_dir_entry(dentry, &cdp)) { + dm_di_mode = cdp->i_d.di_mode; + IRELE(cdp); + } + if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_REMOVE)) { error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dir_vp, DM_RIGHT_NULL, NULL, DM_RIGHT_NULL, - name, NULL, 0, 0, 0); + name, NULL, dm_di_mode, 0, 0); if (error) return XFS_ERROR(error); } @@ -3834,7 +3844,9 @@ xfs_reclaim( XFS_MOUNT_ILOCK(mp); vn_bhv_remove(VN_BHV_HEAD(vp), XFS_ITOBHV(ip)); list_add_tail(&ip->i_reclaim, &mp->m_del_inodes); + spin_lock(&ip->i_flags_lock); ip->i_flags |= XFS_IRECLAIMABLE; + spin_unlock(&ip->i_flags_lock); XFS_MOUNT_IUNLOCK(mp); } return 0; @@ -3859,8 +3871,10 @@ xfs_finish_reclaim( * us. */ write_lock(&ih->ih_lock); + spin_lock(&ip->i_flags_lock); if ((ip->i_flags & XFS_IRECLAIM) || (!(ip->i_flags & XFS_IRECLAIMABLE) && vp == NULL)) { + spin_unlock(&ip->i_flags_lock); write_unlock(&ih->ih_lock); if (locked) { xfs_ifunlock(ip); @@ -3869,6 +3883,7 @@ xfs_finish_reclaim( return 1; } ip->i_flags |= XFS_IRECLAIM; + spin_unlock(&ip->i_flags_lock); write_unlock(&ih->ih_lock); /* @@ -4272,7 +4287,7 @@ xfs_free_file_space( xfs_mount_t *mp; int nimap; uint resblks; - int rounding; + uint rounding; int rt; xfs_fileoff_t startoffset_fsb; xfs_trans_t *tp; @@ -4313,8 +4328,7 @@ xfs_free_file_space( vn_iowait(vp); /* wait for the completion of any pending DIOs */ } - rounding = MAX((__uint8_t)(1 << mp->m_sb.sb_blocklog), - (__uint8_t)NBPP); + rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, NBPP); ilen = len + (offset & (rounding - 1)); ioffset = offset & ~(rounding - 1); if (ilen & (rounding - 1)) |