From 7d35079f8277b653d6a3075eea9edd4dbf7c2b29 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Jul 2017 11:49:46 -0700 Subject: kernfs: use idr instead of ida to manage inode number kernfs uses ida to manage inode number. The problem is we can't get kernfs_node from inode number with ida. Switching to use idr, next patch will add an API to get kernfs_node from inode number. Acked-by: Tejun Heo Acked-by: Greg Kroah-Hartman Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- fs/kernfs/dir.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'fs/kernfs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index db5900aaa55a..8ad7a17895fe 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -21,6 +21,7 @@ DEFINE_MUTEX(kernfs_mutex); static DEFINE_SPINLOCK(kernfs_rename_lock); /* kn->parent and ->name */ static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by rename_lock */ +static DEFINE_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */ #define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb) @@ -533,7 +534,9 @@ void kernfs_put(struct kernfs_node *kn) simple_xattrs_free(&kn->iattr->xattrs); } kfree(kn->iattr); - ida_simple_remove(&root->ino_ida, kn->ino); + spin_lock(&kernfs_idr_lock); + idr_remove(&root->ino_idr, kn->ino); + spin_unlock(&kernfs_idr_lock); kmem_cache_free(kernfs_node_cache, kn); kn = parent; @@ -542,7 +545,7 @@ void kernfs_put(struct kernfs_node *kn) goto repeat; } else { /* just released the root kn, free @root too */ - ida_destroy(&root->ino_ida); + idr_destroy(&root->ino_idr); kfree(root); } } @@ -630,7 +633,11 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, if (!kn) goto err_out1; - ret = ida_simple_get(&root->ino_ida, 1, 0, GFP_KERNEL); + idr_preload(GFP_KERNEL); + spin_lock(&kernfs_idr_lock); + ret = idr_alloc(&root->ino_idr, kn, 1, 0, GFP_ATOMIC); + spin_unlock(&kernfs_idr_lock); + idr_preload_end(); if (ret < 0) goto err_out2; kn->ino = ret; @@ -875,13 +882,13 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, if (!root) return ERR_PTR(-ENOMEM); - ida_init(&root->ino_ida); + idr_init(&root->ino_idr); INIT_LIST_HEAD(&root->supers); kn = __kernfs_new_node(root, "", S_IFDIR | S_IRUGO | S_IXUGO, KERNFS_DIR); if (!kn) { - ida_destroy(&root->ino_ida); + idr_destroy(&root->ino_idr); kfree(root); return ERR_PTR(-ENOMEM); } -- cgit v1.2.3-55-g7522 From 4a3ef68acacf31570066e69593de5cc49cc91638 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Jul 2017 11:49:47 -0700 Subject: kernfs: implement i_generation Set i_generation for kernfs inode. This is required to implement exportfs operations. The generation is 32-bit, so it's possible the generation wraps up and we find stale files. To reduce the posssibility, we don't reuse inode numer immediately. When the inode number allocation wraps, we increase generation number. In this way generation/inode number consist of a 64-bit number which is unlikely duplicated. This does make the idr tree more sparse and waste some memory. Since idr manages 32-bit keys, idr uses a 6-level radix tree, each level covers 6 bits of the key. In a 100k inode kernfs, the worst case will have around 300k radix tree node. Each node is 576bytes, so the tree will use about ~150M memory. Sounds not too bad, if this really is a problem, we should find better data structure. Acked-by: Tejun Heo Acked-by: Greg Kroah-Hartman Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- fs/kernfs/dir.c | 10 +++++++++- fs/kernfs/inode.c | 1 + include/linux/kernfs.h | 2 ++ 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'fs/kernfs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 8ad7a17895fe..33f711f6b86e 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -623,6 +623,8 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, unsigned flags) { struct kernfs_node *kn; + u32 gen; + int cursor; int ret; name = kstrdup_const(name, GFP_KERNEL); @@ -635,12 +637,17 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, idr_preload(GFP_KERNEL); spin_lock(&kernfs_idr_lock); - ret = idr_alloc(&root->ino_idr, kn, 1, 0, GFP_ATOMIC); + cursor = idr_get_cursor(&root->ino_idr); + ret = idr_alloc_cyclic(&root->ino_idr, kn, 1, 0, GFP_ATOMIC); + if (ret >= 0 && ret < cursor) + root->next_generation++; + gen = root->next_generation; spin_unlock(&kernfs_idr_lock); idr_preload_end(); if (ret < 0) goto err_out2; kn->ino = ret; + kn->generation = gen; atomic_set(&kn->count, 1); atomic_set(&kn->active, KN_DEACTIVATED_BIAS); @@ -884,6 +891,7 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, idr_init(&root->ino_idr); INIT_LIST_HEAD(&root->supers); + root->next_generation = 1; kn = __kernfs_new_node(root, "", S_IFDIR | S_IRUGO | S_IXUGO, KERNFS_DIR); diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index fb4b4a79a0d6..79cdae4758fb 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -220,6 +220,7 @@ static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode) inode->i_private = kn; inode->i_mapping->a_ops = &kernfs_aops; inode->i_op = &kernfs_iops; + inode->i_generation = kn->generation; set_default_inode_attr(inode, kn->mode); kernfs_refresh_inode(kn, inode); diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 5f5d602eb433..8c00d28f468a 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -135,6 +135,7 @@ struct kernfs_node { umode_t mode; unsigned int ino; struct kernfs_iattrs *iattr; + u32 generation; }; /* @@ -164,6 +165,7 @@ struct kernfs_root { /* private fields, do not use outside kernfs proper */ struct idr ino_idr; + u32 next_generation; struct kernfs_syscall_ops *syscall_ops; /* list of kernfs_super_info of this root, protected by kernfs_mutex */ -- cgit v1.2.3-55-g7522 From ba16b2846a8c6965d0d35be3968bc10f6277812d Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Jul 2017 11:49:48 -0700 Subject: kernfs: add an API to get kernfs node from inode number Add an API to get kernfs node from inode number. We will need this to implement exportfs operations. This API will be used in blktrace too later, so it should be as fast as possible. To make the API lock free, kernfs node is freed in RCU context. And we depend on kernfs_node count/ino number to filter out stale kernfs nodes. Acked-by: Tejun Heo Acked-by: Greg Kroah-Hartman Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- fs/kernfs/dir.c | 57 +++++++++++++++++++++++++++++++++++++++++++++ fs/kernfs/kernfs-internal.h | 2 ++ fs/kernfs/mount.c | 11 ++++++++- 3 files changed, 69 insertions(+), 1 deletion(-) (limited to 'fs/kernfs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 33f711f6b86e..7be37c838007 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -508,6 +508,10 @@ void kernfs_put(struct kernfs_node *kn) struct kernfs_node *parent; struct kernfs_root *root; + /* + * kernfs_node is freed with ->count 0, kernfs_find_and_get_node_by_ino + * depends on this to filter reused stale node + */ if (!kn || !atomic_dec_and_test(&kn->count)) return; root = kernfs_root(kn); @@ -649,6 +653,11 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, kn->ino = ret; kn->generation = gen; + /* + * set ino first. This barrier is paired with atomic_inc_not_zero in + * kernfs_find_and_get_node_by_ino + */ + smp_mb__before_atomic(); atomic_set(&kn->count, 1); atomic_set(&kn->active, KN_DEACTIVATED_BIAS); RB_CLEAR_NODE(&kn->rb); @@ -680,6 +689,54 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, return kn; } +/* + * kernfs_find_and_get_node_by_ino - get kernfs_node from inode number + * @root: the kernfs root + * @ino: inode number + * + * RETURNS: + * NULL on failure. Return a kernfs node with reference counter incremented + */ +struct kernfs_node *kernfs_find_and_get_node_by_ino(struct kernfs_root *root, + unsigned int ino) +{ + struct kernfs_node *kn; + + rcu_read_lock(); + kn = idr_find(&root->ino_idr, ino); + if (!kn) + goto out; + + /* + * Since kernfs_node is freed in RCU, it's possible an old node for ino + * is freed, but reused before RCU grace period. But a freed node (see + * kernfs_put) or an incompletedly initialized node (see + * __kernfs_new_node) should have 'count' 0. We can use this fact to + * filter out such node. + */ + if (!atomic_inc_not_zero(&kn->count)) { + kn = NULL; + goto out; + } + + /* + * The node could be a new node or a reused node. If it's a new node, + * we are ok. If it's reused because of RCU (because of + * SLAB_TYPESAFE_BY_RCU), the __kernfs_new_node always sets its 'ino' + * before 'count'. So if 'count' is uptodate, 'ino' should be uptodate, + * hence we can use 'ino' to filter stale node. + */ + if (kn->ino != ino) + goto out; + rcu_read_unlock(); + + return kn; +out: + rcu_read_unlock(); + kernfs_put(kn); + return NULL; +} + /** * kernfs_add_one - add kernfs_node to parent without warning * @kn: kernfs_node to be added diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 2d5144ab4251..e9c226f29828 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -98,6 +98,8 @@ int kernfs_add_one(struct kernfs_node *kn); struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, const char *name, umode_t mode, unsigned flags); +struct kernfs_node *kernfs_find_and_get_node_by_ino(struct kernfs_root *root, + unsigned int ino); /* * file.c diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index d5b149a45be1..69c48bec8a63 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -330,7 +330,16 @@ struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns) void __init kernfs_init(void) { + + /* + * the slab is freed in RCU context, so kernfs_find_and_get_node_by_ino + * can access the slab lock free. This could introduce stale nodes, + * please see how kernfs_find_and_get_node_by_ino filters out stale + * nodes. + */ kernfs_node_cache = kmem_cache_create("kernfs_node_cache", sizeof(struct kernfs_node), - 0, SLAB_PANIC, NULL); + 0, + SLAB_PANIC | SLAB_TYPESAFE_BY_RCU, + NULL); } -- cgit v1.2.3-55-g7522 From 319ba91d352a74acb47678788109a14b9b4dd4c2 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Jul 2017 11:49:49 -0700 Subject: kernfs: don't set dentry->d_fsdata When working on adding exportfs operations in kernfs, I found it's hard to initialize dentry->d_fsdata in the exportfs operations. Looks there is no way to do it without race condition. Look at the kernfs code closely, there is no point to set dentry->d_fsdata. inode->i_private already points to kernfs_node, and we can get inode from a dentry. So this patch just delete the d_fsdata usage. Acked-by: Tejun Heo Acked-by: Greg Kroah-Hartman Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- fs/kernfs/dir.c | 25 +++++++++---------------- fs/kernfs/file.c | 6 +++--- fs/kernfs/inode.c | 6 +++--- fs/kernfs/kernfs-internal.h | 7 +++++++ fs/kernfs/mount.c | 8 ++------ fs/kernfs/symlink.c | 6 +++--- 6 files changed, 27 insertions(+), 31 deletions(-) (limited to 'fs/kernfs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 7be37c838007..b61a7efceb7a 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -566,7 +566,7 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) if (d_really_is_negative(dentry)) goto out_bad_unlocked; - kn = dentry->d_fsdata; + kn = kernfs_dentry_node(dentry); mutex_lock(&kernfs_mutex); /* The kernfs node has been deactivated */ @@ -574,7 +574,7 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) goto out_bad; /* The kernfs node has been moved? */ - if (dentry->d_parent->d_fsdata != kn->parent) + if (kernfs_dentry_node(dentry->d_parent) != kn->parent) goto out_bad; /* The kernfs node has been renamed */ @@ -594,14 +594,8 @@ out_bad_unlocked: return 0; } -static void kernfs_dop_release(struct dentry *dentry) -{ - kernfs_put(dentry->d_fsdata); -} - const struct dentry_operations kernfs_dops = { .d_revalidate = kernfs_dop_revalidate, - .d_release = kernfs_dop_release, }; /** @@ -617,8 +611,9 @@ const struct dentry_operations kernfs_dops = { */ struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry) { - if (dentry->d_sb->s_op == &kernfs_sops) - return dentry->d_fsdata; + if (dentry->d_sb->s_op == &kernfs_sops && + !d_really_is_negative(dentry)) + return kernfs_dentry_node(dentry); return NULL; } @@ -1056,7 +1051,7 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir, unsigned int flags) { struct dentry *ret; - struct kernfs_node *parent = dentry->d_parent->d_fsdata; + struct kernfs_node *parent = dir->i_private; struct kernfs_node *kn; struct inode *inode; const void *ns = NULL; @@ -1073,8 +1068,6 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir, ret = NULL; goto out_unlock; } - kernfs_get(kn); - dentry->d_fsdata = kn; /* attach dentry and inode */ inode = kernfs_get_inode(dir->i_sb, kn); @@ -1111,7 +1104,7 @@ static int kernfs_iop_mkdir(struct inode *dir, struct dentry *dentry, static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry) { - struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_node *kn = kernfs_dentry_node(dentry); struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops; int ret; @@ -1131,7 +1124,7 @@ static int kernfs_iop_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { - struct kernfs_node *kn = old_dentry->d_fsdata; + struct kernfs_node *kn = kernfs_dentry_node(old_dentry); struct kernfs_node *new_parent = new_dir->i_private; struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops; int ret; @@ -1644,7 +1637,7 @@ static struct kernfs_node *kernfs_dir_next_pos(const void *ns, static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file->f_path.dentry; - struct kernfs_node *parent = dentry->d_fsdata; + struct kernfs_node *parent = kernfs_dentry_node(dentry); struct kernfs_node *pos = file->private_data; const void *ns = NULL; diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index ac2dfe0c5a9c..7f90d4de86b6 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -616,7 +616,7 @@ static void kernfs_put_open_node(struct kernfs_node *kn, static int kernfs_fop_open(struct inode *inode, struct file *file) { - struct kernfs_node *kn = file->f_path.dentry->d_fsdata; + struct kernfs_node *kn = inode->i_private; struct kernfs_root *root = kernfs_root(kn); const struct kernfs_ops *ops; struct kernfs_open_file *of; @@ -768,7 +768,7 @@ static void kernfs_release_file(struct kernfs_node *kn, static int kernfs_fop_release(struct inode *inode, struct file *filp) { - struct kernfs_node *kn = filp->f_path.dentry->d_fsdata; + struct kernfs_node *kn = inode->i_private; struct kernfs_open_file *of = kernfs_of(filp); if (kn->flags & KERNFS_HAS_RELEASE) { @@ -835,7 +835,7 @@ void kernfs_drain_open_files(struct kernfs_node *kn) static unsigned int kernfs_fop_poll(struct file *filp, poll_table *wait) { struct kernfs_open_file *of = kernfs_of(filp); - struct kernfs_node *kn = filp->f_path.dentry->d_fsdata; + struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry); struct kernfs_open_node *on = kn->attr.open; if (!kernfs_get_active(kn)) diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index 79cdae4758fb..4c8b51085a86 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -112,7 +112,7 @@ int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); - struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_node *kn = inode->i_private; int error; if (!kn) @@ -154,7 +154,7 @@ static int kernfs_node_setsecdata(struct kernfs_iattrs *attrs, void **secdata, ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size) { - struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_node *kn = kernfs_dentry_node(dentry); struct kernfs_iattrs *attrs; attrs = kernfs_iattrs(kn); @@ -203,8 +203,8 @@ static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode) int kernfs_iop_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { - struct kernfs_node *kn = path->dentry->d_fsdata; struct inode *inode = d_inode(path->dentry); + struct kernfs_node *kn = inode->i_private; mutex_lock(&kernfs_mutex); kernfs_refresh_inode(kn, inode); diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index e9c226f29828..0f260dcca177 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -70,6 +70,13 @@ struct kernfs_super_info { }; #define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info)) +static inline struct kernfs_node *kernfs_dentry_node(struct dentry *dentry) +{ + if (d_really_is_negative(dentry)) + return NULL; + return d_inode(dentry)->i_private; +} + extern const struct super_operations kernfs_sops; extern struct kmem_cache *kernfs_node_cache; diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 69c48bec8a63..acd542625fd8 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -33,7 +33,7 @@ static int kernfs_sop_remount_fs(struct super_block *sb, int *flags, char *data) static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry) { - struct kernfs_root *root = kernfs_root(dentry->d_fsdata); + struct kernfs_root *root = kernfs_root(kernfs_dentry_node(dentry)); struct kernfs_syscall_ops *scops = root->syscall_ops; if (scops && scops->show_options) @@ -43,7 +43,7 @@ static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry) static int kernfs_sop_show_path(struct seq_file *sf, struct dentry *dentry) { - struct kernfs_node *node = dentry->d_fsdata; + struct kernfs_node *node = kernfs_dentry_node(dentry); struct kernfs_root *root = kernfs_root(node); struct kernfs_syscall_ops *scops = root->syscall_ops; @@ -176,8 +176,6 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic) pr_debug("%s: could not get root dentry!\n", __func__); return -ENOMEM; } - kernfs_get(info->root->kn); - root->d_fsdata = info->root->kn; sb->s_root = root; sb->s_d_op = &kernfs_dops; return 0; @@ -283,7 +281,6 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, void kernfs_kill_sb(struct super_block *sb) { struct kernfs_super_info *info = kernfs_info(sb); - struct kernfs_node *root_kn = sb->s_root->d_fsdata; mutex_lock(&kernfs_mutex); list_del(&info->node); @@ -295,7 +292,6 @@ void kernfs_kill_sb(struct super_block *sb) */ kill_anon_super(sb); kfree(info); - kernfs_put(root_kn); } /** diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c index 1684af4a8b9b..08ccabd7047f 100644 --- a/fs/kernfs/symlink.c +++ b/fs/kernfs/symlink.c @@ -98,9 +98,9 @@ static int kernfs_get_target_path(struct kernfs_node *parent, return 0; } -static int kernfs_getlink(struct dentry *dentry, char *path) +static int kernfs_getlink(struct inode *inode, char *path) { - struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_node *kn = inode->i_private; struct kernfs_node *parent = kn->parent; struct kernfs_node *target = kn->symlink.target_kn; int error; @@ -124,7 +124,7 @@ static const char *kernfs_iop_get_link(struct dentry *dentry, body = kzalloc(PAGE_SIZE, GFP_KERNEL); if (!body) return ERR_PTR(-ENOMEM); - error = kernfs_getlink(dentry, body); + error = kernfs_getlink(inode, body); if (unlikely(error < 0)) { kfree(body); return ERR_PTR(error); -- cgit v1.2.3-55-g7522 From c53cd490b1a491ebf1d8e30da97e7231459a4208 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Jul 2017 11:49:50 -0700 Subject: kernfs: introduce kernfs_node_id inode number and generation can identify a kernfs node. We are going to export the identification by exportfs operations, so put ino and generation into a separate structure. It's convenient when later patches use the identification. Acked-by: Greg Kroah-Hartman Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- fs/kernfs/dir.c | 10 +++++----- fs/kernfs/file.c | 4 ++-- fs/kernfs/inode.c | 4 ++-- include/linux/cgroup.h | 2 +- include/linux/kernfs.h | 12 ++++++++++-- include/trace/events/writeback.h | 2 +- 6 files changed, 21 insertions(+), 13 deletions(-) (limited to 'fs/kernfs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index b61a7efceb7a..89d1dc19340b 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -539,7 +539,7 @@ void kernfs_put(struct kernfs_node *kn) } kfree(kn->iattr); spin_lock(&kernfs_idr_lock); - idr_remove(&root->ino_idr, kn->ino); + idr_remove(&root->ino_idr, kn->id.ino); spin_unlock(&kernfs_idr_lock); kmem_cache_free(kernfs_node_cache, kn); @@ -645,8 +645,8 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, idr_preload_end(); if (ret < 0) goto err_out2; - kn->ino = ret; - kn->generation = gen; + kn->id.ino = ret; + kn->id.generation = gen; /* * set ino first. This barrier is paired with atomic_inc_not_zero in @@ -721,7 +721,7 @@ struct kernfs_node *kernfs_find_and_get_node_by_ino(struct kernfs_root *root, * before 'count'. So if 'count' is uptodate, 'ino' should be uptodate, * hence we can use 'ino' to filter stale node. */ - if (kn->ino != ino) + if (kn->id.ino != ino) goto out; rcu_read_unlock(); @@ -1654,7 +1654,7 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) const char *name = pos->name; unsigned int type = dt_type(pos); int len = strlen(name); - ino_t ino = pos->ino; + ino_t ino = pos->id.ino; ctx->pos = pos->hash; file->private_data = pos; diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 7f90d4de86b6..744192539010 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -895,7 +895,7 @@ repeat: * have the matching @file available. Look up the inodes * and generate the events manually. */ - inode = ilookup(info->sb, kn->ino); + inode = ilookup(info->sb, kn->id.ino); if (!inode) continue; @@ -903,7 +903,7 @@ repeat: if (parent) { struct inode *p_inode; - p_inode = ilookup(info->sb, parent->ino); + p_inode = ilookup(info->sb, parent->id.ino); if (p_inode) { fsnotify(p_inode, FS_MODIFY | FS_EVENT_ON_CHILD, inode, FSNOTIFY_EVENT_INODE, kn->name, 0); diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index 4c8b51085a86..a34303981deb 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -220,7 +220,7 @@ static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode) inode->i_private = kn; inode->i_mapping->a_ops = &kernfs_aops; inode->i_op = &kernfs_iops; - inode->i_generation = kn->generation; + inode->i_generation = kn->id.generation; set_default_inode_attr(inode, kn->mode); kernfs_refresh_inode(kn, inode); @@ -266,7 +266,7 @@ struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn) { struct inode *inode; - inode = iget_locked(sb, kn->ino); + inode = iget_locked(sb, kn->id.ino); if (inode && (inode->i_state & I_NEW)) kernfs_init_inode(kn, inode); diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 710a005c6b7a..30c68773fd1e 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -543,7 +543,7 @@ static inline bool cgroup_is_populated(struct cgroup *cgrp) /* returns ino associated with a cgroup */ static inline ino_t cgroup_ino(struct cgroup *cgrp) { - return cgrp->kn->ino; + return cgrp->kn->id.ino; } /* cft/css accessors for cftype->write() operation */ diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 8c00d28f468a..06a0c5913e1d 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -95,6 +95,15 @@ struct kernfs_elem_attr { struct kernfs_node *notify_next; /* for kernfs_notify() */ }; +/* represent a kernfs node */ +union kernfs_node_id { + struct { + u32 ino; + u32 generation; + }; + u64 id; +}; + /* * kernfs_node - the building block of kernfs hierarchy. Each and every * kernfs node is represented by single kernfs_node. Most fields are @@ -131,11 +140,10 @@ struct kernfs_node { void *priv; + union kernfs_node_id id; unsigned short flags; umode_t mode; - unsigned int ino; struct kernfs_iattrs *iattr; - u32 generation; }; /* diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 7bd8783a590f..9b57f014d79d 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -136,7 +136,7 @@ DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode, static inline unsigned int __trace_wb_assign_cgroup(struct bdi_writeback *wb) { - return wb->memcg_css->cgroup->kn->ino; + return wb->memcg_css->cgroup->kn->id.ino; } static inline unsigned int __trace_wbc_assign_cgroup(struct writeback_control *wbc) -- cgit v1.2.3-55-g7522 From aa8188253474b4053bc2900d9fcb545ce68bdf5c Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Jul 2017 11:49:51 -0700 Subject: kernfs: add exportfs operations Now we have the facilities to implement exportfs operations. The idea is cgroup can export the fhandle info to userspace, then userspace uses fhandle to find the cgroup name. Another example is userspace can get fhandle for a cgroup and BPF uses the fhandle to filter info for the cgroup. Acked-by: Greg Kroah-Hartman Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- fs/kernfs/mount.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/kernfs.h | 12 +++++++++++ kernel/cgroup/cgroup.c | 3 ++- 3 files changed, 70 insertions(+), 1 deletion(-) (limited to 'fs/kernfs') diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index acd542625fd8..fa323589704f 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "kernfs-internal.h" @@ -64,6 +65,59 @@ const struct super_operations kernfs_sops = { .show_path = kernfs_sop_show_path, }; +static struct inode *kernfs_fh_get_inode(struct super_block *sb, + u64 ino, u32 generation) +{ + struct kernfs_super_info *info = kernfs_info(sb); + struct inode *inode; + struct kernfs_node *kn; + + if (ino == 0) + return ERR_PTR(-ESTALE); + + kn = kernfs_find_and_get_node_by_ino(info->root, ino); + if (!kn) + return ERR_PTR(-ESTALE); + inode = kernfs_get_inode(sb, kn); + kernfs_put(kn); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + if (generation && inode->i_generation != generation) { + /* we didn't find the right inode.. */ + iput(inode); + return ERR_PTR(-ESTALE); + } + return inode; +} + +static struct dentry *kernfs_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + kernfs_fh_get_inode); +} + +static struct dentry *kernfs_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + kernfs_fh_get_inode); +} + +static struct dentry *kernfs_get_parent_dentry(struct dentry *child) +{ + struct kernfs_node *kn = kernfs_dentry_node(child); + + return d_obtain_alias(kernfs_get_inode(child->d_sb, kn->parent)); +} + +static const struct export_operations kernfs_export_ops = { + .fh_to_dentry = kernfs_fh_to_dentry, + .fh_to_parent = kernfs_fh_to_parent, + .get_parent = kernfs_get_parent_dentry, +}; + /** * kernfs_root_from_sb - determine kernfs_root associated with a super_block * @sb: the super_block in question @@ -159,6 +213,8 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic) sb->s_magic = magic; sb->s_op = &kernfs_sops; sb->s_xattr = kernfs_xattr_handlers; + if (info->root->flags & KERNFS_ROOT_SUPPORT_EXPORTOP) + sb->s_export_op = &kernfs_export_ops; sb->s_time_gran = 1; /* get root inode, initialize and unlock it */ diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 06a0c5913e1d..d149361e5875 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -69,6 +69,12 @@ enum kernfs_root_flag { * following flag enables that behavior. */ KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK = 0x0002, + + /* + * The filesystem supports exportfs operation, so userspace can use + * fhandle to access nodes of the fs. + */ + KERNFS_ROOT_SUPPORT_EXPORTOP = 0x0004, }; /* type-specific structures for kernfs_node union members */ @@ -98,6 +104,12 @@ struct kernfs_elem_attr { /* represent a kernfs node */ union kernfs_node_id { struct { + /* + * blktrace will export this struct as a simplified 'struct + * fid' (which is a big data struction), so userspace can use + * it to find kernfs node. The layout must match the first two + * fields of 'struct fid' exactly. + */ u32 ino; u32 generation; }; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 620794a20a33..6cefa277f39c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1737,7 +1737,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags) &cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops; root->kf_root = kernfs_create_root(kf_sops, - KERNFS_ROOT_CREATE_DEACTIVATED, + KERNFS_ROOT_CREATE_DEACTIVATED | + KERNFS_ROOT_SUPPORT_EXPORTOP, root_cgrp); if (IS_ERR(root->kf_root)) { ret = PTR_ERR(root->kf_root); -- cgit v1.2.3-55-g7522 From 69fd5c391763bd94a40dd152bc72a7f230137150 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Jul 2017 11:49:55 -0700 Subject: blktrace: add an option to allow displaying cgroup path By default we output cgroup id in blktrace. This adds an option to display cgroup path. Since get cgroup path is a relativly heavy operation, we don't enable it by default. with the option enabled, blktrace will output something like this: dd-1353 [007] d..2 293.015252: 8,0 /test/level D R 24 + 8 [dd] Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- fs/kernfs/mount.c | 19 +++++++++++++++++++ include/linux/cgroup.h | 6 ++++++ include/linux/kernfs.h | 2 ++ kernel/cgroup/cgroup.c | 12 ++++++++++++ kernel/trace/blktrace.c | 14 +++++++++++++- 5 files changed, 52 insertions(+), 1 deletion(-) (limited to 'fs/kernfs') diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index fa323589704f..7c452f4d83e9 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -65,6 +65,25 @@ const struct super_operations kernfs_sops = { .show_path = kernfs_sop_show_path, }; +/* + * Similar to kernfs_fh_get_inode, this one gets kernfs node from inode + * number and generation + */ +struct kernfs_node *kernfs_get_node_by_id(struct kernfs_root *root, + const union kernfs_node_id *id) +{ + struct kernfs_node *kn; + + kn = kernfs_find_and_get_node_by_ino(root, id->ino); + if (!kn) + return NULL; + if (kn->id.generation != id->generation) { + kernfs_put(kn); + return NULL; + } + return kn; +} + static struct inode *kernfs_fh_get_inode(struct super_block *sb, u64 ino, u32 generation) { diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 52ef9a68ff14..6144fe923b73 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -613,6 +613,9 @@ static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp) { return &cgrp->kn->id; } + +void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, + char *buf, size_t buflen); #else /* !CONFIG_CGROUPS */ struct cgroup_subsys_state; @@ -645,6 +648,9 @@ static inline bool task_under_cgroup_hierarchy(struct task_struct *task, { return true; } + +static inline void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, + char *buf, size_t buflen) {} #endif /* !CONFIG_CGROUPS */ /* diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index d149361e5875..ab25c8b6d9e3 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -358,6 +358,8 @@ struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns); void kernfs_init(void); +struct kernfs_node *kernfs_get_node_by_id(struct kernfs_root *root, + const union kernfs_node_id *id); #else /* CONFIG_KERNFS */ static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 6cefa277f39c..2aba1c519138 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4701,6 +4701,18 @@ static int __init cgroup_wq_init(void) } core_initcall(cgroup_wq_init); +void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, + char *buf, size_t buflen) +{ + struct kernfs_node *kn; + + kn = kernfs_get_node_by_id(cgrp_dfl_root.kf_root, id); + if (!kn) + return; + kernfs_path(kn, buf, buflen); + kernfs_put(kn); +} + /* * proc_cgroup_show() * - Print task's cgroup paths into seq_file, one line for each hierarchy diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index f393d7a43695..e90974ed4532 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -48,12 +48,14 @@ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(running_trace_lock); /* Select an alternative, minimalistic output than the original one */ #define TRACE_BLK_OPT_CLASSIC 0x1 #define TRACE_BLK_OPT_CGROUP 0x2 +#define TRACE_BLK_OPT_CGNAME 0x4 static struct tracer_opt blk_tracer_opts[] = { /* Default disable the minimalistic output */ { TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) }, #ifdef CONFIG_BLK_CGROUP { TRACER_OPT(blk_cgroup, TRACE_BLK_OPT_CGROUP) }, + { TRACER_OPT(blk_cgname, TRACE_BLK_OPT_CGNAME) }, #endif { } }; @@ -1213,7 +1215,17 @@ static void blk_log_action(struct trace_iterator *iter, const char *act, if (has_cg) { const union kernfs_node_id *id = cgid_start(iter->ent); - trace_seq_printf(&iter->seq, "%3d,%-3d %x,%-x %2s %3s ", + if (blk_tracer_flags.val & TRACE_BLK_OPT_CGNAME) { + char blkcg_name_buf[NAME_MAX + 1] = "<...>"; + + cgroup_path_from_kernfs_id(id, blkcg_name_buf, + sizeof(blkcg_name_buf)); + trace_seq_printf(&iter->seq, "%3d,%-3d %s %2s %3s ", + MAJOR(t->device), MINOR(t->device), + blkcg_name_buf, act, rwbs); + } else + trace_seq_printf(&iter->seq, + "%3d,%-3d %x,%-x %2s %3s ", MAJOR(t->device), MINOR(t->device), id->ino, id->generation, act, rwbs); } else -- cgit v1.2.3-55-g7522 From ef13ecbc134d7e0ca4ab4834d08bd20885b53c62 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 30 Aug 2017 17:04:56 +0300 Subject: kernfs: checking for IS_ERR() instead of NULL The kernfs_get_inode() returns NULL on error, it never returns error pointers. Fixes: aa8188253474 ("kernfs: add exportfs operations") Acked-by: Tejun Heo Acked-by: Greg Kroah-Hartman Signed-off-by: Dan Carpenter Signed-off-by: Jens Axboe --- fs/kernfs/mount.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/kernfs') diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 7c452f4d83e9..95a7c88baed9 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -99,8 +99,8 @@ static struct inode *kernfs_fh_get_inode(struct super_block *sb, return ERR_PTR(-ESTALE); inode = kernfs_get_inode(sb, kn); kernfs_put(kn); - if (IS_ERR(inode)) - return ERR_CAST(inode); + if (!inode) + return ERR_PTR(-ESTALE); if (generation && inode->i_generation != generation) { /* we didn't find the right inode.. */ -- cgit v1.2.3-55-g7522