diff options
Diffstat (limited to 'src/kernel')
-rw-r--r-- | src/kernel/xloop_file_fmt_qcow_main.c | 33 | ||||
-rw-r--r-- | src/kernel/xloop_file_fmt_raw.c | 5 | ||||
-rw-r--r-- | src/kernel/xloop_main.c | 1197 | ||||
-rw-r--r-- | src/kernel/xloop_main.h | 46 |
4 files changed, 755 insertions, 526 deletions
diff --git a/src/kernel/xloop_file_fmt_qcow_main.c b/src/kernel/xloop_file_fmt_qcow_main.c index e642160..4fc2fe3 100644 --- a/src/kernel/xloop_file_fmt_qcow_main.c +++ b/src/kernel/xloop_file_fmt_qcow_main.c @@ -21,6 +21,7 @@ #include <linux/mutex.h> #include <linux/uio.h> #include <linux/string.h> +#include <linux/version.h> #include <linux/vmalloc.h> #include <linux/zlib.h> #ifdef CONFIG_ZSTD_DECOMPRESS @@ -941,7 +942,9 @@ static int __qcow_file_fmt_read_compressed(struct xloop_file_fmt *xlo_fmt, struc u8 *in_buf = NULL; ssize_t len; void *data; +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 15, 0) unsigned long irq_flags; +#endif int offset_in_cluster = xloop_file_fmt_qcow_offset_into_cluster(qcow_data, offset); coffset = file_cluster_offset & qcow_data->cluster_offset_mask; @@ -971,10 +974,18 @@ static int __qcow_file_fmt_read_compressed(struct xloop_file_fmt *xlo_fmt, struc } ASSERT(bytes <= bvec->bv_len); - data = bvec_kmap_irq(bvec, &irq_flags) + bytes_done; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) + data = bvec_kmap_local(bvec) + bytes_done; +#else + data = bvec_kmap_irq(bvec, &irq_flags) + bytes_done; +#endif memcpy(data, qcow_data->cmp_out_buf + offset_in_cluster, bytes); flush_dcache_page(bvec->bv_page); - bvec_kunmap_irq(data, &irq_flags); +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) + kunmap_local(data); +#else + bvec_kunmap_irq(data, &irq_flags); +#endif out_free_in_buf: vfree(in_buf); @@ -994,7 +1005,9 @@ static int __qcow_file_fmt_read_bvec(struct xloop_file_fmt *xlo_fmt, struct bio_ u64 bytes_done = 0; enum xloop_file_fmt_qcow_subcluster_type type; void *data; +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 15, 0) unsigned long irq_flags; +#endif ssize_t len; loff_t pos_read; @@ -1015,10 +1028,18 @@ static int __qcow_file_fmt_read_bvec(struct xloop_file_fmt *xlo_fmt, struct bio_ case QCOW_SUBCLUSTER_ZERO_ALLOC: case QCOW_SUBCLUSTER_UNALLOCATED_PLAIN: case QCOW_SUBCLUSTER_UNALLOCATED_ALLOC: +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) + data = bvec_kmap_local(bvec) + bytes_done; +#else data = bvec_kmap_irq(bvec, &irq_flags) + bytes_done; +#endif memset(data, 0, cur_bytes); flush_dcache_page(bvec->bv_page); +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) + kunmap_local(data); +#else bvec_kunmap_irq(data, &irq_flags); +#endif break; case QCOW_SUBCLUSTER_COMPRESSED: @@ -1031,10 +1052,18 @@ static int __qcow_file_fmt_read_bvec(struct xloop_file_fmt *xlo_fmt, struct bio_ case QCOW_SUBCLUSTER_NORMAL: pos_read = host_offset; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) + data = bvec_kmap_local(bvec) + bytes_done; +#else data = bvec_kmap_irq(bvec, &irq_flags) + bytes_done; +#endif len = kernel_read(xlo->xlo_backing_file, data, cur_bytes, &pos_read); flush_dcache_page(bvec->bv_page); +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) + kunmap_local(data); +#else bvec_kunmap_irq(data, &irq_flags); +#endif if (len < 0) return len; diff --git a/src/kernel/xloop_file_fmt_raw.c b/src/kernel/xloop_file_fmt_raw.c index 1c2c42b..2fc8ace 100644 --- a/src/kernel/xloop_file_fmt_raw.c +++ b/src/kernel/xloop_file_fmt_raw.c @@ -152,8 +152,6 @@ static void __raw_file_fmt_rw_aio_complete(struct kiocb *iocb, long ret, long re { struct xloop_cmd *cmd = container_of(iocb, struct xloop_cmd, iocb); - if (cmd->css) - css_put(cmd->css); cmd->ret = ret; __raw_file_fmt_rw_aio_do_completion(cmd); } @@ -224,8 +222,6 @@ static int __raw_file_fmt_rw_aio(struct xloop_device *xlo, struct xloop_cmd *cmd cmd->iocb.ki_complete = __raw_file_fmt_rw_aio_complete; cmd->iocb.ki_flags = IOCB_DIRECT; cmd->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0); - if (cmd->css) - kthread_associate_blkcg(cmd->css); if (rw == WRITE) ret = call_write_iter(file, &cmd->iocb, &iter); @@ -233,7 +229,6 @@ static int __raw_file_fmt_rw_aio(struct xloop_device *xlo, struct xloop_cmd *cmd ret = call_read_iter(file, &cmd->iocb, &iter); __raw_file_fmt_rw_aio_do_completion(cmd); - kthread_associate_blkcg(NULL); if (ret != -EIOCBQUEUED) cmd->iocb.ki_complete(&cmd->iocb, ret, 0); diff --git a/src/kernel/xloop_main.c b/src/kernel/xloop_main.c index 9fe0c70..9585e24 100644 --- a/src/kernel/xloop_main.c +++ b/src/kernel/xloop_main.c @@ -59,6 +59,7 @@ #include <linux/moduleparam.h> #include <linux/sched.h> #include <linux/fs.h> +#include <linux/pagemap.h> #include <linux/file.h> #include <linux/stat.h> #include <linux/errno.h> @@ -76,7 +77,6 @@ #include <linux/writeback.h> #include <linux/completion.h> #include <linux/highmem.h> -#include <linux/kthread.h> #include <linux/splice.h> #include <linux/sysfs.h> #include <linux/miscdevice.h> @@ -84,6 +84,7 @@ #include <linux/uio.h> #include <linux/ioprio.h> #include <linux/blk-cgroup.h> +#include <linux/sched/mm.h> #include <linux/version.h> #ifdef CONFIG_DEBUG_FS #include <linux/debugfs.h> @@ -96,15 +97,51 @@ #include <linux/uaccess.h> -/* define RHEL_CHECK_VERSION macro to check CentOS version */ -#if defined(RHEL_RELEASE_CODE) && defined(RHEL_RELEASE_VERSION) -#define RHEL_CHECK_VERSION(CONDITION) (CONDITION) -#else -#define RHEL_CHECK_VERSION(CONDITION) (0) -#endif +#define XLOOP_IDLE_WORKER_TIMEOUT (60 * HZ) static DEFINE_IDR(xloop_index_idr); static DEFINE_MUTEX(xloop_ctl_mutex); +static DEFINE_MUTEX(xloop_validate_mutex); + +/** + * xloop_global_lock_killable() - take locks for safe xloop_validate_file() test + * + * @xlo: struct xloop_device + * @global: true if @xlo is about to bind another "struct xloop_device", false otherwise + * + * Returns 0 on success, -EINTR otherwise. + * + * Since xloop_validate_file() traverses on other "struct xloop_device" if + * is_xloop_device() is true, we need a global lock for serializing concurrent + * xloop_configure()/xloop_change_fd()/__xloop_clr_fd() calls. + */ +static int xloop_global_lock_killable(struct xloop_device *xlo, bool global) +{ + int err; + + if (global) { + err = mutex_lock_killable(&xloop_validate_mutex); + if (err) + return err; + } + err = mutex_lock_killable(&xlo->xlo_mutex); + if (err && global) + mutex_unlock(&xloop_validate_mutex); + return err; +} + +/** + * xloop_global_unlock() - release locks taken by xloop_global_lock_killable() + * + * @xlo: struct xloop_device + * @global: true if @xlo was about to bind another "struct xloop_device", false otherwise + */ +static void xloop_global_unlock(struct xloop_device *xlo, bool global) +{ + mutex_unlock(&xlo->xlo_mutex); + if (global) + mutex_unlock(&xloop_validate_mutex); +} static int max_part; static int part_shift; @@ -115,8 +152,10 @@ struct device *xloop_device_to_dev(struct xloop_device *xlo) } EXPORT_SYMBOL(xloop_device_to_dev); -static int transfer_xor(struct xloop_device *xlo, int cmd, struct page *raw_page, unsigned int raw_off, - struct page *xloop_page, unsigned int xloop_off, int size, sector_t real_block) +static int transfer_xor(struct xloop_device *xlo, int cmd, + struct page *raw_page, unsigned raw_off, + struct page *xloop_page, unsigned xloop_off, + int size, sector_t real_block) { char *raw_buf = kmap_atomic(raw_page) + raw_off; char *xloop_buf = kmap_atomic(xloop_page) + xloop_off; @@ -151,12 +190,19 @@ static int xor_init(struct xloop_device *xlo, const struct xloop_info64 *info) static struct xloop_func_table none_funcs = { .number = XLO_CRYPT_NONE, -}; +}; -static struct xloop_func_table xor_funcs = { .number = XLO_CRYPT_XOR, .transfer = transfer_xor, .init = xor_init }; +static struct xloop_func_table xor_funcs = { + .number = XLO_CRYPT_XOR, + .transfer = transfer_xor, + .init = xor_init +}; /* xfer_funcs[0] is special - its release function is never called */ -static struct xloop_func_table *xfer_funcs[MAX_XLO_CRYPT] = { &none_funcs, &xor_funcs }; +static struct xloop_func_table *xfer_funcs[MAX_XLO_CRYPT] = { + &none_funcs, + &xor_funcs +}; static loff_t get_xloop_size(struct xloop_device *xlo, struct file *file) { @@ -169,7 +215,7 @@ static void __xloop_update_dio(struct xloop_device *xlo, bool dio) struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; unsigned short sb_bsize = 0; - unsigned int dio_align = 0; + unsigned dio_align = 0; bool use_dio; if (inode->i_sb->s_bdev) { @@ -188,8 +234,10 @@ static void __xloop_update_dio(struct xloop_device *xlo, bool dio) * of requests in sane applications should be PAGE_SIZE aligned */ if (dio) { - if (queue_logical_block_size(xlo->xlo_queue) >= sb_bsize && !(xlo->xlo_offset & dio_align) && - mapping->a_ops->direct_IO && !xlo->transfer) + if (queue_logical_block_size(xlo->xlo_queue) >= sb_bsize && + !(xlo->xlo_offset & dio_align) && + mapping->a_ops->direct_IO && + !xlo->transfer) use_dio = true; else use_dio = false; @@ -205,7 +253,7 @@ static void __xloop_update_dio(struct xloop_device *xlo, bool dio) /* * The flag of XLO_FLAGS_DIRECT_IO is handled similarly with - * XLO_FLAGS_READ_ONLY, both are set from kernel, and losetup + * XLO_FLAGS_READ_ONLY, both are set from kernel, and xlosetup * will get updated by ioctl(XLOOP_GET_STATUS) */ if (xlo->xlo_state == Xlo_bound) @@ -223,18 +271,6 @@ static void __xloop_update_dio(struct xloop_device *xlo, bool dio) } /** - * xloop_validate_block_size() - validates the passed in block size - * @bsize: size to validate - */ -static int xloop_validate_block_size(unsigned short bsize) -{ - if (bsize < 512 || bsize > PAGE_SIZE || !is_power_of_2(bsize)) - return -EINVAL; - - return 0; -} - -/** * xloop_set_size() - sets device size and notifies userspace * @xlo: struct xloop_device to set the size for * @size: new size of the xloop device @@ -244,31 +280,25 @@ static int xloop_validate_block_size(unsigned short bsize) */ static void xloop_set_size(struct xloop_device *xlo, loff_t size) { - struct block_device *bdev = xlo->xlo_device; -#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 7, 0) - sector_t capacity; -#endif + if (!set_capacity_and_notify(xlo->xlo_disk, size)) + kobject_uevent(&disk_to_dev(xlo->xlo_disk)->kobj, KOBJ_CHANGE); +} -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 0) - bd_set_nr_sectors(bdev, size); -#else - bd_set_size(bdev, size << SECTOR_SHIFT); -#endif +static inline int +xlo_do_transfer(struct xloop_device *xlo, int cmd, + struct page *rpage, unsigned roffs, + struct page *lpage, unsigned loffs, + int size, sector_t rblock) +{ + int ret; -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 0) - if (!set_capacity_revalidate_and_notify(xlo->xlo_disk, size, false)) - kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE); -#elif LINUX_VERSION_CODE >= KERNEL_VERSION(5, 7, 0) - set_capacity_revalidate_and_notify(xlo->xlo_disk, size, false); -#else - capacity = get_capacity(xlo->xlo_disk); - set_capacity(xlo->xlo_disk, size); - if (capacity != size && capacity != 0 && size != 0) { - char *envp[] = { "RESIZE=1", NULL }; + ret = xlo->transfer(xlo, cmd, rpage, roffs, lpage, loffs, size, rblock); + if (likely(!ret)) + return 0; - kobject_uevent_env(&disk_to_dev(xlo->xlo_disk)->kobj, KOBJ_CHANGE, envp); - } -#endif + pr_err_ratelimited("Transfer error at byte offset %llu, length %i.\n", + (unsigned long long)rblock << 9, size); + return ret; } static void xlo_complete_rq(struct request *rq) @@ -311,9 +341,9 @@ static int do_req_filebacked(struct xloop_device *xlo, struct request *rq) /* * xlo_write_simple and xlo_read_simple should have been covered - * by io submit style function like xlo_rw_aio(), one blocker - * is that xlo_read_simple() need to call flush_dcache_page after - * the page is written from kernel, and it isn't easy to handle + * by io submit style function like xloop_file_fmt_read_aio(), one + * blocker is that xloop_file_fmt_read() need to call flush_dcache_page + * after the page is written from kernel, and it isn't easy to handle * this in io submit style function which submits all segments * of the req at one time. And direct read IO doesn't need to * run flush_dcache_page(). @@ -343,48 +373,48 @@ static int do_req_filebacked(struct xloop_device *xlo, struct request *rq) static inline void xloop_update_dio(struct xloop_device *xlo) { - __xloop_update_dio(xlo, (xlo->xlo_backing_file->f_flags & O_DIRECT) | xlo->use_dio); + __xloop_update_dio(xlo, (xlo->xlo_backing_file->f_flags & O_DIRECT) | + xlo->use_dio); } -static void xloop_reread_partitions(struct xloop_device *xlo, struct block_device *bdev) +static void xloop_reread_partitions(struct xloop_device *xlo) { int rc; -#if (KERNEL_VERSION(5, 5, 0) <= LINUX_VERSION_CODE) || \ - RHEL_CHECK_VERSION(RHEL_RELEASE_VERSION(8, 3) <= RHEL_RELEASE_CODE) - mutex_lock(&bdev->bd_mutex); - rc = bdev_disk_changed(bdev, false); - mutex_unlock(&bdev->bd_mutex); -#else - rc = blkdev_reread_part(bdev); -#endif + mutex_lock(&xlo->xlo_disk->open_mutex); + rc = bdev_disk_changed(xlo->xlo_disk, false); + mutex_unlock(&xlo->xlo_disk->open_mutex); if (rc) - dev_warn(xloop_device_to_dev(xlo), "partition scan failed (rc=%d)\n", rc); + dev_warn(xloop_device_to_dev(xlo), + "partition scan of xloop%d (%s) failed (rc=%d)\n", + xlo->xlo_number, xlo->xlo_file_name, rc); } static inline int is_xloop_device(struct file *file) { struct inode *i = file->f_mapping->host; - return i && S_ISBLK(i->i_mode) && MAJOR(i->i_rdev) == XLOOP_MAJOR; + return i && S_ISBLK(i->i_mode) && imajor(i) == XLOOP_MAJOR; } static int xloop_validate_file(struct file *file, struct block_device *bdev) { - struct inode *inode = file->f_mapping->host; - struct file *f = file; + struct inode *inode = file->f_mapping->host; + struct file *f = file; /* Avoid recursion */ while (is_xloop_device(f)) { struct xloop_device *l; - if (f->f_mapping->host->i_bdev == bdev) + lockdep_assert_held(&xloop_validate_mutex); + if (f->f_mapping->host->i_rdev == bdev->bd_dev) return -EBADF; - l = f->f_mapping->host->i_bdev->bd_disk->private_data; + l = I_BDEV(f->f_mapping->host)->bd_disk->private_data; if (l->xlo_state != Xlo_bound) return -EINVAL; - + /* Order wrt setting xlo->xlo_backing_file in xloop_configure(). */ + rmb(); f = l->xlo_backing_file; } if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode)) @@ -400,15 +430,21 @@ static int xloop_validate_file(struct file *file, struct block_device *bdev) * This can only work if the xloop device is used read-only, and if the * new backing store is the same size and type as the old backing store. */ -static int xloop_change_fd(struct xloop_device *xlo, struct block_device *bdev, unsigned int arg) +static int xloop_change_fd(struct xloop_device *xlo, struct block_device *bdev, + unsigned int arg) { - struct file *file = NULL, *old_file; + struct file *file = fget(arg); + struct file *old_file; int error; bool partscan; + bool is_xloop; - error = mutex_lock_killable(&xloop_ctl_mutex); + if (!file) + return -EBADF; + is_xloop = is_xloop_device(file); + error = xloop_global_lock_killable(xlo, is_xloop); if (error) - return error; + goto out_putf; error = -ENXIO; if (xlo->xlo_state != Xlo_bound) goto out_err; @@ -418,11 +454,6 @@ static int xloop_change_fd(struct xloop_device *xlo, struct block_device *bdev, if (!(xlo->xlo_flags & XLO_FLAGS_READ_ONLY)) goto out_err; - error = -EBADF; - file = fget(arg); - if (!file) - goto out_err; - error = xloop_validate_file(file, bdev); if (error) goto out_err; @@ -436,35 +467,47 @@ static int xloop_change_fd(struct xloop_device *xlo, struct block_device *bdev, goto out_err; /* and ... switch */ + disk_force_media_change(xlo->xlo_disk, DISK_EVENT_MEDIA_CHANGE); blk_mq_freeze_queue(xlo->xlo_queue); mapping_set_gfp_mask(old_file->f_mapping, xlo->old_gfp_mask); xlo->xlo_backing_file = file; xlo->old_gfp_mask = mapping_gfp_mask(file->f_mapping); - mapping_set_gfp_mask(file->f_mapping, xlo->old_gfp_mask & ~(__GFP_IO | __GFP_FS)); + mapping_set_gfp_mask(file->f_mapping, + xlo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); xloop_update_dio(xlo); blk_mq_unfreeze_queue(xlo->xlo_queue); partscan = xlo->xlo_flags & XLO_FLAGS_PARTSCAN; - mutex_unlock(&xloop_ctl_mutex); + xloop_global_unlock(xlo, is_xloop); + + /* + * Flush xloop_validate_file() before fput(), for l->xlo_backing_file + * might be pointing at old_file which might be the last reference. + */ + if (!is_xloop) { + mutex_lock(&xloop_validate_mutex); + mutex_unlock(&xloop_validate_mutex); + } /* - * We must drop file reference outside of xloop_ctl_mutex as dropping - * the file ref can take bd_mutex which creates circular locking + * We must drop file reference outside of xlo_mutex as dropping + * the file ref can take open_mutex which creates circular locking * dependency. */ fput(old_file); if (partscan) - xloop_reread_partitions(xlo, bdev); + xloop_reread_partitions(xlo); return 0; out_err: - mutex_unlock(&xloop_ctl_mutex); - if (file) - fput(file); + xloop_global_unlock(xlo, is_xloop); +out_putf: + fput(file); return error; } /* xloop sysfs attributes */ -static ssize_t xloop_attr_show(struct device *dev, char *page, ssize_t (*callback)(struct xloop_device *, char *)) +static ssize_t xloop_attr_show(struct device *dev, char *page, + ssize_t (*callback)(struct xloop_device *, char *)) { struct gendisk *disk = dev_to_disk(dev); struct xloop_device *xlo = disk->private_data; @@ -472,13 +515,15 @@ static ssize_t xloop_attr_show(struct device *dev, char *page, ssize_t (*callbac return callback(xlo, page); } -#define XLOOP_ATTR_RO(_name) \ - static ssize_t xloop_attr_##_name##_show(struct xloop_device *, char *); \ - static ssize_t xloop_attr_do_show_##_name(struct device *d, struct device_attribute *attr, char *b) \ - { \ - return xloop_attr_show(d, b, xloop_attr_##_name##_show); \ - } \ - static struct device_attribute xloop_attr_##_name = __ATTR(_name, 0444, xloop_attr_do_show_##_name, NULL); +#define XLOOP_ATTR_RO(_name) \ +static ssize_t xloop_attr_##_name##_show(struct xloop_device *, char *); \ +static ssize_t xloop_attr_do_show_##_name(struct device *d, \ + struct device_attribute *attr, char *b) \ +{ \ + return xloop_attr_show(d, b, xloop_attr_##_name##_show); \ +} \ +static struct device_attribute xloop_attr_##_name = \ + __ATTR(_name, 0444, xloop_attr_do_show_##_name, NULL); static ssize_t xloop_attr_backing_file_show(struct xloop_device *xlo, char *buf) { @@ -552,26 +597,32 @@ XLOOP_ATTR_RO(partscan); XLOOP_ATTR_RO(dio); static struct attribute *xloop_attrs[] = { - &xloop_attr_backing_file.attr, &xloop_attr_file_fmt_type.attr, - &xloop_attr_offset.attr, &xloop_attr_sizelimit.attr, - &xloop_attr_autoclear.attr, &xloop_attr_partscan.attr, - &xloop_attr_dio.attr, NULL, + &xloop_attr_backing_file.attr, + &xloop_attr_file_fmt_type.attr, + &xloop_attr_offset.attr, + &xloop_attr_sizelimit.attr, + &xloop_attr_autoclear.attr, + &xloop_attr_partscan.attr, + &xloop_attr_dio.attr, + NULL, }; static struct attribute_group xloop_attribute_group = { - .name = "xloop", - .attrs = xloop_attrs, + .name = "xloop", + .attrs= xloop_attrs, }; static void xloop_sysfs_init(struct xloop_device *xlo) { - xlo->sysfs_inited = !sysfs_create_group(&disk_to_dev(xlo->xlo_disk)->kobj, &xloop_attribute_group); + xlo->sysfs_inited = !sysfs_create_group(&disk_to_dev(xlo->xlo_disk)->kobj, + &xloop_attribute_group); } static void xloop_sysfs_exit(struct xloop_device *xlo) { if (xlo->sysfs_inited) - sysfs_remove_group(&disk_to_dev(xlo->xlo_disk)->kobj, &xloop_attribute_group); + sysfs_remove_group(&disk_to_dev(xlo->xlo_disk)->kobj, + &xloop_attribute_group); } static void xloop_config_discard(struct xloop_device *xlo) @@ -589,12 +640,11 @@ static void xloop_config_discard(struct xloop_device *xlo) * file-backed xloop devices: discarded regions read back as zero. */ if (S_ISBLK(inode->i_mode) && !xlo->xlo_encrypt_key_size) { - struct request_queue *backingq; - - backingq = bdev_get_queue(inode->i_bdev); + struct request_queue *backingq = bdev_get_queue(I_BDEV(inode)); max_discard_sectors = backingq->limits.max_write_zeroes_sectors; - granularity = backingq->limits.discard_granularity ?: queue_physical_block_size(backingq); + granularity = backingq->limits.discard_granularity ?: + queue_physical_block_size(backingq); /* * We use punch hole to reclaim the free space used by the @@ -625,31 +675,100 @@ static void xloop_config_discard(struct xloop_device *xlo) q->limits.discard_alignment = 0; } -static void xloop_unprepare_queue(struct xloop_device *xlo) -{ - kthread_flush_worker(&xlo->worker); - kthread_stop(xlo->worker_task); -} +struct xloop_worker { + struct rb_node rb_node; + struct work_struct work; + struct list_head cmd_list; + struct list_head idle_list; + struct xloop_device *xlo; + struct cgroup_subsys_state *blkcg_css; + unsigned long last_ran_at; +}; + +static void xloop_workfn(struct work_struct *work); +static void xloop_rootcg_workfn(struct work_struct *work); +static void xloop_free_idle_workers(struct timer_list *timer); -static int xloop_kthread_worker_fn(void *worker_ptr) +#ifdef CONFIG_BLK_CGROUP +static inline int queue_on_root_worker(struct cgroup_subsys_state *css) { -#if (KERNEL_VERSION(5, 8, 0) <= LINUX_VERSION_CODE) || \ - RHEL_CHECK_VERSION(RHEL_RELEASE_VERSION(8, 4) <= RHEL_RELEASE_CODE) - current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO; + return !css || css == blkcg_root_css; +} #else - current->flags |= PF_LESS_THROTTLE | PF_MEMALLOC_NOIO; -#endif - return kthread_worker_fn(worker_ptr); +static inline int queue_on_root_worker(struct cgroup_subsys_state *css) +{ + return !css; } +#endif -static int xloop_prepare_queue(struct xloop_device *xlo) +static void xloop_queue_work(struct xloop_device *xlo, struct xloop_cmd *cmd) { - kthread_init_worker(&xlo->worker); - xlo->worker_task = kthread_run(xloop_kthread_worker_fn, &xlo->worker, "xloop%d", xlo->xlo_number); - if (IS_ERR(xlo->worker_task)) - return -ENOMEM; - set_user_nice(xlo->worker_task, MIN_NICE); - return 0; + struct rb_node **node = &(xlo->worker_tree.rb_node), *parent = NULL; + struct xloop_worker *cur_worker, *worker = NULL; + struct work_struct *work; + struct list_head *cmd_list; + + spin_lock_irq(&xlo->xlo_work_lock); + + if (queue_on_root_worker(cmd->blkcg_css)) + goto queue_work; + + node = &xlo->worker_tree.rb_node; + + while (*node) { + parent = *node; + cur_worker = container_of(*node, struct xloop_worker, rb_node); + if (cur_worker->blkcg_css == cmd->blkcg_css) { + worker = cur_worker; + break; + } else if ((long)cur_worker->blkcg_css < (long)cmd->blkcg_css) { + node = &(*node)->rb_left; + } else { + node = &(*node)->rb_right; + } + } + if (worker) + goto queue_work; + + worker = kzalloc(sizeof(struct xloop_worker), GFP_NOWAIT | __GFP_NOWARN); + /* + * In the event we cannot allocate a worker, just queue on the + * rootcg worker and issue the I/O as the rootcg + */ + if (!worker) { + cmd->blkcg_css = NULL; + if (cmd->memcg_css) + css_put(cmd->memcg_css); + cmd->memcg_css = NULL; + goto queue_work; + } + + worker->blkcg_css = cmd->blkcg_css; + css_get(worker->blkcg_css); + INIT_WORK(&worker->work, xloop_workfn); + INIT_LIST_HEAD(&worker->cmd_list); + INIT_LIST_HEAD(&worker->idle_list); + worker->xlo = xlo; + rb_link_node(&worker->rb_node, parent, node); + rb_insert_color(&worker->rb_node, &xlo->worker_tree); +queue_work: + if (worker) { + /* + * We need to remove from the idle list here while + * holding the lock so that the idle timer doesn't + * free the worker + */ + if (!list_empty(&worker->idle_list)) + list_del_init(&worker->idle_list); + work = &worker->work; + cmd_list = &worker->cmd_list; + } else { + work = &xlo->rootcg_work; + cmd_list = &xlo->rootcg_cmd_list; + } + list_add_tail(&cmd->list_entry, cmd_list); + queue_work(xlo->workqueue, work); + spin_unlock_irq(&xlo->xlo_work_lock); } static void xloop_update_rotational(struct xloop_device *xlo) @@ -670,7 +789,8 @@ static void xloop_update_rotational(struct xloop_device *xlo) blk_queue_flag_clear(QUEUE_FLAG_NONROT, q); } -static int xloop_release_xfer(struct xloop_device *xlo) +static int +xloop_release_xfer(struct xloop_device *xlo) { int err = 0; struct xloop_func_table *xfer = xlo->xlo_encryption; @@ -685,7 +805,9 @@ static int xloop_release_xfer(struct xloop_device *xlo) return err; } -static int xloop_init_xfer(struct xloop_device *xlo, struct xloop_func_table *xfer, const struct xloop_info64 *i) +static int +xloop_init_xfer(struct xloop_device *xlo, struct xloop_func_table *xfer, + const struct xloop_info64 *i) { int err = 0; @@ -712,13 +834,15 @@ static int xloop_init_xfer(struct xloop_device *xlo, struct xloop_func_table *xf * Configures the xloop device parameters according to the passed * in xloop_info64 configuration. */ -static int xloop_set_status_from_info(struct xloop_device *xlo, const struct xloop_info64 *info) +static int +xloop_set_status_from_info(struct xloop_device *xlo, + const struct xloop_info64 *info) { int err; struct xloop_func_table *xfer; kuid_t uid = current_uid(); - if ((unsigned int)info->xlo_encrypt_key_size > XLO_KEY_SIZE) + if ((unsigned int) info->xlo_encrypt_key_size > XLO_KEY_SIZE) return -EINVAL; err = xloop_release_xfer(xlo); @@ -744,8 +868,8 @@ static int xloop_set_status_from_info(struct xloop_device *xlo, const struct xlo xlo->xlo_sizelimit = info->xlo_sizelimit; memcpy(xlo->xlo_file_name, info->xlo_file_name, XLO_NAME_SIZE); memcpy(xlo->xlo_crypt_name, info->xlo_crypt_name, XLO_NAME_SIZE); - xlo->xlo_file_name[XLO_NAME_SIZE - 1] = 0; - xlo->xlo_crypt_name[XLO_NAME_SIZE - 1] = 0; + xlo->xlo_file_name[XLO_NAME_SIZE-1] = 0; + xlo->xlo_crypt_name[XLO_NAME_SIZE-1] = 0; if (!xfer) xfer = &none_funcs; @@ -758,60 +882,45 @@ static int xloop_set_status_from_info(struct xloop_device *xlo, const struct xlo xlo->xlo_init[0] = info->xlo_init[0]; xlo->xlo_init[1] = info->xlo_init[1]; if (info->xlo_encrypt_key_size) { - memcpy(xlo->xlo_encrypt_key, info->xlo_encrypt_key, info->xlo_encrypt_key_size); + memcpy(xlo->xlo_encrypt_key, info->xlo_encrypt_key, + info->xlo_encrypt_key_size); xlo->xlo_key_owner = uid; } return 0; } -static int xloop_configure(struct xloop_device *xlo, fmode_t mode, struct block_device *bdev, - const struct xloop_config *config) +static int xloop_configure(struct xloop_device *xlo, fmode_t mode, + struct block_device *bdev, + const struct xloop_config *config) { - struct file *file; + struct file *file = fget(config->fd); struct inode *inode; struct address_space *mapping; -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 2, 0) - struct block_device *claimed_bdev = NULL; -#endif int error; loff_t size; bool partscan; unsigned short bsize; + bool is_xloop; + + if (!file) + return -EBADF; + is_xloop = is_xloop_device(file); /* This is safe, since we have a reference from open(). */ __module_get(THIS_MODULE); - error = -EBADF; - file = fget(config->fd); - if (!file) - goto out; - /* * If we don't hold exclusive handle for the device, upgrade to it * here to avoid changing device under exclusive owner. */ if (!(mode & FMODE_EXCL)) { -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9, 0) - claimed_bdev = bdev->bd_contains; - error = bd_prepare_to_claim(bdev, claimed_bdev, xloop_configure); - if (error) - goto out_putf; -#elif LINUX_VERSION_CODE >= KERNEL_VERSION(5, 2, 0) - claimed_bdev = bd_start_claiming(bdev, xloop_configure); - if (IS_ERR(claimed_bdev)) { - error = PTR_ERR(claimed_bdev); - goto out_putf; - } -#else - bdgrab(bdev); - error = blkdev_get(bdev, mode | FMODE_EXCL, xloop_configure); + error = bd_prepare_to_claim(bdev, xloop_configure); if (error) goto out_putf; -#endif } - error = mutex_lock_killable(&xloop_ctl_mutex); + error = xloop_global_lock_killable(xlo, is_xloop); if (error) goto out_bdev; @@ -832,7 +941,7 @@ static int xloop_configure(struct xloop_device *xlo, fmode_t mode, struct block_ } if (config->block_size) { - error = xloop_validate_block_size(config->block_size); + error = blk_validate_block_size(config->block_size); if (error) goto out_unlock; } @@ -841,20 +950,33 @@ static int xloop_configure(struct xloop_device *xlo, fmode_t mode, struct block_ if (error) goto out_unlock; - if (!(file->f_mode & FMODE_WRITE) || !(mode & FMODE_WRITE) || !file->f_op->write_iter) + if (!(file->f_mode & FMODE_WRITE) || !(mode & FMODE_WRITE) || + !file->f_op->write_iter) xlo->xlo_flags |= XLO_FLAGS_READ_ONLY; - error = xloop_prepare_queue(xlo); - if (error) + xlo->workqueue = alloc_workqueue("xloop%d", + WQ_UNBOUND | WQ_FREEZABLE, + 0, + xlo->xlo_number); + if (!xlo->workqueue) { + error = -ENOMEM; goto out_unlock; + } - set_device_ro(bdev, (xlo->xlo_flags & XLO_FLAGS_READ_ONLY) != 0); + disk_force_media_change(xlo->xlo_disk, DISK_EVENT_MEDIA_CHANGE); + set_disk_ro(xlo->xlo_disk, (xlo->xlo_flags & XLO_FLAGS_READ_ONLY) != 0); + INIT_WORK(&xlo->rootcg_work, xloop_rootcg_workfn); + INIT_LIST_HEAD(&xlo->rootcg_cmd_list); + INIT_LIST_HEAD(&xlo->idle_worker_list); + xlo->worker_tree = RB_ROOT; + timer_setup(&xlo->timer, xloop_free_idle_workers, + TIMER_DEFERRABLE); xlo->use_dio = xlo->xlo_flags & XLO_FLAGS_DIRECT_IO; xlo->xlo_device = bdev; xlo->xlo_backing_file = file; xlo->old_gfp_mask = mapping_gfp_mask(mapping); - mapping_set_gfp_mask(mapping, xlo->old_gfp_mask & ~(__GFP_IO | __GFP_FS)); + mapping_set_gfp_mask(mapping, xlo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); error = xloop_file_fmt_init(xlo->xlo_fmt, config->info.xlo_file_fmt_type); if (error) @@ -875,6 +997,7 @@ static int xloop_configure(struct xloop_device *xlo, fmode_t mode, struct block_ blk_queue_physical_block_size(xlo->xlo_queue, bsize); blk_queue_io_min(xlo->xlo_queue, bsize); + xloop_config_discard(xlo); xloop_update_rotational(xlo); xloop_update_dio(xlo); xloop_sysfs_init(xlo); @@ -882,7 +1005,8 @@ static int xloop_configure(struct xloop_device *xlo, fmode_t mode, struct block_ size = get_xloop_size(xlo, file); xloop_set_size(xlo, size); - set_blocksize(bdev, S_ISBLK(inode->i_mode) ? block_size(inode->i_bdev) : PAGE_SIZE); + /* Order wrt reading xlo_state in xloop_validate_file(). */ + wmb(); xlo->xlo_state = Xlo_bound; if (part_shift) @@ -891,35 +1015,20 @@ static int xloop_configure(struct xloop_device *xlo, fmode_t mode, struct block_ if (partscan) xlo->xlo_disk->flags &= ~GENHD_FL_NO_PART_SCAN; - /* Grab the block_device to prevent its destruction after we - * put /dev/xloopXX inode. Later in __xloop_clr_fd() we bdput(bdev). - */ - bdgrab(bdev); - mutex_unlock(&xloop_ctl_mutex); + xloop_global_unlock(xlo, is_xloop); if (partscan) - xloop_reread_partitions(xlo, bdev); -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 2, 0) - if (claimed_bdev) - bd_abort_claiming(bdev, claimed_bdev, xloop_configure); -#else + xloop_reread_partitions(xlo); if (!(mode & FMODE_EXCL)) - blkdev_put(bdev, mode | FMODE_EXCL); -#endif + bd_abort_claiming(bdev, xloop_configure); return 0; out_unlock: - mutex_unlock(&xloop_ctl_mutex); + xloop_global_unlock(xlo, is_xloop); out_bdev: -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 2, 0) - if (claimed_bdev) - bd_abort_claiming(bdev, claimed_bdev, xloop_configure); -#else if (!(mode & FMODE_EXCL)) - blkdev_put(bdev, mode | FMODE_EXCL); -#endif + bd_abort_claiming(bdev, xloop_configure); out_putf: fput(file); -out: /* This is safe: open() is still holding a reference. */ module_put(THIS_MODULE); return error; @@ -933,8 +1042,21 @@ static int __xloop_clr_fd(struct xloop_device *xlo, bool release) int err = 0; bool partscan = false; int xlo_number; + struct xloop_worker *pos, *worker; - mutex_lock(&xloop_ctl_mutex); + /* + * Flush xloop_configure() and xloop_change_fd(). It is acceptable for + * xloop_validate_file() to succeed, for actual clear operation has not + * started yet. + */ + mutex_lock(&xloop_validate_mutex); + mutex_unlock(&xloop_validate_mutex); + /* + * xloop_validate_file() now fails because l->xlo_state != Xlo_bound + * became visible. + */ + + mutex_lock(&xlo->xlo_mutex); if (WARN_ON_ONCE(xlo->xlo_state != Xlo_rundown)) { err = -ENXIO; goto out_unlock; @@ -946,11 +1068,26 @@ static int __xloop_clr_fd(struct xloop_device *xlo, bool release) goto out_unlock; } + if (test_bit(QUEUE_FLAG_WC, &xlo->xlo_queue->queue_flags)) + blk_queue_write_cache(xlo->xlo_queue, false, false); + /* freeze request queue during the transition */ blk_mq_freeze_queue(xlo->xlo_queue); xloop_file_fmt_exit(xlo->xlo_fmt); + destroy_workqueue(xlo->workqueue); + spin_lock_irq(&xlo->xlo_work_lock); + list_for_each_entry_safe(worker, pos, &xlo->idle_worker_list, + idle_list) { + list_del(&worker->idle_list); + rb_erase(&worker->rb_node, &xlo->worker_tree); + css_put(worker->blkcg_css); + kfree(worker); + } + spin_unlock_irq(&xlo->xlo_work_lock); + del_timer_sync(&xlo->timer); + spin_lock_irq(&xlo->xlo_lock); xlo->xlo_backing_file = NULL; spin_unlock_irq(&xlo->xlo_lock); @@ -970,18 +1107,12 @@ static int __xloop_clr_fd(struct xloop_device *xlo, bool release) blk_queue_physical_block_size(xlo->xlo_queue, 512); blk_queue_io_min(xlo->xlo_queue, 512); if (bdev) { - bdput(bdev); invalidate_bdev(bdev); bdev->bd_inode->i_mapping->wb_err = 0; } set_capacity(xlo->xlo_disk, 0); xloop_sysfs_exit(xlo); if (bdev) { -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 10, 0) - bd_set_nr_sectors(bdev, 0); -#else - bd_set_size(bdev, 0); -#endif /* let user-space know about this change */ kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE); } @@ -992,32 +1123,27 @@ static int __xloop_clr_fd(struct xloop_device *xlo, bool release) partscan = xlo->xlo_flags & XLO_FLAGS_PARTSCAN && bdev; xlo_number = xlo->xlo_number; - xloop_unprepare_queue(xlo); + disk_force_media_change(xlo->xlo_disk, DISK_EVENT_MEDIA_CHANGE); out_unlock: - mutex_unlock(&xloop_ctl_mutex); + mutex_unlock(&xlo->xlo_mutex); if (partscan) { /* - * bd_mutex has been held already in release path, so don't + * open_mutex has been held already in release path, so don't * acquire it if this function is called in such case. * * If the reread partition isn't from release path, xlo_refcnt * must be at least one and it can only become zero when the * current holder is released. */ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 5, 0) || RHEL_CHECK_VERSION(RHEL_RELEASE_VERSION(8, 3) <= RHEL_RELEASE_CODE) if (!release) - mutex_lock(&bdev->bd_mutex); - err = bdev_disk_changed(bdev, false); -#else - err = blkdev_reread_part(bdev); -#endif -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 5, 0) + mutex_lock(&xlo->xlo_disk->open_mutex); + err = bdev_disk_changed(xlo->xlo_disk, false); if (!release) - mutex_unlock(&bdev->bd_mutex); -#endif + mutex_unlock(&xlo->xlo_disk->open_mutex); if (err) - dev_warn(xloop_device_to_dev(xlo), "partition scan failed (rc=%d)\n", err); - + dev_warn(xloop_device_to_dev(xlo), + "partition scan of xloop%d failed (rc=%d)\n", + xlo_number, err); /* Device is gone, no point in returning error */ err = 0; } @@ -1031,18 +1157,17 @@ out_unlock: * protects us from all the other places trying to change the 'xlo' * device. */ - mutex_lock(&xloop_ctl_mutex); + mutex_lock(&xlo->xlo_mutex); xlo->xlo_flags = 0; if (!part_shift) xlo->xlo_disk->flags |= GENHD_FL_NO_PART_SCAN; xlo->xlo_state = Xlo_unbound; - mutex_unlock(&xloop_ctl_mutex); + mutex_unlock(&xlo->xlo_mutex); /* - * Need not hold xloop_ctl_mutex to fput backing file. - * Calling fput holding xloop_ctl_mutex triggers a circular - * lock dependency possibility warning as fput can take - * bd_mutex which is usually taken before xloop_ctl_mutex. + * Need not hold xlo_mutex to fput backing file. Calling fput holding + * xlo_mutex triggers a circular lock dependency possibility warning as + * fput can take open_mutex which is usually taken before xlo_mutex. */ if (filp) fput(filp); @@ -1053,11 +1178,11 @@ static int xloop_clr_fd(struct xloop_device *xlo) { int err; - err = mutex_lock_killable(&xloop_ctl_mutex); + err = mutex_lock_killable(&xlo->xlo_mutex); if (err) return err; if (xlo->xlo_state != Xlo_bound) { - mutex_unlock(&xloop_ctl_mutex); + mutex_unlock(&xlo->xlo_mutex); return -ENXIO; } /* @@ -1066,34 +1191,36 @@ static int xloop_clr_fd(struct xloop_device *xlo) * the last reference goes away. This stops $!~#$@ udev from * preventing teardown because it decided that it needs to run blkid on * the xloopback device whenever they appear. xfstests is notorious for - * failing tests because blkid via udev races with a losetup - * <dev>/do something like mkfs/losetup -d <dev> causing the losetup -d + * failing tests because blkid via udev races with a xlosetup + * <dev>/do something like mkfs/xlosetup -d <dev> causing the xlosetup -d * command to fail with EBUSY. */ if (atomic_read(&xlo->xlo_refcnt) > 1) { xlo->xlo_flags |= XLO_FLAGS_AUTOCLEAR; - mutex_unlock(&xloop_ctl_mutex); + mutex_unlock(&xlo->xlo_mutex); return 0; } xlo->xlo_state = Xlo_rundown; - mutex_unlock(&xloop_ctl_mutex); + mutex_unlock(&xlo->xlo_mutex); return __xloop_clr_fd(xlo, false); } -static int xloop_set_status(struct xloop_device *xlo, const struct xloop_info64 *info) +static int +xloop_set_status(struct xloop_device *xlo, const struct xloop_info64 *info) { int err; - struct block_device *bdev; kuid_t uid = current_uid(); int prev_xlo_flags; bool partscan = false; bool size_changed = false; - err = mutex_lock_killable(&xloop_ctl_mutex); + err = mutex_lock_killable(&xlo->xlo_mutex); if (err) return err; - if (xlo->xlo_encrypt_key_size && !uid_eq(xlo->xlo_key_owner, uid) && !capable(CAP_SYS_ADMIN)) { + if (xlo->xlo_encrypt_key_size && + !uid_eq(xlo->xlo_key_owner, uid) && + !capable(CAP_SYS_ADMIN)) { err = -EPERM; goto out_unlock; } @@ -1102,7 +1229,8 @@ static int xloop_set_status(struct xloop_device *xlo, const struct xloop_info64 goto out_unlock; } - if (xlo->xlo_offset != info->xlo_offset || xlo->xlo_sizelimit != info->xlo_sizelimit) { + if (xlo->xlo_offset != info->xlo_offset || + xlo->xlo_sizelimit != info->xlo_sizelimit) { size_changed = true; sync_blockdev(xlo->xlo_device); invalidate_bdev(xlo->xlo_device); @@ -1115,8 +1243,9 @@ static int xloop_set_status(struct xloop_device *xlo, const struct xloop_info64 /* If any pages were dirtied after invalidate_bdev(), try again */ err = -EAGAIN; dev_warn(xloop_device_to_dev(xlo), - "xloop device has still dirty pages (nrpages=%lu)\n", - xlo->xlo_device->bd_inode->i_mapping->nrpages); + "xloop%d (%s) has still dirty pages (nrpages=%lu)\n", + xlo->xlo_number, xlo->xlo_file_name, + xlo->xlo_device->bd_inode->i_mapping->nrpages); goto out_unfreeze; } @@ -1157,30 +1286,31 @@ static int xloop_set_status(struct xloop_device *xlo, const struct xloop_info64 out_unfreeze: blk_mq_unfreeze_queue(xlo->xlo_queue); - if (!err && (xlo->xlo_flags & XLO_FLAGS_PARTSCAN) && !(prev_xlo_flags & XLO_FLAGS_PARTSCAN)) { + if (!err && (xlo->xlo_flags & XLO_FLAGS_PARTSCAN) && + !(prev_xlo_flags & XLO_FLAGS_PARTSCAN)) { xlo->xlo_disk->flags &= ~GENHD_FL_NO_PART_SCAN; - bdev = xlo->xlo_device; partscan = true; } out_unlock: - mutex_unlock(&xloop_ctl_mutex); + mutex_unlock(&xlo->xlo_mutex); if (partscan) - xloop_reread_partitions(xlo, bdev); + xloop_reread_partitions(xlo); return err; } -static int xloop_get_status(struct xloop_device *xlo, struct xloop_info64 *info) +static int +xloop_get_status(struct xloop_device *xlo, struct xloop_info64 *info) { struct path path; struct kstat stat; int ret; - ret = mutex_lock_killable(&xloop_ctl_mutex); + ret = mutex_lock_killable(&xlo->xlo_mutex); if (ret) return ret; if (xlo->xlo_state != Xlo_bound) { - mutex_unlock(&xloop_ctl_mutex); + mutex_unlock(&xlo->xlo_mutex); return -ENXIO; } @@ -1191,16 +1321,18 @@ static int xloop_get_status(struct xloop_device *xlo, struct xloop_info64 *info) info->xlo_flags = xlo->xlo_flags; memcpy(info->xlo_file_name, xlo->xlo_file_name, XLO_NAME_SIZE); memcpy(info->xlo_crypt_name, xlo->xlo_crypt_name, XLO_NAME_SIZE); - info->xlo_encrypt_type = xlo->xlo_encryption ? xlo->xlo_encryption->number : 0; + info->xlo_encrypt_type = + xlo->xlo_encryption ? xlo->xlo_encryption->number : 0; if (xlo->xlo_encrypt_key_size && capable(CAP_SYS_ADMIN)) { info->xlo_encrypt_key_size = xlo->xlo_encrypt_key_size; - memcpy(info->xlo_encrypt_key, xlo->xlo_encrypt_key, xlo->xlo_encrypt_key_size); + memcpy(info->xlo_encrypt_key, xlo->xlo_encrypt_key, + xlo->xlo_encrypt_key_size); } - /* Drop xloop_ctl_mutex while we call into the filesystem. */ + /* Drop xlo_mutex while we call into the filesystem. */ path = xlo->xlo_backing_file->f_path; path_get(&path); - mutex_unlock(&xloop_ctl_mutex); + mutex_unlock(&xlo->xlo_mutex); ret = vfs_getattr(&path, &stat, STATX_INO, AT_STATX_SYNC_AS_STAT); if (!ret) { info->xlo_device = huge_encode_dev(stat.dev); @@ -1211,7 +1343,8 @@ static int xloop_get_status(struct xloop_device *xlo, struct xloop_info64 *info) return ret; } -static void xloop_info64_from_old(const struct xloop_info *info, struct xloop_info64 *info64) +static void +xloop_info64_from_old(const struct xloop_info *info, struct xloop_info64 *info64) { memset(info64, 0, sizeof(*info64)); info64->xlo_number = info->xlo_number; @@ -1225,7 +1358,6 @@ static void xloop_info64_from_old(const struct xloop_info *info, struct xloop_in info64->xlo_flags = info->xlo_flags; info64->xlo_init[0] = info->xlo_init[0]; info64->xlo_init[1] = info->xlo_init[1]; - info64->xlo_file_fmt_type = info->xlo_file_fmt_type; if (info->xlo_encrypt_type == XLO_CRYPT_CRYPTOAPI) memcpy(info64->xlo_crypt_name, info->xlo_name, XLO_NAME_SIZE); else @@ -1233,7 +1365,8 @@ static void xloop_info64_from_old(const struct xloop_info *info, struct xloop_in memcpy(info64->xlo_encrypt_key, info->xlo_encrypt_key, XLO_KEY_SIZE); } -static int xloop_info64_to_old(const struct xloop_info64 *info64, struct xloop_info *info) +static int +xloop_info64_to_old(const struct xloop_info64 *info64, struct xloop_info *info) { memset(info, 0, sizeof(*info)); info->xlo_number = info64->xlo_number; @@ -1246,7 +1379,6 @@ static int xloop_info64_to_old(const struct xloop_info64 *info64, struct xloop_i info->xlo_flags = info64->xlo_flags; info->xlo_init[0] = info64->xlo_init[0]; info->xlo_init[1] = info64->xlo_init[1]; - info->xlo_file_fmt_type = info64->xlo_file_fmt_type; if (info->xlo_encrypt_type == XLO_CRYPT_CRYPTOAPI) memcpy(info->xlo_name, info64->xlo_crypt_name, XLO_NAME_SIZE); else @@ -1254,35 +1386,39 @@ static int xloop_info64_to_old(const struct xloop_info64 *info64, struct xloop_i memcpy(info->xlo_encrypt_key, info64->xlo_encrypt_key, XLO_KEY_SIZE); /* error in case values were truncated */ - if (info->xlo_device != info64->xlo_device || info->xlo_rdevice != info64->xlo_rdevice || - info->xlo_inode != info64->xlo_inode || info->xlo_offset != info64->xlo_offset) + if (info->xlo_device != info64->xlo_device || + info->xlo_rdevice != info64->xlo_rdevice || + info->xlo_inode != info64->xlo_inode || + info->xlo_offset != info64->xlo_offset) return -EOVERFLOW; return 0; } -static int xloop_set_status_old(struct xloop_device *xlo, const struct xloop_info __user *arg) +static int +xloop_set_status_old(struct xloop_device *xlo, const struct xloop_info __user *arg) { struct xloop_info info; struct xloop_info64 info64; - if (copy_from_user(&info, arg, sizeof(struct xloop_info))) + if (copy_from_user(&info, arg, sizeof (struct xloop_info))) return -EFAULT; xloop_info64_from_old(&info, &info64); return xloop_set_status(xlo, &info64); } -static int xloop_set_status64(struct xloop_device *xlo, const struct xloop_info64 __user *arg) +static int +xloop_set_status64(struct xloop_device *xlo, const struct xloop_info64 __user *arg) { struct xloop_info64 info64; - if (copy_from_user(&info64, arg, sizeof(struct xloop_info64))) + if (copy_from_user(&info64, arg, sizeof (struct xloop_info64))) return -EFAULT; return xloop_set_status(xlo, &info64); } -static int xloop_get_status_old(struct xloop_device *xlo, struct xloop_info __user *arg) -{ +static int +xloop_get_status_old(struct xloop_device *xlo, struct xloop_info __user *arg) { struct xloop_info info; struct xloop_info64 info64; int err; @@ -1298,8 +1434,8 @@ static int xloop_get_status_old(struct xloop_device *xlo, struct xloop_info __us return err; } -static int xloop_get_status64(struct xloop_device *xlo, struct xloop_info64 __user *arg) -{ +static int +xloop_get_status64(struct xloop_device *xlo, struct xloop_info64 __user *arg) { struct xloop_info64 info64; int err; @@ -1328,7 +1464,6 @@ static int xloop_set_capacity(struct xloop_device *xlo) static int xloop_set_dio(struct xloop_device *xlo, unsigned long arg) { int error = -ENXIO; - if (xlo->xlo_state != Xlo_bound) goto out; @@ -1336,7 +1471,7 @@ static int xloop_set_dio(struct xloop_device *xlo, unsigned long arg) if (xlo->use_dio == !!arg) return 0; error = -EINVAL; -out: + out: return error; } @@ -1347,7 +1482,7 @@ static int xloop_set_block_size(struct xloop_device *xlo, unsigned long arg) if (xlo->xlo_state != Xlo_bound) return -ENXIO; - err = xloop_validate_block_size(arg); + err = blk_validate_block_size(arg); if (err) return err; @@ -1363,8 +1498,9 @@ static int xloop_set_block_size(struct xloop_device *xlo, unsigned long arg) if (xlo->xlo_device->bd_inode->i_mapping->nrpages) { err = -EAGAIN; dev_warn(xloop_device_to_dev(xlo), - "xloop device has still dirty pages (nrpages=%lu)\n", - xlo->xlo_device->bd_inode->i_mapping->nrpages); + "xloop%d (%s) has still dirty pages (nrpages=%lu)\n", + xlo->xlo_number, xlo->xlo_file_name, + xlo->xlo_device->bd_inode->i_mapping->nrpages); goto out_unfreeze; } @@ -1378,11 +1514,12 @@ out_unfreeze: return err; } -static int xlo_simple_ioctl(struct xloop_device *xlo, unsigned int cmd, unsigned long arg) +static int xlo_simple_ioctl(struct xloop_device *xlo, unsigned int cmd, + unsigned long arg) { int err; - err = mutex_lock_killable(&xloop_ctl_mutex); + err = mutex_lock_killable(&xlo->xlo_mutex); if (err) return err; switch (cmd) { @@ -1398,14 +1535,15 @@ static int xlo_simple_ioctl(struct xloop_device *xlo, unsigned int cmd, unsigned default: err = xlo->ioctl ? xlo->ioctl(xlo, cmd, arg) : -EINVAL; } - mutex_unlock(&xloop_ctl_mutex); + mutex_unlock(&xlo->xlo_mutex); return err; } -static int xlo_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) +static int xlo_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) { struct xloop_device *xlo = bdev->bd_disk->private_data; - void __user *argp = (void __user *)arg; + void __user *argp = (void __user *) arg; int err; switch (cmd) { @@ -1436,15 +1574,17 @@ static int xlo_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, return xloop_clr_fd(xlo); case XLOOP_SET_STATUS: err = -EPERM; - if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) + if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) { err = xloop_set_status_old(xlo, argp); + } break; case XLOOP_GET_STATUS: return xloop_get_status_old(xlo, argp); case XLOOP_SET_STATUS64: err = -EPERM; - if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) + if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) { err = xloop_set_status64(xlo, argp); + } break; case XLOOP_GET_STATUS64: return xloop_get_status64(xlo, argp); @@ -1453,11 +1593,7 @@ static int xlo_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, case XLOOP_SET_BLOCK_SIZE: if (!(mode & FMODE_WRITE) && !capable(CAP_SYS_ADMIN)) return -EPERM; -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 4, 0) fallthrough; -#else - /* fall through */ -#endif default: err = xlo_simple_ioctl(xlo, cmd, arg); break; @@ -1468,26 +1604,27 @@ static int xlo_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, #ifdef CONFIG_COMPAT struct compat_xloop_info { - compat_int_t xlo_number; /* ioctl r/o */ - compat_dev_t xlo_device; /* ioctl r/o */ - compat_ulong_t xlo_inode; /* ioctl r/o */ - compat_dev_t xlo_rdevice; /* ioctl r/o */ - compat_int_t xlo_offset; - compat_int_t xlo_encrypt_type; - compat_int_t xlo_encrypt_key_size; /* ioctl w/o */ - compat_int_t xlo_flags; /* ioctl r/o */ - char xlo_name[XLO_NAME_SIZE]; - unsigned char xlo_encrypt_key[XLO_KEY_SIZE]; /* ioctl w/o */ - compat_ulong_t xlo_init[2]; - char reserved[4]; - compat_int_t xlo_file_fmt_type; + compat_int_t xlo_number; /* ioctl r/o */ + compat_dev_t xlo_device; /* ioctl r/o */ + compat_ulong_t xlo_inode; /* ioctl r/o */ + compat_dev_t xlo_rdevice; /* ioctl r/o */ + compat_int_t xlo_offset; + compat_int_t xlo_encrypt_type; + compat_int_t xlo_encrypt_key_size; /* ioctl w/o */ + compat_int_t xlo_flags; /* ioctl r/o */ + char xlo_name[XLO_NAME_SIZE]; + unsigned char xlo_encrypt_key[XLO_KEY_SIZE]; /* ioctl w/o */ + compat_ulong_t xlo_init[2]; + char reserved[4]; }; /* * Transfer 32-bit compatibility structure in userspace to 64-bit xloop info * - noinlined to reduce stack space usage in main part of driver */ -static noinline int xloop_info64_from_compat(const struct compat_xloop_info __user *arg, struct xloop_info64 *info64) +static noinline int +xloop_info64_from_compat(const struct compat_xloop_info __user *arg, + struct xloop_info64 *info64) { struct compat_xloop_info info; @@ -1506,7 +1643,6 @@ static noinline int xloop_info64_from_compat(const struct compat_xloop_info __us info64->xlo_flags = info.xlo_flags; info64->xlo_init[0] = info.xlo_init[0]; info64->xlo_init[1] = info.xlo_init[1]; - info64->xlo_file_fmt_type = info.xlo_file_fmt_type; if (info.xlo_encrypt_type == XLO_CRYPT_CRYPTOAPI) memcpy(info64->xlo_crypt_name, info.xlo_name, XLO_NAME_SIZE); else @@ -1519,7 +1655,9 @@ static noinline int xloop_info64_from_compat(const struct compat_xloop_info __us * Transfer 64-bit xloop info to 32-bit compatibility structure in userspace * - noinlined to reduce stack space usage in main part of driver */ -static noinline int xloop_info64_to_compat(const struct xloop_info64 *info64, struct compat_xloop_info __user *arg) +static noinline int +xloop_info64_to_compat(const struct xloop_info64 *info64, + struct compat_xloop_info __user *arg) { struct compat_xloop_info info; @@ -1534,7 +1672,6 @@ static noinline int xloop_info64_to_compat(const struct xloop_info64 *info64, st info.xlo_flags = info64->xlo_flags; info.xlo_init[0] = info64->xlo_init[0]; info.xlo_init[1] = info64->xlo_init[1]; - info.xlo_file_fmt_type = info64->xlo_file_fmt_type; if (info.xlo_encrypt_type == XLO_CRYPT_CRYPTOAPI) memcpy(info.xlo_name, info64->xlo_crypt_name, XLO_NAME_SIZE); else @@ -1542,10 +1679,12 @@ static noinline int xloop_info64_to_compat(const struct xloop_info64 *info64, st memcpy(info.xlo_encrypt_key, info64->xlo_encrypt_key, XLO_KEY_SIZE); /* error in case values were truncated */ - if (info.xlo_device != info64->xlo_device || info.xlo_rdevice != info64->xlo_rdevice || - info.xlo_inode != info64->xlo_inode || info.xlo_offset != info64->xlo_offset || - info.xlo_init[0] != info64->xlo_init[0] || info.xlo_init[1] != info64->xlo_init[1] || - info.xlo_file_fmt_type != info64->xlo_file_fmt_type) + if (info.xlo_device != info64->xlo_device || + info.xlo_rdevice != info64->xlo_rdevice || + info.xlo_inode != info64->xlo_inode || + info.xlo_offset != info64->xlo_offset || + info.xlo_init[0] != info64->xlo_init[0] || + info.xlo_init[1] != info64->xlo_init[1]) return -EOVERFLOW; if (copy_to_user(arg, &info, sizeof(info))) @@ -1553,7 +1692,9 @@ static noinline int xloop_info64_to_compat(const struct xloop_info64 *info64, st return 0; } -static int xloop_set_status_compat(struct xloop_device *xlo, const struct compat_xloop_info __user *arg) +static int +xloop_set_status_compat(struct xloop_device *xlo, + const struct compat_xloop_info __user *arg) { struct xloop_info64 info64; int ret; @@ -1564,7 +1705,9 @@ static int xloop_set_status_compat(struct xloop_device *xlo, const struct compat return xloop_set_status(xlo, &info64); } -static int xloop_get_status_compat(struct xloop_device *xlo, struct compat_xloop_info __user *arg) +static int +xloop_get_status_compat(struct xloop_device *xlo, + struct compat_xloop_info __user *arg) { struct xloop_info64 info64; int err; @@ -1577,29 +1720,28 @@ static int xloop_get_status_compat(struct xloop_device *xlo, struct compat_xloop return err; } -static int xlo_compat_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) +static int xlo_compat_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) { struct xloop_device *xlo = bdev->bd_disk->private_data; int err; - switch (cmd) { + switch(cmd) { case XLOOP_SET_STATUS: - err = xloop_set_status_compat(xlo, (const struct compat_xloop_info __user *)arg); + err = xloop_set_status_compat(xlo, + (const struct compat_xloop_info __user *)arg); break; case XLOOP_GET_STATUS: - err = xloop_get_status_compat(xlo, (struct compat_xloop_info __user *)arg); + err = xloop_get_status_compat(xlo, + (struct compat_xloop_info __user *)arg); break; case XLOOP_SET_CAPACITY: case XLOOP_CLR_FD: case XLOOP_GET_STATUS64: case XLOOP_SET_STATUS64: case XLOOP_CONFIGURE: - arg = (unsigned long)compat_ptr(arg); -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 4, 0) + arg = (unsigned long) compat_ptr(arg); fallthrough; -#else - /* fall through */ -#endif case XLOOP_SET_FD: case XLOOP_CHANGE_FD: case XLOOP_SET_BLOCK_SIZE: @@ -1616,30 +1758,25 @@ static int xlo_compat_ioctl(struct block_device *bdev, fmode_t mode, unsigned in static int xlo_open(struct block_device *bdev, fmode_t mode) { - struct xloop_device *xlo; + struct xloop_device *xlo = bdev->bd_disk->private_data; int err; - err = mutex_lock_killable(&xloop_ctl_mutex); + err = mutex_lock_killable(&xlo->xlo_mutex); if (err) return err; - xlo = bdev->bd_disk->private_data; - if (!xlo) { + if (xlo->xlo_state == Xlo_deleting) err = -ENXIO; - goto out; - } - - atomic_inc(&xlo->xlo_refcnt); -out: - mutex_unlock(&xloop_ctl_mutex); + else + atomic_inc(&xlo->xlo_refcnt); + mutex_unlock(&xlo->xlo_mutex); return err; } static void xlo_release(struct gendisk *disk, fmode_t mode) { - struct xloop_device *xlo; + struct xloop_device *xlo = disk->private_data; - mutex_lock(&xloop_ctl_mutex); - xlo = disk->private_data; + mutex_lock(&xlo->xlo_mutex); if (atomic_dec_return(&xlo->xlo_refcnt)) goto out_unlock; @@ -1647,7 +1784,7 @@ static void xlo_release(struct gendisk *disk, fmode_t mode) if (xlo->xlo_state != Xlo_bound) goto out_unlock; xlo->xlo_state = Xlo_rundown; - mutex_unlock(&xloop_ctl_mutex); + mutex_unlock(&xlo->xlo_mutex); /* * In autoclear mode, stop the xloop thread * and remove configuration after last close. @@ -1664,16 +1801,16 @@ static void xlo_release(struct gendisk *disk, fmode_t mode) } out_unlock: - mutex_unlock(&xloop_ctl_mutex); + mutex_unlock(&xlo->xlo_mutex); } static const struct block_device_operations xlo_fops = { - .owner = THIS_MODULE, - .open = xlo_open, - .release = xlo_release, - .ioctl = xlo_ioctl, + .owner = THIS_MODULE, + .open = xlo_open, + .release = xlo_release, + .ioctl = xlo_ioctl, #ifdef CONFIG_COMPAT - .compat_ioctl = xlo_compat_ioctl, + .compat_ioctl = xlo_compat_ioctl, #endif }; @@ -1699,37 +1836,36 @@ int xloop_register_transfer(struct xloop_func_table *funcs) xfer_funcs[n] = funcs; return 0; } -EXPORT_SYMBOL(xloop_register_transfer); - -static int unregister_transfer_cb(int id, void *ptr, void *data) -{ - struct xloop_device *xlo = ptr; - struct xloop_func_table *xfer = data; - - mutex_lock(&xloop_ctl_mutex); - if (xlo->xlo_encryption == xfer) - xloop_release_xfer(xlo); - mutex_unlock(&xloop_ctl_mutex); - return 0; -} int xloop_unregister_transfer(int number) { unsigned int n = number; struct xloop_func_table *xfer; - xfer = xfer_funcs[n]; - - if (n == 0 || n >= MAX_XLO_CRYPT || xfer == NULL) + if (n == 0 || n >= MAX_XLO_CRYPT || (xfer = xfer_funcs[n]) == NULL) return -EINVAL; + /* + * This function is called from only cleanup_cryptoxloop(). + * Given that each xloop device that has a transfer enabled holds a + * reference to the module implementing it we should never get here + * with a transfer that is set (unless forced module unloading is + * requested). Thus, check module's refcount and warn if this is + * not a clean unloading. + */ +#ifdef CONFIG_MODULE_UNLOAD + if (xfer->owner && module_refcount(xfer->owner) != -1) + pr_err("Danger! Unregistering an in use transfer function.\n"); +#endif xfer_funcs[n] = NULL; - idr_for_each(&xloop_index_idr, &unregister_transfer_cb, xfer); return 0; } + +EXPORT_SYMBOL(xloop_register_transfer); EXPORT_SYMBOL(xloop_unregister_transfer); -static blk_status_t xloop_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) +static blk_status_t xloop_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) { struct request *rq = bd->rq; struct xloop_cmd *cmd = blk_mq_rq_to_pdu(rq); @@ -1752,14 +1888,19 @@ static blk_status_t xloop_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_ } /* always use the first bio's css */ + cmd->blkcg_css = NULL; + cmd->memcg_css = NULL; #ifdef CONFIG_BLK_CGROUP - if (cmd->use_aio && rq->bio && rq->bio->bi_blkg) { - cmd->css = &bio_blkcg(rq->bio)->css; - css_get(cmd->css); - } else + if (rq->bio && rq->bio->bi_blkg) { + cmd->blkcg_css = &bio_blkcg(rq->bio)->css; +#ifdef CONFIG_MEMCG + cmd->memcg_css = + cgroup_get_e_css(cmd->blkcg_css->cgroup, + &memory_cgrp_subsys); #endif - cmd->css = NULL; - kthread_queue_work(&xlo->worker, &cmd->work); + } +#endif + xloop_queue_work(xlo, cmd); return BLK_STS_OK; } @@ -1770,54 +1911,122 @@ static void xloop_handle_cmd(struct xloop_cmd *cmd) const bool write = op_is_write(req_op(rq)); struct xloop_device *xlo = rq->q->queuedata; int ret = 0; + struct mem_cgroup *old_memcg = NULL; if (write && (xlo->xlo_flags & XLO_FLAGS_READ_ONLY)) { ret = -EIO; goto failed; } + if (cmd->blkcg_css) + kthread_associate_blkcg(cmd->blkcg_css); + if (cmd->memcg_css) + old_memcg = set_active_memcg( + mem_cgroup_from_css(cmd->memcg_css)); + ret = do_req_filebacked(xlo, rq); -failed: + + if (cmd->blkcg_css) + kthread_associate_blkcg(NULL); + + if (cmd->memcg_css) { + set_active_memcg(old_memcg); + css_put(cmd->memcg_css); + } + failed: /* complete non-aio request */ if (!cmd->use_aio || ret) { if (ret == -EOPNOTSUPP) cmd->ret = ret; else cmd->ret = ret ? -EIO : 0; -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9, 0) if (likely(!blk_should_fake_timeout(rq->q))) blk_mq_complete_request(rq); -#else - blk_mq_complete_request(rq); -#endif } } -static void xloop_queue_work(struct kthread_work *work) +static void xloop_set_timer(struct xloop_device *xlo) +{ + timer_reduce(&xlo->timer, jiffies + XLOOP_IDLE_WORKER_TIMEOUT); +} + +static void xloop_process_work(struct xloop_worker *worker, + struct list_head *cmd_list, struct xloop_device *xlo) +{ + int orig_flags = current->flags; + struct xloop_cmd *cmd; + + current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO; + spin_lock_irq(&xlo->xlo_work_lock); + while (!list_empty(cmd_list)) { + cmd = container_of( + cmd_list->next, struct xloop_cmd, list_entry); + list_del(cmd_list->next); + spin_unlock_irq(&xlo->xlo_work_lock); + + xloop_handle_cmd(cmd); + cond_resched(); + + spin_lock_irq(&xlo->xlo_work_lock); + } + + /* + * We only add to the idle list if there are no pending cmds + * *and* the worker will not run again which ensures that it + * is safe to free any worker on the idle list + */ + if (worker && !work_pending(&worker->work)) { + worker->last_ran_at = jiffies; + list_add_tail(&worker->idle_list, &xlo->idle_worker_list); + xloop_set_timer(xlo); + } + spin_unlock_irq(&xlo->xlo_work_lock); + current->flags = orig_flags; +} + +static void xloop_workfn(struct work_struct *work) { - struct xloop_cmd *cmd = container_of(work, struct xloop_cmd, work); + struct xloop_worker *worker = + container_of(work, struct xloop_worker, work); + xloop_process_work(worker, &worker->cmd_list, worker->xlo); +} - xloop_handle_cmd(cmd); +static void xloop_rootcg_workfn(struct work_struct *work) +{ + struct xloop_device *xlo = + container_of(work, struct xloop_device, rootcg_work); + xloop_process_work(NULL, &xlo->rootcg_cmd_list, xlo); } -static int xloop_init_request(struct blk_mq_tag_set *set, struct request *rq, unsigned int hctx_idx, - unsigned int numa_node) +static void xloop_free_idle_workers(struct timer_list *timer) { - struct xloop_cmd *cmd = blk_mq_rq_to_pdu(rq); + struct xloop_device *xlo = container_of(timer, struct xloop_device, timer); + struct xloop_worker *pos, *worker; - kthread_init_work(&cmd->work, xloop_queue_work); - return 0; + spin_lock_irq(&xlo->xlo_work_lock); + list_for_each_entry_safe(worker, pos, &xlo->idle_worker_list, + idle_list) { + if (time_is_after_jiffies(worker->last_ran_at + + XLOOP_IDLE_WORKER_TIMEOUT)) + break; + list_del(&worker->idle_list); + rb_erase(&worker->rb_node, &xlo->worker_tree); + css_put(worker->blkcg_css); + kfree(worker); + } + if (!list_empty(&xlo->idle_worker_list)) + xloop_set_timer(xlo); + spin_unlock_irq(&xlo->xlo_work_lock); } static const struct blk_mq_ops xloop_mq_ops = { - .queue_rq = xloop_queue_rq, - .init_request = xloop_init_request, - .complete = xlo_complete_rq, + .queue_rq = xloop_queue_rq, + .complete = xlo_complete_rq, }; static struct dentry *xloop_dbgfs_dir; -static int xloop_add(struct xloop_device **l, int i) +static int xloop_add(int i) { struct xloop_device *xlo; struct gendisk *disk; @@ -1827,9 +2036,12 @@ static int xloop_add(struct xloop_device **l, int i) xlo = kzalloc(sizeof(*xlo), GFP_KERNEL); if (!xlo) goto out; - xlo->xlo_state = Xlo_unbound; + err = mutex_lock_killable(&xloop_ctl_mutex); + if (err) + goto out_free_dev; + /* allocate id, if @id >= 0, we're requesting that specific id */ if (i >= 0) { err = idr_alloc(&xloop_index_idr, xlo, i, i + 1, GFP_KERNEL); @@ -1838,6 +2050,7 @@ static int xloop_add(struct xloop_device **l, int i) } else { err = idr_alloc(&xloop_index_idr, xlo, 0, 0, GFP_KERNEL); } + mutex_unlock(&xloop_ctl_mutex); if (err < 0) goto out_free_dev; i = err; @@ -1848,23 +2061,20 @@ static int xloop_add(struct xloop_device **l, int i) xlo->tag_set.queue_depth = 128; xlo->tag_set.numa_node = NUMA_NO_NODE; xlo->tag_set.cmd_size = sizeof(struct xloop_cmd); -#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 8, 0) - xlo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING; -#else - xlo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; -#endif + xlo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING | + BLK_MQ_F_NO_SCHED_BY_DEFAULT; xlo->tag_set.driver_data = xlo; err = blk_mq_alloc_tag_set(&xlo->tag_set); if (err) goto out_free_idr; - xlo->xlo_queue = blk_mq_init_queue(&xlo->tag_set); - if (IS_ERR(xlo->xlo_queue)) { - err = PTR_ERR(xlo->xlo_queue); + disk = xlo->xlo_disk = blk_mq_alloc_disk(&xlo->tag_set, xlo); + if (IS_ERR(disk)) { + err = PTR_ERR(disk); goto out_cleanup_tags; } - xlo->xlo_queue->queuedata = xlo; + xlo->xlo_queue = xlo->xlo_disk->queue; blk_queue_max_hw_sectors(xlo->xlo_queue, BLK_DEF_MAX_SECTORS); @@ -1879,15 +2089,10 @@ static int xloop_add(struct xloop_device **l, int i) err = -ENOMEM; xlo->xlo_fmt = xloop_file_fmt_alloc(); if (!xlo->xlo_fmt) - goto out_free_queue; + goto out_cleanup_disk; xloop_file_fmt_set_xlo(xlo->xlo_fmt, xlo); - err = -ENOMEM; - disk = xlo->xlo_disk = alloc_disk(1 << part_shift); - if (!disk) - goto out_free_file_fmt; - /* * Disable partition scanning by default. The in-kernel partition * scanning can be requested individually per-device during its @@ -1910,21 +2115,30 @@ static int xloop_add(struct xloop_device **l, int i) disk->flags |= GENHD_FL_NO_PART_SCAN; disk->flags |= GENHD_FL_EXT_DEVT; atomic_set(&xlo->xlo_refcnt, 0); + mutex_init(&xlo->xlo_mutex); xlo->xlo_number = i; spin_lock_init(&xlo->xlo_lock); + spin_lock_init(&xlo->xlo_work_lock); disk->major = XLOOP_MAJOR; disk->first_minor = i << part_shift; + disk->minors = 1 << part_shift; disk->fops = &xlo_fops; disk->private_data = xlo; disk->queue = xlo->xlo_queue; + disk->events = DISK_EVENT_MEDIA_CHANGE; + disk->event_flags = DISK_EVENT_FLAG_UEVENT; sprintf(disk->disk_name, "xloop%d", i); + /* Make this xloop device reachable from pathname. */ add_disk(disk); - *l = xlo; + /* Show this xloop device. */ + mutex_lock(&xloop_ctl_mutex); + xlo->idr_visible = true; + mutex_unlock(&xloop_ctl_mutex); /* * initialize debugfs entries * - * create for each loop device a debugfs directory under 'loop' if + * create for each xloop device a debugfs directory under 'xloop' if * the 'block' directory exists, otherwise create the loop directory in * the root directory */ @@ -1938,16 +2152,18 @@ static int xloop_add(struct xloop_device **l, int i) } #endif - return xlo->xlo_number; + return i; out_free_file_fmt: xloop_file_fmt_free(xlo->xlo_fmt); -out_free_queue: - blk_cleanup_queue(xlo->xlo_queue); +out_cleanup_disk: + blk_cleanup_disk(xlo->xlo_disk); out_cleanup_tags: blk_mq_free_tag_set(&xlo->tag_set); out_free_idr: + mutex_lock(&xloop_ctl_mutex); idr_remove(&xloop_index_idr, i); + mutex_unlock(&xloop_ctl_mutex); out_free_dev: kfree(xlo); out: @@ -1958,129 +2174,122 @@ static void xloop_remove(struct xloop_device *xlo) { xloop_file_fmt_free(xlo->xlo_fmt); debugfs_remove(xlo->xlo_dbgfs_dir); + /* Make this xloop device unreachable from pathname. */ del_gendisk(xlo->xlo_disk); - blk_cleanup_queue(xlo->xlo_queue); + blk_cleanup_disk(xlo->xlo_disk); blk_mq_free_tag_set(&xlo->tag_set); - put_disk(xlo->xlo_disk); + mutex_lock(&xloop_ctl_mutex); + idr_remove(&xloop_index_idr, xlo->xlo_number); + mutex_unlock(&xloop_ctl_mutex); + /* There is no route which can find this xloop device. */ + mutex_destroy(&xlo->xlo_mutex); kfree(xlo); } -static int find_free_cb(int id, void *ptr, void *data) +static void xloop_probe(dev_t dev) { - struct xloop_device *xlo = ptr; - struct xloop_device **l = data; + int idx = MINOR(dev) >> part_shift; - if (xlo->xlo_state == Xlo_unbound) { - *l = xlo; - return 1; - } - return 0; + if (max_xloop && idx >= max_xloop) + return; + xloop_add(idx); } -static int xloop_lookup(struct xloop_device **l, int i) +static int xloop_control_remove(int idx) { struct xloop_device *xlo; - int ret = -ENODEV; - - if (i < 0) { - int err; + int ret; - err = idr_for_each(&xloop_index_idr, &find_free_cb, &xlo); - if (err == 1) { - *l = xlo; - ret = xlo->xlo_number; - } - goto out; + if (idx < 0) { + pr_warn("deleting an unspecified xloop device is not supported.\n"); + return -EINVAL; } + + /* Hide this xloop device for serialization. */ + ret = mutex_lock_killable(&xloop_ctl_mutex); + if (ret) + return ret; + xlo = idr_find(&xloop_index_idr, idx); + if (!xlo || !xlo->idr_visible) + ret = -ENODEV; + else + xlo->idr_visible = false; + mutex_unlock(&xloop_ctl_mutex); + if (ret) + return ret; - /* lookup and return a specific i */ - xlo = idr_find(&xloop_index_idr, i); - if (xlo) { - *l = xlo; - ret = xlo->xlo_number; + /* Check whether this xloop device can be removed. */ + ret = mutex_lock_killable(&xlo->xlo_mutex); + if (ret) + goto mark_visible; + if (xlo->xlo_state != Xlo_unbound || + atomic_read(&xlo->xlo_refcnt) > 0) { + mutex_unlock(&xlo->xlo_mutex); + ret = -EBUSY; + goto mark_visible; } -out: - return ret; -} + /* Mark this xloop device no longer open()-able. */ + xlo->xlo_state = Xlo_deleting; + mutex_unlock(&xlo->xlo_mutex); -static struct kobject *xloop_probe(dev_t dev, int *part, void *data) -{ - struct xloop_device *xlo; - struct kobject *kobj; - int err; + xloop_remove(xlo); + return 0; +mark_visible: + /* Show this xloop device again. */ mutex_lock(&xloop_ctl_mutex); - err = xloop_lookup(&xlo, MINOR(dev) >> part_shift); - if (err < 0) - err = xloop_add(&xlo, MINOR(dev) >> part_shift); - if (err < 0) - kobj = NULL; - else - kobj = get_disk_and_module(xlo->xlo_disk); + xlo->idr_visible = true; mutex_unlock(&xloop_ctl_mutex); - - *part = 0; - return kobj; + return ret; } -static long xloop_control_ioctl(struct file *file, unsigned int cmd, unsigned long parm) +static int xloop_control_get_free(int idx) { struct xloop_device *xlo; - int ret; + int id, ret; ret = mutex_lock_killable(&xloop_ctl_mutex); if (ret) return ret; + idr_for_each_entry(&xloop_index_idr, xlo, id) { + /* Hitting a race results in creating a new xloop device which is harmless. */ + if (xlo->idr_visible && data_race(xlo->xlo_state) == Xlo_unbound) + goto found; + } + mutex_unlock(&xloop_ctl_mutex); + return xloop_add(-1); +found: + mutex_unlock(&xloop_ctl_mutex); + return id; +} - ret = -EINVAL; +static long xloop_control_ioctl(struct file *file, unsigned int cmd, + unsigned long parm) +{ switch (cmd) { case XLOOP_CTL_ADD: - ret = xloop_lookup(&xlo, parm); - if (ret >= 0) { - ret = -EEXIST; - break; - } - ret = xloop_add(&xlo, parm); - break; + return xloop_add(parm); case XLOOP_CTL_REMOVE: - ret = xloop_lookup(&xlo, parm); - if (ret < 0) - break; - if (xlo->xlo_state != Xlo_unbound) { - ret = -EBUSY; - break; - } - if (atomic_read(&xlo->xlo_refcnt) > 0) { - ret = -EBUSY; - break; - } - xlo->xlo_disk->private_data = NULL; - idr_remove(&xloop_index_idr, xlo->xlo_number); - xloop_remove(xlo); - break; + return xloop_control_remove(parm); case XLOOP_CTL_GET_FREE: - ret = xloop_lookup(&xlo, -1); - if (ret >= 0) - break; - ret = xloop_add(&xlo, -1); + return xloop_control_get_free(parm); + default: + return -ENOSYS; } - mutex_unlock(&xloop_ctl_mutex); - - return ret; } static const struct file_operations xloop_ctl_fops = { - .open = nonseekable_open, - .unlocked_ioctl = xloop_control_ioctl, - .compat_ioctl = xloop_control_ioctl, - .owner = THIS_MODULE, - .llseek = noop_llseek, + .open = nonseekable_open, + .unlocked_ioctl = xloop_control_ioctl, + .compat_ioctl = xloop_control_ioctl, + .owner = THIS_MODULE, + .llseek = noop_llseek, }; static struct miscdevice xloop_misc = { - .minor = XLOOP_CTRL_MINOR, - .name = "xloop-control", - .fops = &xloop_ctl_fops, + .minor = XLOOP_CTRL_MINOR, + .name = "xloop-control", + .fops = &xloop_ctl_fops, }; MODULE_ALIAS_MISCDEV(XLOOP_CTRL_MINOR); @@ -2089,8 +2298,6 @@ MODULE_ALIAS("devname:xloop-control"); static int __init xloop_init(void) { int i, nr; - unsigned long range; - struct xloop_device *xlo; int err; part_shift = 0; @@ -2122,20 +2329,21 @@ static int __init xloop_init(void) * If max_xloop is specified, create that many devices upfront. * This also becomes a hard limit. If max_xloop is not specified, * create CONFIG_BLK_DEV_XLOOP_MIN_COUNT xloop devices at module - * init time. xloop devices can be requested on-demand with the + * init time. Loop devices can be requested on-demand with the * /dev/xloop-control interface, or be instantiated by accessing * a 'dead' device node. */ - if (max_xloop) { + if (max_xloop) + nr = max_xloop; + else nr = CONFIG_BLK_DEV_XLOOP_MIN_COUNT; - range = 1UL << MINORBITS; - } err = misc_register(&xloop_misc); if (err < 0) goto err_out; - if (register_blkdev(XLOOP_MAJOR, "xloop")) { + + if (__register_blkdev(XLOOP_MAJOR, "xloop", xloop_probe)) { err = -EIO; goto misc_out; } @@ -2144,57 +2352,50 @@ static int __init xloop_init(void) xloop_dbgfs_dir = debugfs_create_dir("xloop", NULL); if (IS_ERR_OR_NULL(xloop_dbgfs_dir)) { err = -ENODEV; - goto misc_out; + goto blkdev_out; } #endif - blk_register_region(MKDEV(XLOOP_MAJOR, 0), range, THIS_MODULE, xloop_probe, NULL, NULL); - /* pre-create number of devices given by config or max_xloop */ - mutex_lock(&xloop_ctl_mutex); for (i = 0; i < nr; i++) - xloop_add(&xlo, i); - mutex_unlock(&xloop_ctl_mutex); + xloop_add(i); pr_info("module in version %s loaded\n", XLOOP_VERSION); return 0; +#ifdef CONFIG_DEBUG_FS +blkdev_out: + unregister_blkdev(XLOOP_MAJOR, "xloop"); +#endif misc_out: misc_deregister(&xloop_misc); err_out: return err; } -static int xloop_exit_cb(int id, void *ptr, void *data) -{ - struct xloop_device *xlo = ptr; - - xloop_remove(xlo); - return 0; -} - static void __exit xloop_exit(void) { - unsigned long range; + struct xloop_device *xlo; + int id; - range = max_xloop ? max_xloop << part_shift : 1UL << MINORBITS; + unregister_blkdev(XLOOP_MAJOR, "xloop"); + misc_deregister(&xloop_misc); - mutex_lock(&xloop_ctl_mutex); + /* + * There is no need to use xloop_ctl_mutex here, for nobody else can + * access xloop_index_idr when this module is unloading (unless forced + * module unloading is requested). If this is not a clean unloading, + * we have no means to avoid kernel crash. + */ + idr_for_each_entry(&xloop_index_idr, xlo, id) + xloop_remove(xlo); - idr_for_each(&xloop_index_idr, &xloop_exit_cb, NULL); idr_destroy(&xloop_index_idr); - blk_unregister_region(MKDEV(XLOOP_MAJOR, 0), range); - unregister_blkdev(XLOOP_MAJOR, "xloop"); - #ifdef CONFIG_DEBUG_FS debugfs_remove(xloop_dbgfs_dir); #endif - misc_deregister(&xloop_misc); - - mutex_unlock(&xloop_ctl_mutex); - pr_info("exit module\n"); } diff --git a/src/kernel/xloop_main.h b/src/kernel/xloop_main.h index 97c2654..360f3e5 100644 --- a/src/kernel/xloop_main.h +++ b/src/kernel/xloop_main.h @@ -1,4 +1,3 @@ -/* SPDX-License-Identifier: GPL-2.0 */ /* * loop_main.h * @@ -15,7 +14,6 @@ #include <linux/blk-mq.h> #include <linux/spinlock.h> #include <linux/mutex.h> -#include <linux/kthread.h> #include "uapi_xloop.h" #ifdef CONFIG_DEBUG_FS #include <linux/debugfs.h> @@ -28,6 +26,7 @@ enum { Xlo_unbound, Xlo_bound, Xlo_rundown, + Xlo_deleting, }; struct xloop_func_table; @@ -38,36 +37,41 @@ struct xloop_device { loff_t xlo_offset; loff_t xlo_sizelimit; int xlo_flags; - int (*transfer)(struct xloop_device *xlo, int cmd, struct page *raw_page, - unsigned int raw_off, struct page *xloop_page, unsigned int xloop_off, - int size, sector_t real_block); + int (*transfer)(struct xloop_device *xlo, int cmd, struct page *raw_page, unsigned raw_off, struct page *xloop_page, unsigned xloop_off, int size, sector_t real_block); char xlo_file_name[XLO_NAME_SIZE]; char xlo_crypt_name[XLO_NAME_SIZE]; char xlo_encrypt_key[XLO_KEY_SIZE]; int xlo_encrypt_key_size; struct xloop_func_table *xlo_encryption; __u32 xlo_init[2]; - kuid_t xlo_key_owner; /* Who set the key */ - int (*ioctl)(struct xloop_device *xlo, int cmd, unsigned long arg); + kuid_t xlo_key_owner; /* Who set the key */ + int (*ioctl)(struct xloop_device *xlo, int cmd, unsigned long arg); struct xloop_file_fmt *xlo_fmt; struct file *xlo_backing_file; struct block_device *xlo_device; - void *key_data; + void *key_data; gfp_t old_gfp_mask; spinlock_t xlo_lock; int xlo_state; - struct kthread_worker worker; - struct task_struct *worker_task; + spinlock_t xlo_work_lock; + struct workqueue_struct *workqueue; + struct work_struct rootcg_work; + struct list_head rootcg_cmd_list; + struct list_head idle_worker_list; + struct rb_root worker_tree; + struct timer_list timer; bool use_dio; bool sysfs_inited; struct request_queue *xlo_queue; struct blk_mq_tag_set tag_set; struct gendisk *xlo_disk; + struct mutex xlo_mutex; + bool idr_visible; #ifdef CONFIG_DEBUG_FS struct dentry *xlo_dbgfs_dir; @@ -75,30 +79,30 @@ struct xloop_device { }; struct xloop_cmd { - struct kthread_work work; - bool use_aio; /* use AIO interface to handle I/O */ - atomic_t ref; /* only for aio */ + struct list_head list_entry; + bool use_aio; /* use AIO interface to handle I/O */ + atomic_t ref; /* only for aio */ long ret; struct kiocb iocb; struct bio_vec *bvec; - struct cgroup_subsys_state *css; + struct cgroup_subsys_state *blkcg_css; + struct cgroup_subsys_state *memcg_css; }; /* Support for loadable transfer modules */ struct xloop_func_table { - int number; /* filter type */ - int (*transfer)(struct xloop_device *xlo, int cmd, struct page *raw_page, unsigned int raw_off, - struct page *xloop_page, unsigned int xloop_off, int size, sector_t real_block); - int (*init)(struct xloop_device *xlo, const struct xloop_info64 *info); + int number; /* filter type */ + int (*transfer)(struct xloop_device *xlo, int cmd, struct page *raw_page, unsigned raw_off, struct page *xloop_page, unsigned xloop_off, int size, sector_t real_block); + int (*init)(struct xloop_device *xlo, const struct xloop_info64 *); /* release is called from xloop_unregister_transfer or clr_fd */ - int (*release)(struct xloop_device *xlo); + int (*release)(struct xloop_device *xlo); int (*ioctl)(struct xloop_device *xlo, int cmd, unsigned long arg); struct module *owner; -}; +}; extern inline struct device *xloop_device_to_dev(struct xloop_device *xlo); int xloop_register_transfer(struct xloop_func_table *funcs); -int xloop_unregister_transfer(int number); +int xloop_unregister_transfer(int number); #endif |