/* * This file is part of the Distributed Network Block Device 3 * * Copyright(c) 2019 Frederic Robra * Parts copyright 2011-2012 Johann Latocha * * This file may be licensed under the terms of of the * GNU General Public License Version 2 (the ``GPL''). * * Software distributed under the License is distributed * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either * express or implied. See the GPL for the specific language * governing rights and limitations. * * You should have received a copy of the GPL along with this * program. If not, go to http://www.gnu.org/licenses/gpl.html * or write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "dnbd3.h" #include "sysfs.h" #include "clientconfig.h" #include "net.h" #define DNBD3_CMD_REQUEUED 1 struct workqueue_struct *dnbd3_wq; DEFINE_IDR(dnbd3_index_idr); DEFINE_MUTEX(dnbd3_index_mutex); static unsigned int max_devs = NUMBER_DEVICES; static struct dnbd3_device *device; int major; static void dnbd3_requeue_cmd(struct dnbd3_cmd *cmd) { struct request *req = blk_mq_rq_from_pdu(cmd); if (!test_and_set_bit(DNBD3_CMD_REQUEUED, &cmd->flags)) { blk_mq_requeue_request(req, true); } } static int dnbd3_handle_cmd(struct dnbd3_cmd *cmd, int index) { struct request *req = blk_mq_rq_from_pdu(cmd); struct dnbd3_device *dev = cmd->dnbd3; struct dnbd3_sock *sock = NULL; int ret = -1; int i; int sock_alive = 0; debug_dev(dev, "handle request at position %lu, size %d, index %d", blk_rq_pos(req), blk_rq_bytes(req), index); // if (index >= 1) { // TODO use next server with good rtt for this request // printk(KERN_INFO "dnbd3: index is %d", index); // dev_err_ratelimited(disk_to_dev(dev->disk), "attempted send on invalid socket\n"); // blk_mq_start_request(req); // return -EINVAL; // } for (i = 0; i < NUMBER_CONNECTIONS; i++) { if (dnbd3_is_sock_alive(dev->socks[i])) { if (index == sock_alive) { sock = &dev->socks[i]; } sock_alive++; } } if (!sock) { warn_dev(dev, "index is %d but no socket was found", index); dev_err_ratelimited(disk_to_dev(dev->disk), "attempted send on invalid socket\n"); if (sock_alive > 0) { blk_mq_update_nr_hw_queues(&dev->tag_set, sock_alive); blk_mq_start_request(req); return -EINVAL; } blk_mq_end_request(req, BLK_STS_IOERR); return -EINVAL; } cmd->status = BLK_STS_OK; mutex_lock(&sock->lock); if (unlikely(!sock->sock)) { mutex_unlock(&sock->lock); warn_sock(sock, "not connected"); return -EIO; } blk_mq_start_request(req); if (unlikely(sock->pending && sock->pending != req)) { dnbd3_requeue_cmd(cmd); ret = 0; goto out; } ret = dnbd3_send_request(sock, blk_mq_rq_from_pdu(cmd), cmd); if (ret == -EAGAIN) { dev_err_ratelimited(disk_to_dev(dev->disk), "request send failed, requeueing\n"); dnbd3_requeue_cmd(cmd); ret = 0; } out: mutex_unlock(&sock->lock); return ret; } static blk_status_t dnbd3_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct dnbd3_cmd *cmd = blk_mq_rq_to_pdu(bd->rq); int ret; struct dnbd3_device *dev = cmd->dnbd3; debug_dev(dev, "queue request"); mutex_lock(&cmd->lock); clear_bit(DNBD3_CMD_REQUEUED, &cmd->flags); ret = dnbd3_handle_cmd(cmd, hctx->queue_num); if (ret < 0) { ret = BLK_STS_IOERR; } else if (!ret) { ret = BLK_STS_OK; } mutex_unlock(&cmd->lock); return ret; } static int dnbd3_init_request(struct blk_mq_tag_set *set, struct request *rq, unsigned int hctx_idx, unsigned int numa_node) { struct dnbd3_cmd *cmd = blk_mq_rq_to_pdu(rq); cmd->dnbd3 = set->driver_data; cmd->flags = 0; mutex_init(&cmd->lock); return 0; } static enum blk_eh_timer_return dnbd3_xmit_timeout(struct request *req, bool reserved) { struct dnbd3_cmd *cmd = blk_mq_rq_to_pdu(req); struct dnbd3_device *dev = cmd->dnbd3; int i; warn_dev(dev, "received timeout"); if (!mutex_trylock(&cmd->lock)) { return BLK_EH_RESET_TIMER; } for (i = 0; i < NUMBER_CONNECTIONS; i++) { if (dnbd3_is_sock_alive(dev->socks[i])) { info_sock(&dev->socks[i], "reset request to new socket"); dnbd3_requeue_cmd(cmd); return BLK_EH_DONE; } } dev_err_ratelimited(disk_to_dev(dev->disk), "connection timed out\n"); cmd->status = BLK_STS_IOERR; blk_mq_complete_request(req); blk_mq_end_request(req, BLK_STS_TIMEOUT); return BLK_EH_DONE; } static struct blk_mq_ops dnbd3_mq_ops = { .queue_rq = dnbd3_queue_rq, .init_request = dnbd3_init_request, .timeout = dnbd3_xmit_timeout, }; static int dnbd3_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { int result = -EIO; struct dnbd3_device *dev = bdev->bd_disk->private_data; char *imgname = NULL; dnbd3_ioctl_t *msg = NULL; debug_dev(dev, "ioctl cmd %i, arg %lu", cmd, arg); if (arg != 0) { msg = kmalloc(sizeof(*msg), GFP_KERNEL); if (msg == NULL) return -ENOMEM; if (copy_from_user((char *)msg, (char *)arg, 2) != 0 || msg->len != sizeof(*msg)) { result = -ENOEXEC; goto cleanup_return; } if (copy_from_user((char *)msg, (char *)arg, sizeof(*msg)) != 0) { result = -ENOENT; goto cleanup_return; } if (msg->imgname != NULL && msg->imgnamelen > 0) { imgname = kmalloc(msg->imgnamelen + 1, GFP_KERNEL); if (imgname == NULL) { result = -ENOMEM; goto cleanup_return; } if (copy_from_user(imgname, msg->imgname, msg->imgnamelen) != 0) { result = -ENOENT; goto cleanup_return; } imgname[msg->imgnamelen] = '\0'; debug_dev(dev, "ioctl image name of len %i is %s", (int)msg->imgnamelen, imgname); } } mutex_lock(&dev->device_lock); switch (cmd) { case IOCTL_OPEN: debug_dev(dev, "ioctl open"); if (dev->imgname != NULL) { result = -EBUSY; } else if (imgname == NULL) { result = -EINVAL; } else if (msg == NULL) { result = -EINVAL; } else { if (sizeof(msg->host) != sizeof(dev->initial_server.host)) { warn_dev(dev, "odd size bug#1 triggered in ioctl"); } memcpy(&dev->initial_server.host, &msg->host, sizeof(msg->host)); dev->initial_server.failures = 0; dev->initial_server.rtts[0] = dev->initial_server.rtts[1] = dev->initial_server.rtts[2] = dev->initial_server.rtts[3] = RTT_UNREACHABLE; // memcpy(&dev->initial_server, &dev->cur_server, sizeof(dev->initial_server)); dev->imgname = imgname; dev->rid = msg->rid; dev->use_server_provided_alts = msg->use_server_provided_alts; // Forget all alt servers on explicit connect, set first alt server to initial server memset(dev->alt_servers, 0, sizeof(dev->alt_servers[0])*NUMBER_SERVERS); memcpy(dev->alt_servers, &dev->initial_server, sizeof(dev->alt_servers[0])); result = dnbd3_net_connect(dev); imgname = NULL; } break; case IOCTL_CLOSE: debug_dev(dev, "ioctl close"); result = dnbd3_net_disconnect(dev); set_capacity(dev->disk, 0); if (dev->imgname) { kfree(dev->imgname); dev->imgname = NULL; } dev->rid = 0; dev->reported_size = 0; break; case IOCTL_SWITCH: debug_dev(dev, "ioctl switch"); result = -EINVAL; break; case IOCTL_ADD_SRV: case IOCTL_REM_SRV: debug_dev(dev, "ioctl add/rem srv"); if (dev->imgname == NULL) { result = -ENOENT; } else if (dev->new_servers_num >= NUMBER_SERVERS) { result = -EAGAIN; } else if (msg == NULL) { result = -EINVAL; } else { memcpy(&dev->new_servers[dev->new_servers_num].host, &msg->host, sizeof(msg->host)); dev->new_servers[dev->new_servers_num].failures = (cmd == IOCTL_ADD_SRV ? 0 : 1); // 0 = ADD, 1 = REM ++dev->new_servers_num; result = 0; } break; case BLKFLSBUF: debug_dev(dev, "ioctl blkflsbuf"); result = 0; break; default: warn_dev(dev, "ioctl unhandled cmd %d", cmd); result = -EIO; break; } mutex_unlock(&dev->device_lock); cleanup_return: if (msg) kfree(msg); if (imgname) kfree(imgname); return result; } static struct block_device_operations dnbd3_fops = { .owner = THIS_MODULE, .ioctl = dnbd3_ioctl, .compat_ioctl = dnbd3_ioctl, }; int dnbd3_add_device(struct dnbd3_device *dev, int minor) { struct gendisk *disk; struct request_queue *q; int err = -ENOMEM; int i; debug("adding device %d", minor); mutex_init(&dev->device_lock); mutex_lock(&dev->device_lock); for (i = 0; i < NUMBER_CONNECTIONS; i++) { dev->socks[i].device = dev; dev->socks[i].sock_nr = i; } disk = alloc_disk(1); if (!disk) { error_dev(dev, "allocating disc failed"); goto out_free_dnbd3; } err = idr_alloc(&dnbd3_index_idr, dev, minor, minor + 1, GFP_KERNEL); if (err == -ENOSPC) { error_dev(dev, "idr alloc failed"); err = -EEXIST; } if (err < 0) goto out_free_disk; dev->minor = minor; dev->disk = disk; dev->tag_set.ops = &dnbd3_mq_ops; dev->tag_set.nr_hw_queues = 1; // this can be changed later with blk_mq_update_nr_hw_queues() dev->tag_set.queue_depth = 128; dev->tag_set.numa_node = NUMA_NO_NODE; dev->tag_set.cmd_size = sizeof(struct dnbd3_cmd); dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE | BLK_MQ_F_BLOCKING; dev->tag_set.driver_data = dev; err = blk_mq_alloc_tag_set(&dev->tag_set); if (err) goto out_free_idr; q = blk_mq_init_queue(&dev->tag_set); if (IS_ERR(q)) { err = PTR_ERR(q); goto out_free_tags; } disk->queue = q; /* * Tell the block layer that we are not a rotational device */ blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue); disk->queue->limits.discard_granularity = 0; disk->queue->limits.discard_alignment = 0; blk_queue_max_discard_sectors(disk->queue, 0); blk_queue_max_segment_size(disk->queue, UINT_MAX); blk_queue_max_segments(disk->queue, USHRT_MAX); blk_queue_max_hw_sectors(disk->queue, 65536); disk->queue->limits.max_sectors = 256; INIT_LIST_HEAD(&dev->list); disk->major = major; disk->first_minor = minor; disk->fops = &dnbd3_fops; disk->private_data = dev; sprintf(disk->disk_name, "dnbd%i", minor); add_disk(disk); dnbd3_sysfs_init(dev); mutex_unlock(&dev->device_lock); return minor; out_free_tags: blk_mq_free_tag_set(&dev->tag_set); out_free_idr: idr_remove(&dnbd3_index_idr, minor); out_free_disk: put_disk(disk); out_free_dnbd3: kfree(dev); mutex_unlock(&dev->device_lock); warn_dev(dev, "failed to create device"); return err; } static int __init dnbd3_init(void) { int i; debug("starting kernel module"); dnbd3_wq = alloc_workqueue("kdnbd3", WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND, 0); if (max_devs < 0) { error("max_devs must be >= 0"); return -EINVAL; } device = kcalloc(max_devs, sizeof(*device), GFP_KERNEL); if (!device) { error("failed to create dnbd3 device"); return -ENOMEM; } // initialize block device major = register_blkdev(0, "dnbd3"); if (major == 0) { error("register_blkdev failed"); return -EIO; } debug("kernel module loaded. Machine type: " ENDIAN_MODE); // add MAX_NUMBER_DEVICES devices mutex_lock(&dnbd3_index_mutex); for (i = 0; i < max_devs; i++) { dnbd3_add_device(&device[i], i); } mutex_unlock(&dnbd3_index_mutex); info("init successful (%i devices)", max_devs); return 0; } static int dnbd3_exit_cb(int id, void *ptr, void *data) { struct list_head *list = (struct list_head *)data; struct dnbd3_device *dnbd3 = ptr; list_add_tail(&dnbd3->list, list); return 0; } static void dnbd3_dev_remove(struct dnbd3_device *dnbd3) { struct gendisk *disk = dnbd3->disk; struct request_queue *q; if (disk) { q = disk->queue; del_gendisk(disk); blk_cleanup_queue(q); blk_mq_free_tag_set(&dnbd3->tag_set); dnbd3_net_disconnect(dnbd3); disk->private_data = NULL; put_disk(disk); if (dnbd3->imgname) { kfree(dnbd3->imgname); dnbd3->imgname = NULL; } } mutex_destroy(&dnbd3->device_lock); } static void dnbd3_put(struct dnbd3_device *dnbd3) { mutex_lock(&dnbd3_index_mutex); idr_remove(&dnbd3_index_idr, dnbd3->minor); mutex_unlock(&dnbd3_index_mutex); dnbd3_dev_remove(dnbd3); } static void __exit dnbd3_exit(void) { struct dnbd3_device *dnbd3; LIST_HEAD(del_list); debug("stopping kernel module"); mutex_lock(&dnbd3_index_mutex); idr_for_each(&dnbd3_index_idr, &dnbd3_exit_cb, &del_list); mutex_unlock(&dnbd3_index_mutex); while (!list_empty(&del_list)) { dnbd3 = list_first_entry(&del_list, struct dnbd3_device, list); dnbd3_sysfs_exit(dnbd3); list_del_init(&dnbd3->list); dnbd3_put(dnbd3); } idr_destroy(&dnbd3_index_idr); unregister_blkdev(major, "dnbd3"); kfree(device); destroy_workqueue(dnbd3_wq); info("stopped kernel module"); } module_init(dnbd3_init); module_exit(dnbd3_exit); MODULE_DESCRIPTION("Distributed Network Block Device 3"); MODULE_LICENSE("GPL"); module_param(max_devs, int, 0444); MODULE_PARM_DESC(max_devs, "number of network block devices to initialize (default: 8)");