// SPDX-License-Identifier: GPL-2.0
/*
 * This file is part of the Distributed Network Block Device 3
 *
 * Copyright(c) 2011-2012 Johann Latocha
 *
 * This file may be licensed under the terms of the
 * GNU General Public License Version 2 (the ``GPL'').
 *
 * Software distributed under the License is distributed
 * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
 * express or implied. See the GPL for the specific language
 * governing rights and limitations.
 *
 * You should have received a copy of the GPL along with this
 * program. If not, go to http://www.gnu.org/licenses/gpl.html
 * or write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 *
 */

#include
#include "blk.h"
#include "net.h"
#include "sysfs.h"
#include "dnbd3_main.h"

#include

static int dnbd3_close_device(dnbd3_device_t *dev)
{
	int result;

	if (dev->imgname)
		dev_info(dnbd3_device_to_dev(dev), "closing down device.\n");

	dev->panic = false;
	result = dnbd3_net_disconnect(dev);
	kfree(dev->imgname);
	dev->imgname = NULL;

	/* new requests might have been queued up, */
	/* but now that imgname is NULL no new ones can show up */
	blk_mq_freeze_queue(dev->queue);
	set_capacity(dev->disk, 0);
	blk_mq_unfreeze_queue(dev->queue);
	return result;
}

static int dnbd3_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg)
{
	int result = -100;
	dnbd3_device_t *dev = bdev->bd_disk->private_data;
#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 14, 0)
	struct request_queue *blk_queue = dev->disk->queue;
#endif
	char *imgname = NULL;
	dnbd3_ioctl_t *msg = NULL;
	int i = 0, j;
	u8 locked = 0;

	if (arg != 0) {
		msg = kmalloc(sizeof(*msg), GFP_KERNEL);
		if (msg == NULL)
			return -ENOMEM;

		if (copy_from_user((char *)msg, (char *)arg, 2) != 0 || msg->len != sizeof(*msg)) {
			result = -ENOEXEC;
			goto cleanup_return;
		}

		if (copy_from_user((char *)msg, (char *)arg, sizeof(*msg)) != 0) {
			result = -ENOENT;
			goto cleanup_return;
		}

		if (msg->imgname != NULL && msg->imgnamelen > 0) {
			imgname = kmalloc(msg->imgnamelen + 1, GFP_KERNEL);
			if (imgname == NULL) {
				result = -ENOMEM;
				goto cleanup_return;
			}

			if (copy_from_user(imgname, msg->imgname, msg->imgnamelen) != 0) {
				result = -ENOENT;
				goto cleanup_return;
			}

			imgname[msg->imgnamelen] = '\0';
		}
	}

	switch (cmd) {
	case IOCTL_OPEN:
		if (!dnbd3_flag_get(dev->connection_lock)) {
			result = -EBUSY;
			break;
		}
		locked = 1;
		if (dev->imgname != NULL) {
			result = -EBUSY;
		} else if (imgname == NULL) {
			result = -EINVAL;
		} else if (msg == NULL) {
			result = -EINVAL;
		} else {
			/* assert that at least one and not too many hosts are given */
			if (msg->hosts_num < 1 || msg->hosts_num > NUMBER_SERVERS) {
				result = -EINVAL;
				break;
			}

			dev->imgname = imgname;
			dev->rid = msg->rid;
			dev->use_server_provided_alts = msg->use_server_provided_alts;

			dev_info(dnbd3_device_to_dev(dev), "opening device.\n");

#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 14, 0)
			// set optimal request size for the queue to half the read-ahead
			blk_queue_io_opt(dev->queue, (msg->read_ahead_kb * 512));
#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 15, 0) \
	&& !RHEL_CHECK_VERSION(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0))
			// set readahead from optimal request size of the queue
			// ra_pages are calculated by the following formula: queue_io_opt() * 2 / PAGE_SIZE
			blk_queue_update_readahead(dev->queue);
#endif
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
			if (blk_queue->backing_dev_info != NULL)
				blk_queue->backing_dev_info->ra_pages = (msg->read_ahead_kb * 1024) / PAGE_SIZE;
#else
			blk_queue->backing_dev_info.ra_pages =
				(msg->read_ahead_kb * 1024) / PAGE_SIZE;
#endif

			/* add specified servers to alt server list */
			for (i = 0; i < NUMBER_SERVERS; i++)
				dev->alt_servers[i].host.ss_family = 0;
			for (i = 0; i < msg->hosts_num; i++) {
				/* copy provided host into corresponding alt server slot */
				if (dnbd3_add_server(dev, &msg->hosts[i]) == 0)
					dev_dbg(dnbd3_device_to_dev(dev), "adding server %pISpc\n",
						&dev->alt_servers[i].host);
				else
					dev_warn(dnbd3_device_to_dev(dev), "could not add server %pISpc\n",
						 &dev->alt_servers[i].host);
			}

			/*
			 * probe added alt servers in specified order and
			 * choose first working server as initial server
			 */
			result = -EPROTONOSUPPORT;
			for (i = 0; i < NUMBER_SERVERS; i++) {
				/* probe added alt server */
				if (dev->alt_servers[i].host.ss_family == 0)
					continue; // Empty slot

				result = dnbd3_new_connection(dev, &dev->alt_servers[i].host, true);
				if (result == 0) {
					/* connection established, store index of server and exit loop */
					result = i;
					break;
				}
			}

			if (result >= 0) {
				/* connection was successful */
				dev_dbg(dnbd3_device_to_dev(dev), "server %pISpc is initial server\n",
					&dev->cur_server.host);
				imgname = NULL; // Prevent kfree at the end
			} else {
				/* probing failed */
				dev->imgname = NULL;
			}
		}
		break;

	case IOCTL_CLOSE:
		if (!dnbd3_flag_get(dev->connection_lock)) {
			result = -EBUSY;
			break;
		}
		locked = 1;
		result = dnbd3_close_device(dev);
		break;

	case IOCTL_SWITCH:
		if (!dnbd3_flag_get(dev->connection_lock)) {
			result = -EBUSY;
			break;
		}
		locked = 1;
		if (dev->imgname == NULL) {
			result = -ENOTCONN;
		} else if (msg == NULL) {
			result = -EINVAL;
		} else {
			dnbd3_alt_server_t *alt_server;
			struct sockaddr_storage new_addr;

			mutex_lock(&dev->alt_servers_lock);
			alt_server = get_existing_alt_from_host(&msg->hosts[0], dev);
			if (alt_server == NULL) {
				mutex_unlock(&dev->alt_servers_lock);
				/* specified server is not known, so do not switch */
				result = -ENOENT;
			} else {
				/* specified server is known, so try to switch to it */
				new_addr = alt_server->host;
				mutex_unlock(&dev->alt_servers_lock);
				if (is_same_server(&dev->cur_server.host, &new_addr)) {
					/* specified server is current server, so do not switch */
					result = 0;
				} else {
					dev_info(dnbd3_device_to_dev(dev), "manual server switch to %pISpc\n",
						 &new_addr);
					result = dnbd3_new_connection(dev, &new_addr, false);
					if (result != 0) {
						/* switching didn't work */
						result = -EAGAIN;
					}
				}
				if (result == 0) {
					/* fake RTT so we don't switch away again soon */
					mutex_lock(&dev->alt_servers_lock);
					for (i = 0; i < NUMBER_SERVERS; ++i) {
						alt_server = &dev->alt_servers[i];
						if (is_same_server(&alt_server->host, &new_addr)) {
							for (j = 0; j < DISCOVER_HISTORY_SIZE; ++j)
								alt_server->rtts[j] = 1;
							alt_server->best_count = 100;
						} else {
							for (j = 0; j < DISCOVER_HISTORY_SIZE; ++j)
								if (alt_server->rtts[j] < 500000)
									alt_server->rtts[j] = 500000;
							alt_server->best_count = 0;
						}
					}
					mutex_unlock(&dev->alt_servers_lock);
				}
			}
		}
		break;

	case IOCTL_ADD_SRV:
	case IOCTL_REM_SRV: {
		struct sockaddr_storage addr;
		dnbd3_host_t *host;

		if (dev->imgname == NULL) {
			result = -ENOTCONN;
			break;
		}
		if (msg == NULL) {
			result = -EINVAL;
			break;
		}
		host = &msg->hosts[0];
		if (!dnbd3_host_to_sockaddr(host, &addr)) {
			result = -EINVAL;
			break;
		}

		if (cmd == IOCTL_ADD_SRV) {
			result = dnbd3_add_server(dev, host);
			if (result == -EEXIST)
				dev_info(dnbd3_device_to_dev(dev), "alt server %pISpc already exists\n", &addr);
			else if (result == -ENOSPC)
				dev_info(dnbd3_device_to_dev(dev), "cannot add %pISpc; no free slot\n", &addr);
			else
				dev_info(dnbd3_device_to_dev(dev), "added alt server %pISpc\n", &addr);
		} else { // IOCTL_REM_SRV
			result =
			    dnbd3_rem_server(dev, host);
			if (result == -ENOENT)
				dev_info(dnbd3_device_to_dev(dev), "alt server %pISpc not found\n", &addr);
			else
				dev_info(dnbd3_device_to_dev(dev), "removed alt server %pISpc\n", &addr);
		}
		break;
	}

	case BLKFLSBUF:
		result = 0;
		break;

	default:
		result = -EIO;
		break;
	}

cleanup_return:
	kfree(msg);
	kfree(imgname);
	if (locked)
		dnbd3_flag_reset(dev->connection_lock);
	return result;
}

static const struct block_device_operations dnbd3_blk_ops = {
	.owner = THIS_MODULE,
	.ioctl = dnbd3_blk_ioctl,
};

static void dnbd3_add_queue(dnbd3_device_t *dev, struct request *rq)
{
	unsigned long irqflags;

	spin_lock_irqsave(&dev->send_queue_lock, irqflags);
	list_add_tail(&rq->queuelist, &dev->send_queue);
	spin_unlock_irqrestore(&dev->send_queue_lock, irqflags);
	spin_lock_irqsave(&dev->blk_lock, irqflags);
	queue_work(dev->send_wq, &dev->send_work);
	spin_unlock_irqrestore(&dev->blk_lock, irqflags);
}

/*
 * Linux kernel blk-mq driver function (entry point) to handle block IO requests
 */
static blk_status_t dnbd3_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd)
{
	struct request *rq = bd->rq;
	dnbd3_device_t *dev = rq->q->queuedata;
	struct dnbd3_cmd *cmd;

	if (dev->imgname == NULL || !device_active(dev))
		return BLK_STS_IOERR;

	if (req_op(rq) != REQ_OP_READ)
		return BLK_STS_IOERR;

	if (PROBE_COUNT_TIMEOUT > 0 && dev->panic_count >= PROBE_COUNT_TIMEOUT)
		return BLK_STS_TIMEOUT;

	if (rq_data_dir(rq) != READ)
		return BLK_STS_NOTSUPP;

	cmd = blk_mq_rq_to_pdu(rq);
	cmd->handle = (u64)blk_mq_unique_tag(rq) | (((u64)jiffies) << 32);
	blk_mq_start_request(rq);
	dnbd3_add_queue(dev, rq);
	return BLK_STS_OK;
}

static enum blk_eh_timer_return dnbd3_rq_timeout(struct request *req
#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 0, 0) \
	&& !RHEL_CHECK_VERSION(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0))
						 , bool reserved
#endif
	)
{
	unsigned long irqflags;
	struct request *rq_iter;
	bool found = false;
	dnbd3_device_t *dev = req->q->queuedata;

	spin_lock_irqsave(&dev->send_queue_lock, irqflags);
	list_for_each_entry(rq_iter, &dev->send_queue, queuelist) {
		if (rq_iter == req) {
			found = true;
			break;
		}
	}
	spin_unlock_irqrestore(&dev->send_queue_lock, irqflags);
	// If still in send queue, do nothing
	if (found)
		return BLK_EH_RESET_TIMER;

	spin_lock_irqsave(&dev->recv_queue_lock, irqflags);
	list_for_each_entry(rq_iter, &dev->recv_queue, queuelist) {
		if (rq_iter == req) {
			found = true;
			list_del_init(&req->queuelist);
			break;
		}
	}
	spin_unlock_irqrestore(&dev->recv_queue_lock, irqflags);
	if (!found) {
		dev_err(dnbd3_device_to_dev(dev), "timeout request neither found in send nor recv queue, ignoring\n");
		// Assume it was finished concurrently
		return BLK_EH_DONE;
	}
	// Add to send queue again and trigger work, reset timeout
	dnbd3_add_queue(dev, req);
	return BLK_EH_RESET_TIMER;
}

static
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
const
#endif
struct blk_mq_ops dnbd3_mq_ops = {
	.queue_rq = dnbd3_queue_rq,
	.timeout = dnbd3_rq_timeout,
};

int dnbd3_blk_add_device(dnbd3_device_t *dev, int minor)
{
	int ret;

	memset(dev, 0, sizeof(*dev));
	dev->index = minor;
	// lock for imgname, cur_server etc.
	spin_lock_init(&dev->blk_lock);
	spin_lock_init(&dev->send_queue_lock);
	spin_lock_init(&dev->recv_queue_lock);
	INIT_LIST_HEAD(&dev->send_queue);
	INIT_LIST_HEAD(&dev->recv_queue);
	dnbd3_flag_reset(dev->connection_lock);
	dnbd3_flag_reset(dev->discover_running);
	mutex_init(&dev->alt_servers_lock);
	dnbd3_net_work_init(dev);

	// memset has done this already but I like initial values to be explicit
	dev->imgname = NULL;
	dev->rid = 0;
	dev->update_available = false;
	dev->panic = false;
	dev->panic_count = 0;
	dev->reported_size = 0;

	// set up tag_set for blk-mq
	dev->tag_set.ops = &dnbd3_mq_ops;
	dev->tag_set.nr_hw_queues = 1;
	dev->tag_set.queue_depth = 128;
	dev->tag_set.numa_node = NUMA_NO_NODE;
	dev->tag_set.cmd_size = sizeof(struct dnbd3_cmd);
	dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
	dev->tag_set.driver_data = dev;
	dev->tag_set.timeout = BLOCK_LAYER_TIMEOUT * HZ;

	ret = blk_mq_alloc_tag_set(&dev->tag_set);
	if (ret) {
		dev_err(dnbd3_device_to_dev(dev), "blk_mq_alloc_tag_set failed\n");
		goto out;
	}

#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 14, 0)
	// set up blk-mq and disk
	dev->disk = blk_mq_alloc_disk(&dev->tag_set, dev);
	if (IS_ERR(dev->disk)) {
		dev_err(dnbd3_device_to_dev(dev), "blk_mq_alloc_disk failed\n");
		ret = PTR_ERR(dev->disk);
		goto out_cleanup_tags;
	}
	dev->queue = dev->disk->queue;
#else
	// set up blk-mq
	dev->queue = blk_mq_init_queue(&dev->tag_set);
	if (IS_ERR(dev->queue)) {
		ret = PTR_ERR(dev->queue);
		dev_err(dnbd3_device_to_dev(dev), "blk_mq_init_queue failed\n");
		goto out_cleanup_tags;
	}
	dev->queue->queuedata = dev;
#endif

	blk_queue_logical_block_size(dev->queue, DNBD3_BLOCK_SIZE);
	blk_queue_physical_block_size(dev->queue, DNBD3_BLOCK_SIZE);

#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 17, 0)
	blk_queue_flag_set(QUEUE_FLAG_NONROT, dev->queue);
	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, dev->queue);
#else
	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, dev->queue);
#endif

#define ONE_MEG (1048576)
	blk_queue_max_segment_size(dev->queue, ONE_MEG);
	blk_queue_max_segments(dev->queue, 0xffff);
	blk_queue_max_hw_sectors(dev->queue, ONE_MEG / DNBD3_BLOCK_SIZE);
	dev->queue->limits.max_sectors = 256;
#undef ONE_MEG

#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 14, 0)
	// set up disk
	dev->disk = alloc_disk(1);
	if (!dev->disk) {
		dev_err(dnbd3_device_to_dev(dev), "alloc_disk failed\n");
		ret = -ENOMEM;
		goto out_cleanup_queue;
	}
#endif

#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 17, 0) \
	|| (LINUX_VERSION_CODE < KERNEL_VERSION(5, 16, 0) && LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 132)) \
	|| RHEL_CHECK_VERSION(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0))
	dev->disk->flags |= GENHD_FL_NO_PART;
#else
	dev->disk->flags |= GENHD_FL_NO_PART_SCAN;
#endif
	dev->disk->major = major;
	dev->disk->first_minor = minor;
	dev->disk->minors = 1;
	dev->disk->fops = &dnbd3_blk_ops;
	dev->disk->private_data = dev;
	dev->disk->queue = dev->queue;
	sprintf(dev->disk->disk_name, "dnbd%d", minor);
	set_capacity(dev->disk, 0);
	set_disk_ro(dev->disk, 1);

#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) \
	|| RHEL_CHECK_VERSION(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0))
	ret = add_disk(dev->disk);
	if (ret != 0)
		goto out_cleanup_queue;
#else
	add_disk(dev->disk);
#endif

	// set up sysfs
	dnbd3_sysfs_init(dev);

	return 0;

out_cleanup_queue:
#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 14, 0)
	blk_cleanup_queue(dev->queue);
#elif LINUX_VERSION_CODE < KERNEL_VERSION(6, 0, 0) \
	&& !RHEL_CHECK_VERSION(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0))
	blk_cleanup_disk(dev->disk);
#else
	put_disk(dev->disk);
#endif
out_cleanup_tags:
	blk_mq_free_tag_set(&dev->tag_set);
out:
	mutex_destroy(&dev->alt_servers_lock);
	return ret;
}

int dnbd3_blk_del_device(dnbd3_device_t *dev)
{
	while (!dnbd3_flag_get(dev->connection_lock))
		schedule();
	dnbd3_close_device(dev);
	dnbd3_sysfs_exit(dev);
	del_gendisk(dev->disk);
#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 14, 0)
	blk_cleanup_queue(dev->queue);
	put_disk(dev->disk);
#elif LINUX_VERSION_CODE < KERNEL_VERSION(6, 0, 0) \
	&& !RHEL_CHECK_VERSION(RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(9, 0))
	blk_cleanup_disk(dev->disk);
#else
	put_disk(dev->disk);
#endif
	blk_mq_free_tag_set(&dev->tag_set);
	mutex_destroy(&dev->alt_servers_lock);
	return 0;
}

void dnbd3_blk_requeue_all_requests(dnbd3_device_t *dev)
{
	struct request *blk_request;
	unsigned long flags;
	struct list_head local_copy;
	int count = 0;

	INIT_LIST_HEAD(&local_copy);
	spin_lock_irqsave(&dev->recv_queue_lock, flags);
	while (!list_empty(&dev->recv_queue)) {
		blk_request = list_entry(dev->recv_queue.next, struct request, queuelist);
		list_del_init(&blk_request->queuelist);
		list_add(&blk_request->queuelist, &local_copy);
		count++;
	}
	spin_unlock_irqrestore(&dev->recv_queue_lock, flags);
	if (count)
		dev_info(dnbd3_device_to_dev(dev), "re-queueing %d requests\n", count);
	while (!list_empty(&local_copy)) {
		blk_request = list_entry(local_copy.next, struct request, queuelist);
		list_del_init(&blk_request->queuelist);
		spin_lock_irqsave(&dev->send_queue_lock, flags);
		list_add_tail(&blk_request->queuelist, &dev->send_queue);
		spin_unlock_irqrestore(&dev->send_queue_lock, flags);
	}
	// Do this even if we didn't move anything from the recv list to the send
	// list. It might have already contained something, which needs to be
	// re-requested anyways if this was called because of a server switch.
	spin_lock_irqsave(&dev->blk_lock, flags);
	queue_work(dev->send_wq, &dev->send_work);
	spin_unlock_irqrestore(&dev->blk_lock, flags);
}

void dnbd3_blk_fail_all_requests(dnbd3_device_t *dev)
{
	struct request *blk_request;
	unsigned long flags;
	struct list_head local_copy;
	int count = 0;

	INIT_LIST_HEAD(&local_copy);
	spin_lock_irqsave(&dev->recv_queue_lock, flags);
	while (!list_empty(&dev->recv_queue)) {
		blk_request = list_entry(dev->recv_queue.next, struct request, queuelist);
		list_del_init(&blk_request->queuelist);
		list_add(&blk_request->queuelist, &local_copy);
		count++;
	}
	spin_unlock_irqrestore(&dev->recv_queue_lock, flags);
	spin_lock_irqsave(&dev->send_queue_lock, flags);
	while (!list_empty(&dev->send_queue)) {
		blk_request = list_entry(dev->send_queue.next, struct request, queuelist);
		list_del_init(&blk_request->queuelist);
		list_add(&blk_request->queuelist, &local_copy);
		count++;
	}
	spin_unlock_irqrestore(&dev->send_queue_lock, flags);
	if (count)
		dev_info(dnbd3_device_to_dev(dev), "failing %d requests\n", count);
	while (!list_empty(&local_copy)) {
		blk_request = list_entry(local_copy.next, struct request, queuelist);
		list_del_init(&blk_request->queuelist);
		blk_mq_end_request(blk_request, BLK_STS_IOERR);
	}
}
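
/*
 * Illustrative userspace sketch (not part of this driver): roughly how the
 * ioctl interface handled by dnbd3_blk_ioctl() above could be driven to
 * attach an image. Field and ioctl names are taken from the handler; the
 * header name "dnbd3_ioctl.h", the device path and the meaning of rid = 0
 * ("latest revision") are assumptions and may differ from the real dnbd3
 * userspace headers and tools.
 *
 *	#include <fcntl.h>
 *	#include <string.h>
 *	#include <sys/ioctl.h>
 *	#include "dnbd3_ioctl.h"	// hypothetical header providing dnbd3_ioctl_t, dnbd3_host_t, IOCTL_*
 *
 *	int dnbd3_attach(const char *image, dnbd3_host_t *server)
 *	{
 *		dnbd3_ioctl_t msg;
 *		int fd = open("/dev/dnbd0", O_RDONLY);
 *
 *		if (fd < 0)
 *			return -1;
 *		memset(&msg, 0, sizeof(msg));
 *		msg.len = sizeof(msg);			// the handler rejects the call if this does not match
 *		msg.imgname = (char *)image;		// copied into kernel space by the handler
 *		msg.imgnamelen = strlen(image);
 *		msg.rid = 0;				// assumed to mean "latest revision"
 *		msg.read_ahead_kb = 512;
 *		msg.use_server_provided_alts = 1;
 *		msg.hosts[0] = *server;
 *		msg.hosts_num = 1;			// 1..NUMBER_SERVERS hosts may be passed
 *		// On success, IOCTL_OPEN returns the index of the alt server that answered first.
 *		return ioctl(fd, IOCTL_OPEN, &msg);
 *	}
 */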