summaryrefslogblamecommitdiffstats
path: root/src/kernel/core.c
blob: 8bb23d9576d857b70159ccd1a68268b40e7f02dd (plain) (tree)

















































                                                               
                  


                         
 

                                  

                                

                                              
                                   

          



                                             


                                                      

                                   



                                                  




                                          



                                                             
                                       
                              
                     

                           
 

                                                                           
 
      
 
                                                  
                                                         
                                                  
                                                      




                                     
                                                                                                        
                                                                            
                                                                                                  

                                                                              


                                               
                 






                                                                          
                                                       





                                 
                                   
                                    
                                             
                                                 









                                                              
                                                                     





                                                                                                 
                                     


                   






                                                                     


                                                         
 
                               
                            



                                                     
                              






                                 








                                                                             


                                                     
                            


                               







                                                                       
 


                                                      
                                          





                                                  

                                                                                 







                                                                              
                                                 


                           



                                   

                                         





                                           








                                                               
 
                          
                                                               


                                  

                                                          


                                                        




                                                                     
                                          
                                   
                 

                                                                                
                                         
                                   




                                                                           
                                           
                         


                                                                                
                                                 
                                           


                                                        

                                                                          





                                      
                                             







                                                                                    
                                                                                   
                         

                                                                     
                                                         
                                                                       

                                               









                                                                                





                                                        
                                              
                                                   




                                            

                                       


                          
                                               




                                 
                                                    






                                                                    




                                                                            





                                               
                                                  



                           
                                                             



                                        
      






                                    



                                              









                                                  




                                         
                                                         



                                
              
                                         
 

                                      
 

                                                  
                                          

         

                             
                                                         
                                    



                                                                             
                                                   








                                         

                                                                         

                                              
                                                         



























                                                                   




                                   
                                                  

                              
 
                                        







                                            
               
                   
                                        
                                                 




                   


                                           


                                  
                                        
 







                                                                       
 
                           
                                               



                               

                                                                
                                                       





                                            
                                                


                            
                                                                  



                                         
                                                


                                         
                                                       



                 





                                                     


                                                          
                                         




                                          



                                             
                                                      
 
                                         

                                

                                  



                                     
                                                   


                                          




                                         

 


                                    

                                   
                                   
                            
                                        





                                                                  
                                                                               

                                            



                                                           




                                          
                      
 

                                    
                                      










                                                                                         
/*
 * This file is part of the Distributed Network Block Device 3
 *
 * Copyright(c) 2019 Frederic Robra <frederic@robra.org>
 * Parts copyright 2011-2012 Johann Latocha <johann@latocha.de>
 *
 * This file may be licensed under the terms of the
 * GNU General Public License Version 2 (the ``GPL'').
 *
 * Software distributed under the License is distributed
 * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
 * express or implied. See the GPL for the specific language
 * governing rights and limitations.
 *
 * You should have received a copy of the GPL along with this
 * program. If not, go to http://www.gnu.org/licenses/gpl.html
 * or write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 *
 */

#include <linux/major.h>

#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/fs.h>
#include <linux/bio.h>
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/ioctl.h>
#include <linux/mutex.h>
#include <linux/compiler.h>
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <net/sock.h>
#include <linux/net.h>
#include <linux/kthread.h>
#include <linux/types.h>
#include <linux/debugfs.h>
#include <linux/blk-mq.h>

#include <linux/uaccess.h>
#include <asm/types.h>

#include "dnbd3.h"
#include "sysfs.h"
#include "clientconfig.h"
#include "net.h"



/* Module-wide workqueue; created in dnbd3_init(), destroyed in dnbd3_exit(). */
struct workqueue_struct *dnbd3_wq;
/* Maps a minor number to its struct dnbd3_device (see dnbd3_add_device()). */
DEFINE_IDR(dnbd3_index_idr);
/* Taken by dnbd3_init()/dnbd3_exit() around accesses to dnbd3_index_idr. */
DEFINE_MUTEX(dnbd3_index_mutex);

/*
 * Number of devices to create. This is a signed int so that it matches the
 * "int" type passed to module_param() and so that the negative-value check
 * in dnbd3_init() can actually trigger.
 */
static int max_devs = NUMBER_DEVICES;
/* Array holding all devices, allocated in dnbd3_init(). */
static struct dnbd3_device *device;
/* Block major number obtained from register_blkdev() in dnbd3_init(). */
int major;

/**
 * dnbd3_requeue_cmd - hand a command back to blk-mq, at most once
 * @cmd: the command to requeue
 *
 * The requed flag of the command prevents a second requeue until the flag
 * is cleared again (dnbd3_queue_rq() and dnbd3_init_request() clear it).
 */
static void dnbd3_requeue_cmd(struct dnbd3_cmd *cmd)
{
	struct request *rq;

	/* already requeued, nothing to do */
	if (cmd->requed)
		return;

	rq = blk_mq_rq_from_pdu(cmd);
	cmd->requed = true;
	blk_mq_requeue_request(rq, true);
}

/**
 * dnbd3_handle_cmd - handles a mq command
 * @cmd: the cmd to send
 * @index: the index of the queue
 */
static int dnbd3_handle_cmd(struct dnbd3_cmd *cmd, int index)
{
	struct request *req = blk_mq_rq_from_pdu(cmd);
	struct dnbd3_device *dev = cmd->dnbd3;
	struct dnbd3_sock *sock = NULL;
	bool first_try = true;
	int ret = -1;
	int i;
	int sock_alive = 0;

	debug_dev(dev, "handle request at position %lu, size %d, index %d",
			blk_rq_pos(req), blk_rq_bytes(req), index);

again:

	for (i = 0; i < NUMBER_CONNECTIONS; i++) {
		if (dnbd3_is_sock_alive(dev->socks[i])) {
			if (index == sock_alive) {
				sock = &dev->socks[i];
			}
			sock_alive++;
		}
	}

	if (!sock) { // TODO let the mq queues be the same number as NUMBER_CONNECTIONS than handle them
		warn_dev(dev, "index is %d but no socket was found", index);
		dev_err_ratelimited(disk_to_dev(dev->disk), "attempted send on invalid socket\n");
		if (sock_alive > 0) {
			blk_mq_update_nr_hw_queues(&dev->tag_set, sock_alive);
			dnbd3_requeue_cmd(cmd);
			ret = 0;
			goto out;
		}
		if (first_try) {
			debug_dev(dev, "no socket found, going to sleep");
			msleep(SOCKET_TIMEOUT_CLIENT_DATA * 1000);
			first_try = false;
			goto again;
		}
		error_dev(dev, "failed to find a socket, end request");
		blk_mq_end_request(req, BLK_STS_IOERR);
		return -EINVAL;
	}


	cmd->status = BLK_STS_OK;

	mutex_lock(&sock->tx_lock);
	if (unlikely(!sock->sock)) {
		mutex_unlock(&sock->tx_lock);
		warn_sock(sock, "not connected");
		return -EIO;
	}

	blk_mq_start_request(req);
	if (unlikely(sock->pending && sock->pending != req)) {
		dnbd3_requeue_cmd(cmd);
		ret = 0;
		goto out;
	}

	ret = dnbd3_send_request(sock, blk_mq_rq_from_pdu(cmd), cmd);
	if (ret == -EAGAIN) {
		dev_err_ratelimited(disk_to_dev(dev->disk), "request send failed, requeueing\n");
		dnbd3_requeue_cmd(cmd);
		ret = 0;
	}
out:
	mutex_unlock(&sock->tx_lock);
	return ret;
}

/**
 * dnbd3_queue_rq - blk-mq entry point for dispatching one request
 * @hctx: state for a hardware queue facing the hardware block device
 * @bd: the queue data including the request
 *
 * Clears the requeue flag of the command and passes it to
 * dnbd3_handle_cmd() while holding the per-command lock.
 *
 * Return: BLK_STS_IOERR if dnbd3_handle_cmd() returned a negative value,
 * BLK_STS_OK otherwise.
 */
static blk_status_t dnbd3_queue_rq(struct blk_mq_hw_ctx *hctx,
		const struct blk_mq_queue_data *bd)
{
	struct dnbd3_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
	blk_status_t status;

	mutex_lock(&cmd->lock);
	cmd->requed = false;
	status = dnbd3_handle_cmd(cmd, hctx->queue_num) < 0 ?
			BLK_STS_IOERR : BLK_STS_OK;
	mutex_unlock(&cmd->lock);

	return status;
}

/**
 * dnbd3_init_request - set up the driver private part of a mq request
 * @set: the mq tag set; its driver_data is stored as the owning device
 * @rq: the request whose command (pdu) is initialized
 * @hctx_idx: unused
 * @numa_node: unused
 *
 * Return: always 0.
 */
static int dnbd3_init_request(struct blk_mq_tag_set *set, struct request *rq,
		unsigned int hctx_idx, unsigned int numa_node)
{
	struct dnbd3_cmd *pdu = blk_mq_rq_to_pdu(rq);

	mutex_init(&pdu->lock);
	pdu->requed = false;
	pdu->dnbd3 = set->driver_data;

	return 0;
}

/**
 * dnbd3_xmit_timeout - timeout function for mq
 * @req: the timed out request
 * @reserved: unused
 *
 * If the command is currently being handled (its lock is taken), the timer
 * is simply restarted. Otherwise the command is requeued if at least one
 * socket is alive, or the request is ended with BLK_STS_TIMEOUT.
 *
 * Return: BLK_EH_RESET_TIMER if the command lock could not be taken,
 * BLK_EH_DONE otherwise.
 */
static enum blk_eh_timer_return dnbd3_xmit_timeout(struct request *req,
		bool reserved)
{
	struct dnbd3_cmd *cmd = blk_mq_rq_to_pdu(req);
	struct dnbd3_device *dev = cmd->dnbd3;
	int i;

	warn_dev(dev, "received timeout");

	if (!mutex_trylock(&cmd->lock)) {
		return BLK_EH_RESET_TIMER;
	}

	for (i = 0; i < NUMBER_CONNECTIONS; i++) {
		if (dnbd3_is_sock_alive(dev->socks[i])) {
			info_sock(&dev->socks[i], "reset request to new socket");
			dnbd3_requeue_cmd(cmd);
			/*
			 * Release the lock taken above; dnbd3_queue_rq()
			 * takes it again when the command is redispatched.
			 */
			mutex_unlock(&cmd->lock);
			return BLK_EH_DONE;
		}
	}

	dev_err_ratelimited(disk_to_dev(dev->disk), "connection timed out\n");
	cmd->status = BLK_STS_IOERR;
	/* unlock before ending: the command lives inside the request */
	mutex_unlock(&cmd->lock);
	/*
	 * dnbd3_mq_ops has no .complete handler, so the request is ended
	 * directly and exactly once instead of going through
	 * blk_mq_complete_request() first.
	 */
	blk_mq_end_request(req, BLK_STS_TIMEOUT);
	return BLK_EH_DONE;
}

/**
 * dnbd3_mq_ops - multiqueue operations of the dnbd3 block device
 *
 * Installed as tag_set.ops in dnbd3_add_device().
 */
static struct blk_mq_ops dnbd3_mq_ops = {
	.init_request	= dnbd3_init_request,
	.queue_rq	= dnbd3_queue_rq,
	.timeout	= dnbd3_xmit_timeout,
};



/**
 * dnbd3_ioctl - the ioctl function of the dnbd3 kernel module
 * @bdev: the block device
 * @mode: unused
 * @cmd: the ioctl command
 * @arg: user space address of a dnbd3_ioctl_t, or 0 if no message is passed
 *
 * If @arg is set, the message and the image name it points to are copied
 * into kernel memory first. The command itself is then executed while
 * holding the device lock.
 *
 * Return: the result of the executed command, or a negative errno if the
 * message could not be copied or the command is invalid/unhandled.
 */
static int dnbd3_ioctl(struct block_device *bdev, fmode_t mode,
		unsigned int cmd, unsigned long arg)
{
	int result = -EIO;
	struct dnbd3_device *dev = bdev->bd_disk->private_data;
	char *imgname = NULL;
	dnbd3_ioctl_t *msg = NULL;


	debug_dev(dev, "ioctl cmd %i, arg %lu", cmd, arg);

	if (arg != 0) {
		msg = kmalloc(sizeof(*msg), GFP_KERNEL);
		if (msg == NULL) {
			return -ENOMEM;
		}
		/*
		 * Read the first two bytes only, so that msg->len can be
		 * checked against the structure size this module was built
		 * with before the whole message is copied (presumably len is
		 * the leading 16 bit field of dnbd3_ioctl_t).
		 */
		result = copy_from_user(msg, (const void __user *)arg, 2);
		if (result != 0	|| msg->len != sizeof(*msg)) {
			result = -ENOEXEC;
			goto error;
		}
		result = copy_from_user(msg, (const void __user *)arg,
				sizeof(*msg));
		if (result != 0) {
			result = -ENOENT;
			goto error;
		}
		if (msg->imgname != NULL && msg->imgnamelen > 0) {
			/* one extra byte for the terminating '\0' */
			imgname = kmalloc(msg->imgnamelen + 1, GFP_KERNEL);
			if (imgname == NULL) {
				result = -ENOMEM;
				goto error;
			}
			result = copy_from_user(imgname,
					(const void __user *)msg->imgname,
					msg->imgnamelen);
			if (result != 0) {
				result = -ENOENT;
				goto error;
			}
			imgname[msg->imgnamelen] = '\0';

			debug_dev(dev, "ioctl image name of len %i is %s",
					(int)msg->imgnamelen, imgname);
		}
	}

	mutex_lock(&dev->device_lock);
	switch (cmd) {
	case IOCTL_OPEN:
		debug_dev(dev, "ioctl open");
		if (dev->imgname != NULL) {
			result = -EBUSY;
		} else if (imgname == NULL) {
			result = -EINVAL;
		} else if (msg == NULL) {
			result = -EINVAL;
		} else {
			if (sizeof(msg->host) != sizeof(dev->initial_server.host)) {
				warn_dev(dev, "odd size bug#1 triggered in ioctl");
			}
			memcpy(&dev->initial_server.host, &msg->host,
					sizeof(msg->host));
			dev->initial_server.failures = 0;
			dnbd3_set_rtt_unreachable(&dev->initial_server);
			dev->imgname = imgname;
			dev->rid = msg->rid;
			dev->use_server_provided_alts =
					msg->use_server_provided_alts;
			/*
			 * forget all alt servers on explicit connect, set first
			 * alt server to initial server
			 */
			memset(dev->alt_servers, 0, sizeof(dev->alt_servers[0])
					* NUMBER_SERVERS);
			memcpy(dev->alt_servers, &dev->initial_server,
					sizeof(dev->alt_servers[0]));
			result = dnbd3_net_connect(dev);
			/* the device owns the name now, do not free it below */
			imgname = NULL;
		}
		break;

	case IOCTL_CLOSE:
		debug_dev(dev, "ioctl close");
		result = dnbd3_net_disconnect(dev);
		set_capacity(dev->disk, 0);
		kfree(dev->imgname);
		dev->imgname = NULL;
		dev->rid = 0;
		dev->reported_size = 0;
		break;

	case IOCTL_SWITCH:
		debug_dev(dev, "ioctl switch");
		result = -EINVAL;
		break;

	case IOCTL_ADD_SRV:
	case IOCTL_REM_SRV:
		debug_dev(dev, "ioctl add/rem srv");
		if (dev->imgname == NULL) {
			result = -ENOENT;
		} else if (dev->new_servers_num >= NUMBER_SERVERS) {
			result = -EAGAIN;
		} else if (msg == NULL) {
			result = -EINVAL;
		} else {
			memcpy(&dev->new_servers[dev->new_servers_num].host,
					&msg->host, sizeof(msg->host));
			/* 0 = ADD, 1 = REM */
			dev->new_servers[dev->new_servers_num].failures =
					(cmd == IOCTL_ADD_SRV ? 0 : 1);
			++dev->new_servers_num;
			result = 0;
		}
		break;

	case BLKFLSBUF:
		debug_dev(dev, "ioctl blkflsbuf");
		result = 0;
		break;

	default:
		warn_dev(dev, "ioctl unhandled cmd %d", cmd);
		result = -EIO;
		break;
	}
	mutex_unlock(&dev->device_lock);
error:
	/* kfree() accepts NULL, so both pointers can be freed unconditionally */
	kfree(msg);
	kfree(imgname);
	return result;

}


/**
 * dnbd3_fops - block device operations of a dnbd3 disk
 *
 * Both the native and the compat ioctl entry point are served by
 * dnbd3_ioctl().
 */
static struct block_device_operations dnbd3_fops = {
	.owner		= THIS_MODULE,
	.compat_ioctl	= dnbd3_ioctl,
	.ioctl		= dnbd3_ioctl,
};




/**
 * dnbd3_add_device - add a dnbd3 device
 * @dev: the device; the memory stays owned by the caller
 * @minor: the minor number of the device
 *
 * Initializes the device lock and the sockets, allocates the gendisk,
 * registers @dev under @minor in dnbd3_index_idr, sets up the blk-mq tag set
 * and request queue and finally adds the disk and its sysfs entries.
 *
 * On failure everything allocated here is released again. @dev itself is
 * never freed by this function: dnbd3_init() passes pointers into the
 * device array, which must not be handed to kfree() individually.
 *
 * Return: @minor on success, a negative errno on failure.
 */
int dnbd3_add_device(struct dnbd3_device *dev, int minor)
{
	struct gendisk *disk;
	struct request_queue *q;
	int err = -ENOMEM;
	int i;

	debug("adding device %d", minor);

	mutex_init(&dev->device_lock);
	mutex_lock(&dev->device_lock);

	for (i = 0; i < NUMBER_CONNECTIONS; i++) {
		dev->socks[i].device = dev;
		dev->socks[i].sock_nr = i;
	}

	disk = alloc_disk(1);
	if (!disk) {
		error_dev(dev, "allocating disc failed");
		goto out_unlock;
	}

	/* request exactly the id "minor": the range is [minor, minor + 1) */
	err = idr_alloc(&dnbd3_index_idr, dev, minor, minor + 1, GFP_KERNEL);
	if (err == -ENOSPC) {
		error_dev(dev, "idr alloc failed");
		err = -EEXIST;
	}

	if (err < 0)
		goto out_free_disk;

	dev->minor = minor;
	dev->disk = disk;
	dev->tag_set.ops = &dnbd3_mq_ops;
	/* this can be changed later with blk_mq_update_nr_hw_queues() */
	dev->tag_set.nr_hw_queues = 1;
	dev->tag_set.queue_depth = 128;
	dev->tag_set.numa_node = NUMA_NO_NODE;
	dev->tag_set.cmd_size = sizeof(struct dnbd3_cmd);
	dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
		BLK_MQ_F_SG_MERGE | BLK_MQ_F_BLOCKING;
	dev->tag_set.driver_data = dev;

	err = blk_mq_alloc_tag_set(&dev->tag_set);
	if (err)
		goto out_free_idr;

	q = blk_mq_init_queue(&dev->tag_set);
	if (IS_ERR(q)) {
		err = PTR_ERR(q);
		goto out_free_tags;
	}
	disk->queue = q;

	/*
	 * Tell the block layer that we are not a rotational device
	 */
	blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue);
	disk->queue->limits.discard_granularity = 0;
	disk->queue->limits.discard_alignment = 0;
	blk_queue_max_discard_sectors(disk->queue, 0);
	blk_queue_max_segment_size(disk->queue, UINT_MAX);
	blk_queue_max_segments(disk->queue, USHRT_MAX);
	blk_queue_max_hw_sectors(disk->queue, 65536);
	disk->queue->limits.max_sectors = 256;

	INIT_LIST_HEAD(&dev->list);
	disk->major = major;
	disk->first_minor = minor;
	disk->fops = &dnbd3_fops;
	disk->private_data = dev;
	snprintf(disk->disk_name, sizeof(disk->disk_name), "dnbd%i", minor);
	add_disk(disk);
	dnbd3_sysfs_init(dev);

	mutex_unlock(&dev->device_lock);
	return minor;

out_free_tags:
	blk_mq_free_tag_set(&dev->tag_set);
out_free_idr:
	idr_remove(&dnbd3_index_idr, minor);
out_free_disk:
	/* do not leave a stale pointer to the released disk behind */
	dev->disk = NULL;
	put_disk(disk);
out_unlock:
	/*
	 * dev is not freed here: it belongs to the caller and is still
	 * accessed by the unlock and the log message below.
	 */
	mutex_unlock(&dev->device_lock);
	warn_dev(dev, "failed to create device");
	return err;
}



/**
 * dnbd3_init - init the dnbd3 kernel module
 *
 * Creates the module workqueue, allocates the device array, registers the
 * block major and adds max_devs devices. Resources acquired so far are
 * released again if a later step fails.
 *
 * Return: 0 on success, a negative errno on failure.
 */
static int __init dnbd3_init(void)
{
	int err;
	int i;

	debug("starting kernel module");

	/* validate the parameter before anything is allocated */
	if (max_devs < 0) {
		error("max_devs must be >= 0");
		return -EINVAL;
	}

	/*
	 * allocate a workqueue/thread for this module
	 * WQ_MEM_RECLAIM - the queue may be used on memory reclaim paths
	 * WQ_FREEZABLE - the queue is frozen during system suspend
	 * WQ_UNBOUND - not bound to a certain CPU
	 */
	dnbd3_wq = alloc_workqueue("kdnbd3",
			WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND, 0);
	if (!dnbd3_wq) {
		error("failed to create dnbd3 workqueue");
		return -ENOMEM;
	}

	device = kcalloc(max_devs, sizeof(*device), GFP_KERNEL);
	if (!device) {
		error("failed to create dnbd3 device");
		err = -ENOMEM;
		goto out_destroy_wq;
	}

	/*
	 * initialize block device; with major 0 a free major is picked and
	 * returned, a failure is reported as a negative errno
	 */
	major = register_blkdev(0, "dnbd3");
	if (major <= 0) {
		error("register_blkdev failed");
		err = major < 0 ? major : -EIO;
		goto out_free_devices;
	}

	debug("kernel module loaded. Machine type: " ENDIAN_MODE);

	/* add max_devs devices */
	mutex_lock(&dnbd3_index_mutex);
	for (i = 0; i < max_devs; i++) {
		/* a failed device is skipped, the remaining ones stay usable */
		if (dnbd3_add_device(&device[i], i) < 0) {
			error("failed to add a dnbd3 device");
		}
	}
	mutex_unlock(&dnbd3_index_mutex);

	info("init successful (%i devices)", max_devs);

	return 0;

out_free_devices:
	kfree(device);
	device = NULL;
out_destroy_wq:
	destroy_workqueue(dnbd3_wq);
	dnbd3_wq = NULL;
	return err;
}

/**
 * dnbd3_exit_cb - idr_for_each() callback collecting devices for removal
 * @id: the id of the entry (unused)
 * @ptr: the entry, a struct dnbd3_device
 * @data: the callback data, the list head the device is appended to
 *
 * Return: always 0.
 */
static int dnbd3_exit_cb(int id, void *ptr, void *data)
{
	struct dnbd3_device *dev = ptr;
	struct list_head *del_list = data;

	list_add_tail(&dev->list, del_list);

	return 0;
}

/**
 * dnbd3_dev_remove - tear down a single dnbd3 device
 * @dev: the device to remove
 *
 * Disconnects the device, removes its disk together with the request queue
 * and tag set (if a disk exists), frees the image name and destroys the
 * device lock.
 */
static void dnbd3_dev_remove(struct dnbd3_device *dev)
{
	struct gendisk *gd = dev->disk;

	dnbd3_net_disconnect(dev);

	if (gd) {
		struct request_queue *queue = gd->queue;

		del_gendisk(gd);
		blk_cleanup_queue(queue);
		blk_mq_free_tag_set(&dev->tag_set);
		gd->private_data = NULL;
		put_disk(gd);
	}

	/* kfree() ignores NULL, so no check is required */
	kfree(dev->imgname);
	dev->imgname = NULL;

	mutex_destroy(&dev->device_lock);
}

/**
 * dnbd3_exit - exit the dnbd3 module
 *
 * Collects all devices registered in dnbd3_index_idr on a local list,
 * removes them one by one and then releases the module-wide resources.
 */
static void __exit dnbd3_exit(void)
{
	struct dnbd3_device *dev, *next;
	LIST_HEAD(del_list);

	debug("stopping kernel module");

	/* gather every registered device while holding the index lock */
	mutex_lock(&dnbd3_index_mutex);
	idr_for_each(&dnbd3_index_idr, &dnbd3_exit_cb, &del_list);
	mutex_unlock(&dnbd3_index_mutex);

	list_for_each_entry_safe(dev, next, &del_list, list) {
		dnbd3_sysfs_exit(dev);
		list_del_init(&dev->list);

		mutex_lock(&dnbd3_index_mutex);
		idr_remove(&dnbd3_index_idr, dev->minor);
		mutex_unlock(&dnbd3_index_mutex);

		dnbd3_dev_remove(dev);
	}

	idr_destroy(&dnbd3_index_idr);
	unregister_blkdev(major, "dnbd3");

	kfree(device);

	destroy_workqueue(dnbd3_wq);

	info("stopped kernel module");
}


/* module entry and exit points */
module_init(dnbd3_init);
module_exit(dnbd3_exit);

MODULE_DESCRIPTION("Distributed Network Block Device 3");
MODULE_LICENSE("GPL");

/*
 * max_devs is exposed as an int parameter with mode 0444 (read-only in
 * sysfs); dnbd3_init() creates that many devices.
 */
module_param(max_devs, int, 0444);
MODULE_PARM_DESC(max_devs, "number of network block devices to initialize (default: 8)");