summaryrefslogblamecommitdiffstats
path: root/src/kernel/net.c
blob: e4ab60862fda4303659ea378bc400c2d36ac5b5a (plain) (tree)























                                                               
                       



                         





                                                                                                                                 
                                                                         

                                                                                                                                   

                                                                          
                                                         








                                                   
                                        
                                                                       
 
                                                                                       
                                                                                         
                                                                                                                   
 
 
                                                                  







                                                                                       
                                                             

              
                                                                         

                                                            
                                                                                             



                 





                                                                                            
                                                               


                                              
                                                         


                                        
                                                         



                                                                                           

                                      
                          


                           


                        
                                           





                                                 





                                                                                                

                                                                        


                                                           
                                  
                                                                                                     


                                                                         

                                                                              




                                                                                                     


                            



                                             
                                                                                                     



                                                               
 
                                     


                                                                                          
                                                                                              
                                                                      


                           





                             
 



                                                                       
                                                                          



                                                                
         
 










                                                                                               


                           



                                                     

                           

                                                                                   


                                  

                                                                                                                                



                                                                   









                                                                                        



























                                                                                                                  
 

                              
                                   






                                          



































































































































                                                                                                                          
                   

                          
                                           
                               
                         















































                                                                                                                                  
                                                                                        



                                                
 
                                           

                                                                                                              



                                                                                         


                                   


                                          



                                                                                                       
                         

                                     



                                                                                                      
                         

                                    
                                                                                       
                                          
                                                                                                     

                                           








                                                                                               


                                                                                                       

                                           




                                                                                  
      
                                                                                        
                                                                                            
                                        
                                                

                                                                                          
                                        


                                                                                                                                   

                                                                                                                                     



                                                                                          


 
                                               
 



















                                                                                      

 
 
                                                            
 
                                                                                          
                                                                
                                                         

 





































                                                                                                                                   
                                                                             


                                                                          
                                                                              





                                                                                                                               





                                                                         



         
                                                            
 
                                                                                             
                                       
                 
                                                                           
                                         




                                      
                  


                                     
                                                                
 
                                                                     
 


                                                                                 
 
 


                                                                                                     


                                                       

















                                                                                                                 
                                                                                                          
                                                                                                                                                 

                                                                               
                                                                                             









                                                                               
                                                                                           







                                                                                                                                            















                                                              

                                

                                              



                                                       
                                                       








                                                                                                           
 
                                                                         





                                                                                                                                     


                                               
                                                                                           
                                                                                                                


                                               
























































                                                                                                                                               

                                   











                            

         
 
                                                                                       
 
                       
                               
 

                                                                 
                            


                                                                         
                            

         
                                                    

                            
                                                                                                         












                                                                                                  
                                                                                                          








                                                                                     
                                                                                                         



                                                                                     






                                         
                      

 
                                                                                      

              
                           

                                       













                                                                                                 
 








                                                                      
 
                             

                                                               
 
                                                                     

                                                                                        
                           
 



                                                                                                             
                                                                   
 





                                                                  





                                         


                      
 
                                                                                                                  
 
              



                                                                     
                 
                                                                 
                                     

                 



                                                                                    
                                                                  
                              
                                                                                              



                                                                                                                                
 
         
                                                  


                                                                              
 








                                                                                   

                                                            
                                    
         

                                   
 

                                                                                 
 



                                         
                        





                                                  

                       

                                                  
                                                                                 







                                              

                                               

                                              
                                                                   
                                             
 




                                                                          
 











                                                                                
/*
 * This file is part of the Distributed Network Block Device 3
 *
 * Copyright(c) 2019 Frederic Robra <frederic@robra.org>
 * Parts copyright 2011-2012 Johann Latocha <johann@latocha.de>
 *
 * This file may be licensed under the terms of the
 * GNU General Public License Version 2 (the ``GPL'').
 *
 * Software distributed under the License is distributed
 * on an ``AS IS'' basis, WITHOUT WARRANTY OF ANY KIND, either
 * express or implied. See the GPL for the specific language
 * governing rights and limitations.
 *
 * You should have received a copy of the GPL along with this
 * program. If not, go to http://www.gnu.org/licenses/gpl.html
 * or write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 *
 */



#include <net/sock.h>
#include <linux/wait.h>

#include "dnbd3.h"
#include "clientconfig.h"


/*
 * Driver-private request operations: DNBD3_REQ_OP_SPECIAL marks a request
 * that carries a dnbd3 command, DNBD3_REQ_OP_CONNECT marks the image
 * selection (handshake) request.
 */
#define DNBD3_REQ_OP_SPECIAL REQ_OP_DRV_IN
#define DNBD3_REQ_OP_CONNECT REQ_OP_DRV_OUT

/*
 * The dnbd3 command is stored in cmd_flags above REQ_FLAG_BITS and read back
 * with dnbd3_priv_to_cmd(). The expansions are fully parenthesized so the
 * macros behave like ordinary expressions wherever they are used.
 */
#define dnbd3_cmd_to_priv(req, cmd)   ((req)->cmd_flags = DNBD3_REQ_OP_SPECIAL | ((cmd) << REQ_FLAG_BITS))
#define dnbd3_connect(req)            ((req)->cmd_flags = DNBD3_REQ_OP_CONNECT | ((CMD_SELECT_IMAGE) << REQ_FLAG_BITS))
#define dnbd3_priv_to_cmd(req)        ((req)->cmd_flags >> REQ_FLAG_BITS)

/* Create a kernel socket; af is a dnbd3 host type (HOST_IP4 selects AF_INET, anything else AF_INET6). */
#define dnbd3_sock_create(af, type, proto, sock) \
	sock_create_kern(&init_net, (af) == HOST_IP4 ? AF_INET : AF_INET6, (type), (proto), (sock))

/* Absolute expiry times (in jiffies) and the relative request timeout. */
#define KEEPALIVE_TIMER (jiffies + (HZ * TIMER_INTERVAL_KEEPALIVE_PACKET))
#define DISCOVERY_TIMER (jiffies + (HZ * TIMER_INTERVAL_PROBE_NORMAL))
#define REQUEST_TIMEOUT (HZ * SOCKET_TIMEOUT_CLIENT_DATA)

/*
 * Initialise the addressing, control and flag fields of a struct msghdr
 * lvalue. Only the fields listed here are written; every other member of
 * the header is left untouched. The argument is parenthesized so that any
 * lvalue expression (e.g. *ptr) can be passed.
 */
#define init_msghdr(h) do { \
		(h).msg_name = NULL; \
		(h).msg_namelen = 0; \
		(h).msg_control = NULL; \
		(h).msg_controllen = 0; \
		(h).msg_flags = MSG_WAITALL | MSG_NOSIGNAL; \
	} while (0)

/* Wait queue on which dnbd3_send_request_blocking() sleeps until its reply has been seen. */
static DECLARE_WAIT_QUEUE_HEAD(send_wq);
/*
 * Last wake-up handle published by the receive worker (see dnbd3_to_wq_signal());
 * a blocking sender resets it to 0 and waits until it equals its own handle.
 * Shared by all devices and sockets.
 */
static volatile uint64_t send_wq_signal; //TODO make atomic atomic_64_t

/* Forward declarations of the socket connect/disconnect helpers. */
static int dnbd3_socket_connect(struct dnbd3_device *dev, struct dnbd3_server *server);
static int __dnbd3_socket_connect(struct dnbd3_server * server, struct dnbd3_sock *sock);
static int dnbd3_socket_disconnect(struct dnbd3_device *dev, struct dnbd3_server *server, struct dnbd3_sock *sock);


/*
 * Log a host address and port at KERN_INFO, prefixed with msg.
 * HOST_IP4 hosts are printed as a dotted IPv4 address, every other
 * host type as a bracketed IPv6 address.
 */
static void dnbd3_print_host(struct dnbd3_host_t *host, char *msg)
{
	if (host->type != HOST_IP4) {
		printk(KERN_INFO "dnbd3: %s [%pI6]:%d\n", msg, host->addr, host->port);
		return;
	}

	printk(KERN_INFO "dnbd3: %s %pI4:%d\n", msg, host->addr, host->port);
}

static void dnbd3_print_server_list(struct dnbd3_device *dev)
{
	int i;
	dnbd3_print_host(&dev->initial_server.host, "initial server is");
	for (i = 0; i < NUMBER_SERVERS; i++) {
		if (dev->alt_servers[i].host.addr[0] != 0) {
			dnbd3_print_host(&dev->alt_servers[i].host, "alternative server is");
		}
	}
}


/*
 * Pack a wake-up handle for the send wait queue:
 * bits 63..32 hold the device minor, bits 31..16 the dnbd3 command and
 * bits 15..0 the socket number.
 */
static inline uint64_t dnbd3_to_wq_signal(int minor, uint16_t dnbd3_cmd, uint16_t sock_nr)
{
	uint64_t signal = (uint64_t) minor;

	signal <<= 32;
	signal |= (uint32_t) dnbd3_cmd << 16;
	signal |= sock_nr;
	return signal;
}


/* Combine two 32-bit values into one 64-bit handle: arg0 in the upper half, arg1 in the lower half. */
static uint64_t dnbd3_to_handle(uint32_t arg0, uint32_t arg1)
{
	uint64_t upper = arg0;

	return (upper << 32) | arg1;
}

/* Extract the upper 32 bits of a handle (the arg0 passed to dnbd3_to_handle). */
static uint32_t dnbd3_arg0_from_handle(uint64_t handle)
{
	uint64_t upper = handle >> 32;

	return (uint32_t) upper;
}

/* Extract the lower 32 bits of a handle (the arg1 passed to dnbd3_to_handle). */
static uint32_t dnbd3_arg1_from_handle(uint64_t handle)
{
	return (uint32_t)(handle & 0xFFFFFFFFULL);
}

/*
 * Build a dnbd3 request for req and send it over sock.
 *
 * The operation of req selects the wire command: reads become CMD_GET_BLOCK,
 * DNBD3_REQ_OP_SPECIAL sends the command stored in the request flags, and
 * DNBD3_REQ_OP_CONNECT sends CMD_SELECT_IMAGE followed by a serialized
 * payload (protocol version, image name, rid, is_server flag).
 *
 * The socket cookie is incremented for every request. When cmd is non-NULL
 * the cookie is recorded in cmd and the wire handle combines the blk-mq
 * unique tag with the cookie; otherwise the handle is just the cookie.
 *
 * sock->pending is set to req on entry and cleared only after the complete
 * message has been sent.
 *
 * Returns 0 on success, -EIO for an unsupported operation or an incomplete
 * send, or the negative error reported by kernel_sendmsg().
 */
int dnbd3_send_request(struct dnbd3_sock *sock, struct request *req, struct dnbd3_cmd *cmd)
{
	dnbd3_request_t dnbd3_request;
	struct msghdr msg;
	struct kvec iov[2];
	size_t iov_num = 1;
	size_t send_len;
	int result;
	uint32_t tag;
	uint64_t handle;
	serialized_buffer_t payload_buffer;

	sock->pending = req;
	init_msghdr(msg);

	/* Zero the header so fields a command does not set never carry stack contents. */
	memset(&dnbd3_request, 0, sizeof(dnbd3_request));
	dnbd3_request.magic = dnbd3_packet_magic;

	switch (req_op(req)) {
	case REQ_OP_READ:
		printk(KERN_DEBUG "dnbd3: request operation read\n");
		dnbd3_request.cmd = CMD_GET_BLOCK;
		dnbd3_request.offset = blk_rq_pos(req) << 9; // *512
		dnbd3_request.size = blk_rq_bytes(req); // bytes left to complete entire request
		break;
	case DNBD3_REQ_OP_SPECIAL:
		printk(KERN_DEBUG "dnbd3: request operation special\n");
		dnbd3_request.cmd = dnbd3_priv_to_cmd(req);
		dnbd3_request.size = 0;
		break;
	case DNBD3_REQ_OP_CONNECT:
		printk(KERN_DEBUG "dnbd3: request operation connect to %s\n", sock->device->imgname);
		dnbd3_request.cmd = CMD_SELECT_IMAGE;
		serializer_reset_write(&payload_buffer);
		serializer_put_uint16(&payload_buffer, PROTOCOL_VERSION);
		serializer_put_string(&payload_buffer, sock->device->imgname);
		serializer_put_uint16(&payload_buffer, sock->device->rid);
		serializer_put_uint8(&payload_buffer, 0); // is_server = false
		iov[1].iov_base = &payload_buffer;
		dnbd3_request.size = iov[1].iov_len = serializer_get_written_length(&payload_buffer);
		iov_num = 2;
		break;
	default:
		return -EIO;
	}

	sock->cookie++;
	if (cmd != NULL) {
		cmd->cookie = sock->cookie;
		tag = blk_mq_unique_tag(req);
		handle = dnbd3_to_handle(tag, sock->cookie);
	} else {
		handle = sock->cookie;
	}
	memcpy(&dnbd3_request.handle, &handle, sizeof(handle));

	fixup_request(dnbd3_request);
	iov[0].iov_base = &dnbd3_request;
	iov[0].iov_len = sizeof(dnbd3_request);
	send_len = iov[0].iov_len;
	if (iov_num == 2)
		send_len += iov[1].iov_len;

	result = kernel_sendmsg(sock->sock, &msg, iov, iov_num, send_len);
	if (result < 0 || (size_t) result != send_len) {
		printk(KERN_ERR "dnbd3: connection to server lost\n");
		/*
		 * A short (or zero-length) send is a failure too; never hand a
		 * non-negative value back, callers treat 0 as success.
		 */
		if (result >= 0)
			result = -EIO;
		return result;
	}

	sock->pending = NULL;
	return 0;
}


int dnbd3_send_request_blocking(struct dnbd3_sock *sock, int dnbd3_cmd)
{
	int result = 0;
	uint64_t handle;
	struct request *req = kmalloc(sizeof(struct request), GFP_KERNEL);
	printk(KERN_DEBUG "dnbd3: starting blocking request\n");
	if (!req) {
		printk(KERN_ERR "dnbd3: kmalloc failed\n");
		goto error;
	}

	switch (dnbd3_cmd) {
	case CMD_KEEPALIVE:
	case CMD_GET_SERVERS:
		dnbd3_cmd_to_priv(req, dnbd3_cmd);
		break;
	case CMD_SELECT_IMAGE:
		dnbd3_connect(req);
		break;
	default:
		printk(KERN_WARNING "dnbd3: unsupported command for blocking %d\n", dnbd3_cmd);
		result = -EINVAL;
		goto error;
	}

	mutex_lock(&sock->lock);
	result = dnbd3_send_request(sock, req, NULL);
	if (result) {
		mutex_unlock(&sock->lock);
		goto error;
	}
	send_wq_signal = 0;
	handle = dnbd3_to_wq_signal(sock->device->minor, dnbd3_cmd, sock->sock_nr);

	mutex_unlock(&sock->lock);

	printk(KERN_DEBUG "dnbd3: blocking request going to sleep for %d  wait for handle %llu\n", REQUEST_TIMEOUT, handle);
	if (wait_event_interruptible_timeout(send_wq, handle == send_wq_signal, REQUEST_TIMEOUT) <= 0) { // timeout or interrupt
		printk(KERN_WARNING "dndbd3: request timed out\n");
		result = -EIO;
		goto error;
	}
	printk(KERN_DEBUG "dnbd3: blocking request woke up with handle %llu\n", handle);


error:
	if (req) {
		kfree(req);
	}
	return result;
}

/*
 * Receive one reply header from sock into reply and validate it.
 *
 * Returns the number of bytes received (the header size) on success, the
 * non-positive result of kernel_recvmsg() if nothing could be received, or
 * -EIO if the header is incomplete, carries the wrong magic or has
 * command 0.
 */
static int dnbd3_receive_cmd(struct dnbd3_sock *sock, dnbd3_reply_t *reply)
{
	int result;
	struct msghdr msg;
	struct kvec iov;

	init_msghdr(msg);
	iov.iov_base = reply;
	iov.iov_len = sizeof(*reply);
	result = kernel_recvmsg(sock->sock, &msg, &iov, 1, iov.iov_len, msg.msg_flags);
	if (result <= 0) {
		return result;
	}
	if ((size_t) result != sizeof(*reply)) {
		printk(KERN_ERR "dnbd3: receive cmd got incomplete header\n");
		return -EIO;
	}
	/*
	 * Convert the header that was actually received. The macro takes the
	 * struct itself (as in fixup_request(dnbd3_request)), so dereference
	 * the pointer; the previous argument named a variable that does not
	 * exist in this function.
	 */
	fixup_reply(*reply);

	// check error
	if (reply->magic != dnbd3_packet_magic) {
		printk(KERN_ERR "dnbd3: receive cmd wrong magic packet\n");
		return -EIO;
	}

	if (reply->cmd == 0) {
		printk(KERN_ERR "dnbd3: receive command was 0\n");
		return -EIO;
	}
	return result;
}

/*
 * Handle a CMD_GET_BLOCK reply: look up the blk-mq request identified by the
 * reply handle, receive the block data straight into the request's segments
 * and complete the request.
 *
 * The handle carries the blk-mq unique tag in its upper half and the socket
 * cookie in its lower half; the cookie must match the one stored in the
 * request's dnbd3_cmd, otherwise the reply is rejected.
 *
 * Returns the byte count of the last received segment on success, 0 if the
 * peer delivered no data, or a negative error (-EIO for an unexpected reply,
 * a cookie mismatch or a partially received segment).
 */
static int dnbd3_receive_cmd_get_block_mq(struct dnbd3_device *dev, struct dnbd3_sock *sock, dnbd3_reply_t *reply)
{
	struct dnbd3_cmd *cmd;
	struct msghdr msg;
	struct request *req = NULL;
	struct kvec iov;
	struct req_iterator iter;
	struct bio_vec bvec_inst;
	struct bio_vec *bvec = &bvec_inst;
	sigset_t blocked, oldset;
	void *kaddr;
	uint32_t tag, cookie;
	uint16_t hwq;
	int result = 0;
	uint64_t handle;
	init_msghdr(msg);

	printk(KERN_DEBUG "dnbd3: handle is %llu\n", reply->handle);
	memcpy(&handle, &reply->handle, sizeof(handle));
	cookie = dnbd3_arg1_from_handle(handle);
	tag = dnbd3_arg0_from_handle(handle);

	/* Only trust the hardware queue index if it is within the tag set. */
	hwq = blk_mq_unique_tag_to_hwq(tag);
	if (hwq < dev->tag_set.nr_hw_queues) {
		req = blk_mq_tag_to_rq(dev->tag_set.tags[hwq], blk_mq_unique_tag_to_tag(tag));
	}
	if (!req || !blk_mq_request_started(req)) {
		dev_err(disk_to_dev(dev->disk), "unexpected reply (%d) %p\n", tag, req);
		return -EIO;
	}
	cmd = blk_mq_rq_to_pdu(req);

	mutex_lock(&cmd->lock);
	if (cmd->cookie != cookie) {
		dev_err(disk_to_dev(dev->disk), "double reply on req %p, cookie %u, handle cookie %u\n",
			req, cmd->cookie, cookie);
		mutex_unlock(&cmd->lock);
		return -EIO;
	}


	rq_for_each_segment(bvec_inst, req, iter) {
		/* Block every signal except SIGKILL while receiving this segment. */
		siginitsetinv(&blocked, sigmask(SIGKILL));
		sigprocmask(SIG_SETMASK, &blocked, &oldset);

		kaddr = kmap(bvec->bv_page) + bvec->bv_offset;
		iov.iov_base = kaddr;
		iov.iov_len = bvec->bv_len;
		result = kernel_recvmsg(sock->sock, &msg, &iov, 1, bvec->bv_len, msg.msg_flags);
		if (result != bvec->bv_len) {
			kunmap(bvec->bv_page);
			sigprocmask(SIG_SETMASK, &oldset, NULL );
			printk(KERN_ERR "dnbd3: could not receive form net to block layer\n");
			mutex_unlock(&cmd->lock);
			/*
			 * A partially received segment must not be reported as a
			 * positive byte count: the caller would treat it as success
			 * although the request was never completed.
			 */
			if (result > 0)
				result = -EIO;
			return result;
		}
		kunmap(bvec->bv_page);

		sigprocmask(SIG_SETMASK, &oldset, NULL );
	}
	mutex_unlock(&cmd->lock);
	blk_mq_end_request(req, 0);
	return result;
}

/*
 * Handle a CMD_GET_SERVERS reply.
 *
 * If the device uses server-provided alternatives, up to NUMBER_SERVERS
 * entries are received into dev->new_servers and dev->new_servers_num is
 * updated; any payload beyond that is read and discarded. If the device does
 * not use server-provided alternatives, the whole payload is discarded.
 * dev->device_lock is held for the duration and released on every path.
 *
 * Returns a positive value on success (1 when there was nothing to receive),
 * the non-positive result of kernel_recvmsg() on a receive failure, or -EIO
 * if the server list was received only partially.
 */
static int dnbd3_receive_cmd_get_servers(struct dnbd3_device *dev, struct dnbd3_sock *sock, dnbd3_reply_t *reply)
{
	struct msghdr msg;
	struct kvec iov;
	/*
	 * Scratch space for discarded payload. The caller still reads
	 * reply->cmd after this function returns, so reply itself must not be
	 * used as the dump buffer.
	 */
	char scratch[sizeof(dnbd3_reply_t)];
	/* return true if did not receive servers, not an error*/
	int result = 1;
	int count, remaining;
	init_msghdr(msg);

	printk(KERN_DEBUG "dnbd3: get servers received\n");
	mutex_lock(&dev->device_lock);
	if (!dev->use_server_provided_alts) {
		remaining = reply->size;
		goto consume_payload;
	}
	dev->new_servers_num = 0;
	count = MIN(NUMBER_SERVERS, reply->size / sizeof(dnbd3_server_entry_t));

	if (count != 0) {
		iov.iov_base = dev->new_servers;
		iov.iov_len = count * sizeof(dnbd3_server_entry_t);
		result = kernel_recvmsg(sock->sock, &msg, &iov, 1, (count * sizeof(dnbd3_server_entry_t)), msg.msg_flags);
		if (result <= 0) {
			printk(KERN_ERR "dnbd3: failed to receive get servers %d\n", result);
			goto out_unlock;
		} else if (result != (count * sizeof(dnbd3_server_entry_t))) {
			printk(KERN_ERR "dnbd3: failed to get servers\n");
			result = -EIO;
			goto out_unlock;
		}
		dev->new_servers_num = count;
	}
	// If there were more servers than accepted, remove the remaining data from the socket buffer
	remaining = reply->size - (count * sizeof(dnbd3_server_entry_t));
consume_payload:
	while (remaining > 0) {
		count = MIN(sizeof(scratch), remaining);
		iov.iov_base = scratch;
		iov.iov_len = count;
		result = kernel_recvmsg(sock->sock, &msg, &iov, 1, count, msg.msg_flags);
		if (result <= 0) {
			printk(KERN_ERR "dnbd3: failed to receive payload from get servers\n");
			goto out_unlock;
		}
		/* Account for what was read, otherwise the loop never terminates. */
		remaining -= result;
	}
out_unlock:
	mutex_unlock(&dev->device_lock);
	return result;
}
/*
 * Handle a CMD_LATEST_RID reply: read the 16-bit revision id that follows
 * the header, log it and set dev->update_available to 1 if it is newer than
 * the revision in use, 0 otherwise.
 *
 * Returns the positive byte count on success, -EIO if the announced payload
 * size is not 2, or the non-positive result of kernel_recvmsg().
 */
static int dnbd3_receive_cmd_latest_rid(struct dnbd3_device *dev, struct dnbd3_sock *sock, dnbd3_reply_t *reply)
{
	struct msghdr hdr;
	struct kvec vec;
	uint16_t latest;
	int received;

	init_msghdr(hdr);
	printk(KERN_DEBUG "dnbd3: latest rid received\n");

	if (reply->size != 2) {
		printk(KERN_ERR "dnbd3: failed to get latest rid, wrong size\n");
		return -EIO;
	}

	vec.iov_base = &latest;
	vec.iov_len = sizeof(latest);
	received = kernel_recvmsg(sock->sock, &hdr, &vec, 1, vec.iov_len, hdr.msg_flags);
	if (received <= 0) {
		printk(KERN_ERR "dnbd3: failed to receive latest rid\n");
		return received;
	}

	latest = net_order_16(latest);
	printk("Latest rid of %s is %d (currently using %d)\n", dev->imgname, (int)latest, (int)dev->rid);
	if (latest > dev->rid) {
		dev->update_available = 1;
	} else {
		dev->update_available = 0;
	}
	return received;
}

/*
 * Handle a CMD_SELECT_IMAGE reply: receive the payload, then check the
 * server's protocol version, the offered image name/rid and the reported
 * image size.
 *
 * On the first successful reply the reported size is stored in the device
 * and the disk capacity is set from it; later replies must report the same
 * size.
 *
 * Returns the positive payload byte count on success, the non-positive
 * result of kernel_recvmsg() on a receive failure, or -EIO for an oversized
 * or incomplete payload or any failed check.
 */
static int dnbd3_receive_cmd_select_image(struct dnbd3_device *dev, struct dnbd3_sock *sock, dnbd3_reply_t *reply)
{
	struct kvec iov;
	uint16_t rid;
	char *name;
	int result;
	struct msghdr msg;
	serialized_buffer_t payload_buffer;
	uint64_t reported_size;
	init_msghdr(msg);
	printk(KERN_DEBUG "dnbd3: select image received\n");

	/*
	 * reply->size comes from the server. Never receive more than the local
	 * buffer object can hold, otherwise the stack is overwritten.
	 * NOTE(review): sizeof(payload_buffer) is a coarse upper bound that is
	 * valid whatever the struct layout is; tighten it to the size of the
	 * data area if that is smaller than the whole struct.
	 */
	if (reply->size > sizeof(payload_buffer)) {
		printk(KERN_ERR "dnbd3: select image payload too large\n");
		return -EIO;
	}

	// receive reply payload
	iov.iov_base = &payload_buffer;
	iov.iov_len = reply->size;
	result = kernel_recvmsg(sock->sock, &msg, &iov, 1, iov.iov_len, msg.msg_flags);
	if (result <= 0) {
		printk(KERN_ERR "dnbd3: failed to receive select image %d\n", result);
		return result;
	} else if (result != reply->size) {
		printk(KERN_ERR "dnbd3: could not read CMD_SELECT_IMAGE payload on handshake, size is %d and should be%d\n",
				result, reply->size);
		return -EIO;
	}

	// handle/check reply payload
	serializer_reset_read(&payload_buffer, reply->size);
	sock->server->protocol_version = serializer_get_uint16(&payload_buffer);
	if (sock->server->protocol_version < MIN_SUPPORTED_SERVER) {
		printk(KERN_ERR "dnbd3: server version is lower than min supported version\n");
		return -EIO;
	}

	//TODO compare RID

	name = serializer_get_string(&payload_buffer);
	rid = serializer_get_uint16(&payload_buffer);
	/* Guard against a missing name before it reaches strcmp(). */
	if (name == NULL) {
		printk(KERN_ERR "dnbd3: server did not send an image name\n");
		return -EIO;
	}
	if (dev->rid != rid && strcmp(name, dev->imgname) != 0) {
		printk(KERN_ERR "dnbd3: server offers image '%s', requested '%s'\n", name, dev->imgname);
		return -EIO;
	}

	reported_size = serializer_get_uint64(&payload_buffer);
	if (!dev->reported_size) {
		if (reported_size < 4096) {
			printk(KERN_ERR "dnbd3: reported size by server is < 4096\n");
			return -EIO;
		}
		dev->reported_size = reported_size;
		set_capacity(dev->disk, dev->reported_size >> 9); /* 512 Byte blocks */
	} else if (dev->reported_size != reported_size) {
		printk(KERN_ERR "dnbd3: reported size by server is %llu but should be %llu\n", reported_size, dev->reported_size);
		return -EIO;
	}
	return result;

}
static void dnbd3_receive_worker(struct work_struct *work)
{
	struct dnbd3_sock *sock = container_of(work, struct dnbd3_sock, receive_worker);
	struct dnbd3_device *dev = sock->device;
	dnbd3_reply_t dnbd3_reply;
	uint64_t handle;
	int result;

	while(sock->sock && sock->server) {
		result = dnbd3_receive_cmd(sock, &dnbd3_reply);
//				kernel_recvmsg(sock->sock, &msg, &iov, 1, sizeof(dnbd3_reply), msg.msg_flags);
		if (result == -EAGAIN) {
			continue;
		} else if (result <= 0) {
			printk(KERN_ERR "dnbd3: connection to server lost %d\n", result);
			goto error;

		}

		switch (dnbd3_reply.cmd) {
		case CMD_GET_BLOCK:
			result = dnbd3_receive_cmd_get_block_mq(dev, sock, &dnbd3_reply);
			if (result <= 0) {
				printk(KERN_ERR "dnbd3: receive cmd get block mq failed %d\n", result);
				goto error;
			}
			break;
		case CMD_GET_SERVERS:
			result = dnbd3_receive_cmd_get_servers(dev, sock, &dnbd3_reply);
			if (result <= 0) {
				printk(KERN_ERR "dnbd3: receive cmd get servers failed %d\n", result);
				goto error;
			}
			break;
		case CMD_LATEST_RID:
			result = dnbd3_receive_cmd_latest_rid(dev, sock, &dnbd3_reply);
			if (result <= 0) {
				printk(KERN_ERR "dnbd3: receive cmd latest rid failed %d\n", result);
				goto error;
			}
			break;
		case CMD_KEEPALIVE:
			if (dnbd3_reply.size != 0) {
				printk(KERN_ERR "dnbd3: got keep alive packet with payload\n");
				goto error;
			}
			printk(KERN_DEBUG "dnbd3: keep alive received\n");
			break;
		case CMD_SELECT_IMAGE:
			result = dnbd3_receive_cmd_select_image(dev, sock, &dnbd3_reply);
			if (result <= 0) {
				printk(KERN_ERR "dnbd3: receive cmd select image failed %d\n", result);
				goto error;
			}
			break;
		default:
			printk(KERN_WARNING "dnbd3: Unknown command (Receive)\n");
			break;
		}
error:
		handle = dnbd3_to_wq_signal(dev->minor, dnbd3_reply.cmd, sock->sock_nr);
		printk(KERN_DEBUG "dnbd3: try to wake up queue with handle %llu\n", handle);
		send_wq_signal = handle;
		wake_up_interruptible(&send_wq);
		if (result == 0) {
			printk(KERN_INFO "dnbd3: result is 0, socket seems to be down\n");
			sock->panic = 1;
//			dnbd3_socket_disconnect(dev, NULL, sock, false);//TODO use panic or something or start worker to reconnect?
			break; //the socket seems to be down
		} else if (result < 0) {
			sock->server->failures++; // discovery takes care of to many failures
			printk(KERN_WARNING "dnbd3: receive error happened %d, total failures %d\n", result, sock->server->failures);
		}
		printk(KERN_DEBUG "dnbd3: receive completed, waiting for next receive\n");
	}
	printk(KERN_DEBUG "dnbd3: receive work queue is stopped\n");
}


static void dnbd3_timer(struct timer_list *arg)
{
	struct dnbd3_device *dev = container_of(arg, struct dnbd3_device, timer);
	int i;

	queue_work(dnbd3_wq, &dev->panic_worker);

	if (dev->timer_count % TIMER_INTERVAL_KEEPALIVE_PACKET == 0) {
		for (i = 0; i < NUMBER_CONNECTIONS; i++) {
			if (dev->socks[i].sock && dev->socks[i].server) {
				queue_work(dnbd3_wq, &dev->socks[i].keepalive_worker);
			}
		}
	}
	if (dev->timer_count % TIMER_INTERVAL_PROBE_NORMAL == 0) {
		queue_work(dnbd3_wq, &dev->discovery_worker);
	}


	dev->timer_count++;
	dev->timer.expires = jiffies + HZ;
	add_timer(&dev->timer);
}


/*
 * Work item: send a CMD_KEEPALIVE request on the socket that embeds this
 * work struct.
 */
static void dnbd3_keepalive_worker(struct work_struct *work)
{
	struct dnbd3_sock *owner;

	printk(KERN_DEBUG "dnbd3: starting keepalive worker\n");

	owner = container_of(work, struct dnbd3_sock, keepalive_worker);
	dnbd3_send_request_blocking(owner, CMD_KEEPALIVE);
}

static struct dnbd3_server *dnbd3_find_best_alt_server(struct dnbd3_device *dev) {
	int i, j;
	uint64_t rtt = 0;
	uint64_t best_rtt = RTT_UNREACHABLE;
	struct dnbd3_server *best_alt_server = NULL;
	for (i = 0; i < NUMBER_SERVERS; i++) {
		rtt = (dev->alt_servers[i].rtts[0] + dev->alt_servers[i].rtts[1]
				+ dev->alt_servers[i].rtts[2] + dev->alt_servers[i].rtts[3]) / 4;
		if (rtt < best_rtt) {
			best_alt_server = &dev->alt_servers[i];
			for (j = 0; j < NUMBER_CONNECTIONS; j++) {
				if (best_alt_server == dev->socks[j].server) {
					best_alt_server = NULL; // found already connected server
					break;
				}
			}
		}
	}
	return best_alt_server;
}

/*
 * Work item: recover from panicked or missing connections.
 *
 * Scans all socket slots. If a slot is flagged as panicked (the last such
 * slot wins), it is disconnected and the best alternative server, if it
 * differs from the one that just failed, is connected instead. If nothing
 * panicked but no slot is alive at all, a connection to the best
 * alternative server is attempted.
 */
static void dnbd3_panic_worker(struct work_struct *work)
{
	struct dnbd3_device *dev = container_of(work, struct dnbd3_device, panic_worker);
	struct dnbd3_sock *victim = NULL;
	struct dnbd3_server *replacement, *old_server;
	int alive = 0;
	int n;

	for (n = 0; n < NUMBER_CONNECTIONS; n++) {
		struct dnbd3_sock *cur = &dev->socks[n];

		if (cur->panic) {
			victim = cur;
			continue;
		}
		if (cur->sock && cur->server) {
			alive++;
		}
	}

	if (!victim) {
		/* nothing panicked: only act when every connection is gone */
		if (alive != 0) {
			return;
		}
		replacement = dnbd3_find_best_alt_server(dev);
		if (replacement != NULL) {
			printk(KERN_INFO "dnbd3: reconnect to server\n");
			dnbd3_socket_connect(dev, replacement);
		}
		return;
	}

	printk(KERN_WARNING "dnbd3: socket %d panicked, connections still alive %d\n", victim->sock_nr, alive);
	old_server = victim->server;
	dnbd3_socket_disconnect(dev, old_server, victim);

	replacement = dnbd3_find_best_alt_server(dev);
	if (replacement != NULL && replacement != old_server) {
		printk(KERN_INFO "dnbd3: found replacement server\n");
		dnbd3_socket_connect(dev, replacement);
		return;
	}

	if (alive > 0) {
		printk(KERN_INFO "dnbd3: found no replacement server but still connected to %d servers\n", alive);
	} else {
		printk(KERN_ERR "dnbd3: could not reconnect to server\n");
	}
}


/*
 * Discovery work item: refresh the list of alternative servers and probe
 * the round trip time of every known server.
 *
 * Phase 1 - server list:
 *   Sends CMD_GET_SERVERS on socks[0]. If dev->new_servers_num is non-zero
 *   afterwards, every IPv4/IPv6 entry of dev->new_servers is merged into
 *   dev->alt_servers under dev->device_lock:
 *     - an entry that already exists is left alone, unless the new entry has
 *       failures == 1, which requests its removal;
 *     - otherwise the entry is stored in a free slot (host.type == 0) or, if
 *       there is none, replaces a server with more than 20 failures;
 *     - if neither is available the entry is dropped.
 *
 * Phase 2 - rtt measurement:
 *   For every alt server with a host set, a temporary socket is connected,
 *   the image is selected, and the time to fetch the first RTT_BLOCK_SIZE
 *   bytes is measured. A failure for one server aborts only that server's
 *   probe.
 *
 * NOTE(review): the measured rtt is only logged here; nothing in this
 * function stores it in server->rtts[], which dnbd3_find_best_alt_server()
 * reads. Confirm whether that is done elsewhere or still missing.
 */
static void dnbd3_discovery_worker(struct work_struct *work)
{
	struct dnbd3_device *dev = container_of(work, struct dnbd3_device, discovery_worker);
	struct dnbd3_sock *sock = NULL;
	int i, j;
	struct dnbd3_server *existing_server, *free_server, *failed_server;
	dnbd3_server_entry_t *new_server;
	struct kvec iov;
	struct timeval start, end;
	dnbd3_request_t dnbd3_request;
	dnbd3_reply_t dnbd3_reply;
	struct msghdr msg;
	char *buf;
	struct request *req = NULL;
	uint64_t rtt;

	printk(KERN_DEBUG "dnbd3: starting discovery worker\n");

	dnbd3_send_request_blocking(&dev->socks[0], CMD_GET_SERVERS);

	printk(KERN_DEBUG "dnbd3: new server num is %d\n", dev->new_servers_num);
	if (dev->new_servers_num) {
		mutex_lock(&dev->device_lock);

		for (i = 0; i < dev->new_servers_num; i++) {
			new_server = &dev->new_servers[i];
			if (new_server->host.type == HOST_IP4 || new_server->host.type == HOST_IP6) {
				existing_server = NULL;
				free_server = NULL;
				failed_server = NULL;

				// find servers in alt servers
				for (j = 0; j < NUMBER_SERVERS; j++) {
					if ((new_server->host.type == dev->alt_servers[j].host.type)
					   && (new_server->host.port == dev->alt_servers[j].host.port)
					   && (0 == memcmp(new_server->host.addr, dev->alt_servers[j].host.addr,
							   (new_server->host.type == HOST_IP4 ? 4 : 16)))) 	{

						existing_server = &dev->alt_servers[j];
					} else if (dev->alt_servers[j].host.type == 0) {
						free_server = &dev->alt_servers[j];
					} else if (dev->alt_servers[j].failures > 20) {
						failed_server = &dev->alt_servers[j];
					}
				}

				if (existing_server) {
					if (new_server->failures == 1) { // remove is requested
						dnbd3_print_host(&existing_server->host, "remove server");
						dnbd3_socket_disconnect(dev, existing_server, NULL); // TODO what to do when only one connection?
						existing_server->host.type = 0;
					}
					continue;
				} else if (free_server) {
					free_server->host = new_server->host;
				} else if (failed_server) {
					// recycle the slot of a server that failed too often
					failed_server->host = new_server->host;
					free_server = failed_server;
				} else {
					//no server found to replace
					continue;
				}
				dnbd3_print_host(&free_server->host, "got new alt server");
				free_server->failures = 0;
				free_server->protocol_version = 0;
				free_server->rtts[0] = free_server->rtts[1] = free_server->rtts[2] = free_server->rtts[3] = RTT_UNREACHABLE;
			}
		}
		dev->new_servers_num = 0;
		mutex_unlock(&dev->device_lock);
	}

	// scratch buffer that receives the test block
	buf = kmalloc(RTT_BLOCK_SIZE, GFP_KERNEL);
	if (!buf) {
		printk(KERN_ERR "dnbd3: kmalloc failed\n");
		goto error;
	}
	// zeroed so that helpers never see uninitialised fields
	req = kzalloc(sizeof(*req), GFP_KERNEL);
	if (!req) {
		printk(KERN_ERR "dnbd3: kmalloc failed\n");
		goto error;
	}
	sock = kzalloc(sizeof(*sock), GFP_KERNEL);
	if (!sock) {
		printk(KERN_ERR "dnbd3: kmalloc failed\n");
		goto error;
	}
	mutex_init(&sock->lock);
	mutex_lock(&sock->lock);
	// measure rtt for all alt servers
	for (i = 0; i < NUMBER_SERVERS; i++) {
		existing_server = &dev->alt_servers[i];
		if (existing_server->host.type) {
			sock->sock = NULL;
			sock->device = dev;
			sock->server = existing_server;
			if (__dnbd3_socket_connect(existing_server, sock)) {
				printk(KERN_ERR "dnbd3: socket connect failed in rtt measurement\n");
				goto rtt_error;
			}
			dnbd3_connect(req);
			if (dnbd3_send_request(sock, req, NULL)) {
				printk(KERN_ERR "dnbd3: request select image failed in rtt measurement\n");
				goto rtt_error;
			}

			if (dnbd3_receive_cmd(sock, &dnbd3_reply) <= 0) {
				printk(KERN_ERR "dnbd3: receive select image failed in rtt measurement\n");
				goto rtt_error;
			}
			if (dnbd3_reply.magic != dnbd3_packet_magic || dnbd3_reply.cmd != CMD_SELECT_IMAGE || dnbd3_reply.size < 4) {
				printk(KERN_ERR "dnbd3: receive select image wrong header in rtt measurement\n");
				goto rtt_error;
			}

			if (dnbd3_receive_cmd_select_image(dev, sock, &dnbd3_reply) <= 0) {
				printk(KERN_ERR "dnbd3: receive data select image failed in rtt measurement\n");
				goto rtt_error;
			}

			// Request block
			// The request lives on the stack: clear it completely so no
			// uninitialised bytes (e.g. the handle) go over the wire, and
			// set the protocol magic that the peer checks for.
			memset(&dnbd3_request, 0, sizeof(dnbd3_request));
			dnbd3_request.magic = dnbd3_packet_magic;
			dnbd3_request.cmd = CMD_GET_BLOCK;
			// Do *NOT* pick a random block as it has proven to cause severe
			// cache thrashing on the server
			dnbd3_request.offset = 0;
			dnbd3_request.size = RTT_BLOCK_SIZE;
			fixup_request(dnbd3_request);
			iov.iov_base = &dnbd3_request;
			iov.iov_len = sizeof(dnbd3_request);

			init_msghdr(msg);
			// start rtt measurement
			do_gettimeofday(&start);

			if (kernel_sendmsg(sock->sock, &msg, &iov, 1, sizeof(dnbd3_request)) <= 0) {
				printk(KERN_ERR "dnbd3: request test block failed in rtt measurement\n");
				goto rtt_error;
			}
			// receive net reply
			iov.iov_base = &dnbd3_reply;
			iov.iov_len = sizeof(dnbd3_reply);
			if ((j = kernel_recvmsg(sock->sock, &msg, &iov, 1, sizeof(dnbd3_reply), msg.msg_flags)) != sizeof(dnbd3_reply)) {
				// sizeof yields size_t, which needs %zu
				printk(KERN_ERR "dnbd3: receive header test block failed in rtt measurement %d %zu\n", j, sizeof(dnbd3_reply));
				goto rtt_error;
			}
			fixup_reply(dnbd3_reply);
			if (dnbd3_reply.magic != dnbd3_packet_magic|| dnbd3_reply.cmd != CMD_GET_BLOCK || dnbd3_reply.size != RTT_BLOCK_SIZE) {
				printk(KERN_ERR "dnbd3: receive header cmd test block failed in rtt measurement\n");
				goto rtt_error;
			}

			// receive data
			iov.iov_base = buf;
			iov.iov_len = RTT_BLOCK_SIZE;
			if (kernel_recvmsg(sock->sock, &msg, &iov, 1, dnbd3_reply.size, msg.msg_flags) != RTT_BLOCK_SIZE) {
				printk(KERN_ERR "dnbd3: receive test block failed in rtt measurement\n");
				goto rtt_error;
			}

			do_gettimeofday(&end); // end rtt measurement

			rtt = (uint64_t)((end.tv_sec - start.tv_sec) * 1000000ull + (end.tv_usec - start.tv_usec));

			printk(KERN_DEBUG "dnbd3: new rrt for %pI4 is %llu\n", existing_server->host.addr, rtt);

rtt_error:
			// tear down the temporary connection, successful or not
			if (sock->sock) {
				kernel_sock_shutdown(sock->sock, SHUT_RDWR);
				sock->server = NULL;
				sock_release(sock->sock);
				sock->sock = NULL;
			}
		}
	}
	mutex_unlock(&sock->lock);
	mutex_destroy(&sock->lock);
error:
	// kfree(NULL) is a no-op, so no guards are needed
	kfree(buf);
	kfree(req);
	kfree(sock);
}

static int __dnbd3_socket_connect(struct dnbd3_server *server, struct dnbd3_sock *sock)
{
	int result = 0;
	struct timeval timeout;

	if (server->host.port == 0 || server->host.type == 0) {
		printk(KERN_ERR "dnbd3: host or port not set\n");
		return -EIO;
	}
	if (sock->sock) {
		printk(KERN_WARNING "dnbd3: socket already connected\n");
		return -EIO;
	}

	timeout.tv_sec = SOCKET_TIMEOUT_CLIENT_DATA;
	timeout.tv_usec = 0;

	if ((result = dnbd3_sock_create(server->host.type, SOCK_STREAM, IPPROTO_TCP, &sock->sock)) < 0) {
		printk(KERN_ERR "dnbd3: could not create socket\n");
		goto error;
	}

	kernel_setsockopt(sock->sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&timeout, sizeof(timeout));
	kernel_setsockopt(sock->sock, SOL_SOCKET, SO_RCVTIMEO, (char *)&timeout, sizeof(timeout));
	sock->sock->sk->sk_allocation = GFP_NOIO;
	if (server->host.type == HOST_IP4) {
		struct sockaddr_in sin;
		memset(&sin, 0, sizeof(sin));
		sin.sin_family = AF_INET;
		memcpy(&(sin.sin_addr), server->host.addr, 4);
		sin.sin_port = server->host.port;
		if ((result = kernel_connect(sock->sock, (struct sockaddr *)&sin, sizeof(sin), 0)) != 0) {
			printk(KERN_ERR "dnbd3: connection to host failed (ipv4)\n");
			goto error;
		}
	} else {
		struct sockaddr_in6 sin;
		memset(&sin, 0, sizeof(sin));
		sin.sin6_family = AF_INET6;
		memcpy(&(sin.sin6_addr), server->host.addr, 16);
		sin.sin6_port = server->host.port;
		if ((result = kernel_connect(sock->sock, (struct sockaddr *)&sin, sizeof(sin), 0)) != 0){
			printk(KERN_ERR "dnbd3: connection to host failed (ipv6)\n");
			goto error;
		}
	}

	return 0;
error:
	if (sock->sock) {
		sock_release(sock->sock);
		sock->sock = NULL;
	}
	return result;
}

/*
 * Connect the first free socket slot of @dev to @server.
 *
 * After the TCP connection is up the receive worker is started and the
 * image is selected with a blocking CMD_SELECT_IMAGE request. On success
 * the number of blk-mq hardware queues is updated to the number of live
 * connections.
 *
 * Returns 0 on success, -EIO if no slot is free, otherwise the error of the
 * step that failed. On failure the slot is fully reset (socket released,
 * workers stopped, server and panic flag cleared) so it can be reused.
 */
static int dnbd3_socket_connect(struct dnbd3_device *dev, struct dnbd3_server *server)
{
	int i;
	int sock_alive = 0;
	int result = -EIO;
	struct dnbd3_sock *sock = NULL;
	for (i = 0; i < NUMBER_CONNECTIONS; i++) {
		if (!dev->socks[i].sock) {
			sock = &dev->socks[i];
			break;
		}
	}
	if (sock == NULL) {
		printk(KERN_WARNING "dnbd3: could not connect to socket, to many connections\n");
		return -EIO;
	}
	sock->server = server;

	printk(KERN_DEBUG "dnbd3: socket connect device %i\n", dev->minor);

	// Initialise both work items before the socket exists: as soon as
	// sock->sock is set the timer may queue the keepalive worker, and the
	// error path below must be able to cancel either of them.
	INIT_WORK(&sock->receive_worker, dnbd3_receive_worker);
	INIT_WORK(&sock->keepalive_worker, dnbd3_keepalive_worker);

	mutex_init(&sock->lock);
	mutex_lock(&sock->lock);
	result = __dnbd3_socket_connect(server, sock);
	mutex_unlock(&sock->lock);
	if (result || !sock->sock) {
		printk(KERN_DEBUG "dnbd3: socket is not connected\n");
		if (!result) {
			result = -EIO;
		}
		goto error;
	}

	// start the receiver
	queue_work(dnbd3_wq, &sock->receive_worker);

	result = dnbd3_send_request_blocking(sock, CMD_SELECT_IMAGE);
	if (result) {
		printk(KERN_ERR "dnbd3: connection to image %s failed\n", dev->imgname);
		goto error;
	}

	printk(KERN_DEBUG "dnbd3: connected to image %s, filesize %llu\n", dev->imgname, dev->reported_size);

	for (i = 0; i < NUMBER_CONNECTIONS; i++) {
		if (dev->socks[i].sock  && dev->socks[i].server) {
			sock_alive++;
		}
	}
	blk_mq_update_nr_hw_queues(&dev->tag_set, sock_alive);
	return 0;
error:
	if (sock->sock) {
		// Same sequence as in dnbd3_socket_disconnect: shut the socket
		// down so a receiver blocked in recvmsg returns, wait for the
		// workers, and only then release the socket they were using.
		kernel_sock_shutdown(sock->sock, SHUT_RDWR);
		cancel_work_sync(&sock->receive_worker);
		sock->server = NULL; // the timer stops queueing keepalives for this slot
		cancel_work_sync(&sock->keepalive_worker);
		sock_release(sock->sock);
		sock->sock = NULL;
	}
	// a stale server pointer would make this server look "already
	// connected" to dnbd3_find_best_alt_server
	sock->server = NULL;
	// the receiver flags a panic when it sees the shutdown; the slot is
	// already cleaned up, so do not leave that for the panic worker
	sock->panic = 0;
	return result;
}


/*
 * Disconnect one socket of @dev.
 *
 * @dev:    device the socket belongs to
 * @server: used only when @sock is NULL; the first slot whose server
 *          pointer equals @server is disconnected
 * @sock:   socket slot to disconnect, or NULL to look it up via @server
 *
 * Returns 0 on success, or -EIO if no matching slot was found or the slot
 * holds no socket.
 *
 * NOTE(review): when the last socket goes away this cancels the discovery
 * worker synchronously, yet the discovery worker itself calls this function
 * in its "remove server" path - confirm that this cannot wait on itself.
 */
static int dnbd3_socket_disconnect(struct dnbd3_device *dev, struct dnbd3_server *server, struct dnbd3_sock *sock)
{
	int i;
	int sock_alive = 0;
	// one pass: resolve the slot by server (if needed) and count live slots
	for (i = 0; i < NUMBER_CONNECTIONS; i++) {
		if (sock == NULL && dev->socks[i].server == server) {
			sock = &dev->socks[i];
		}
		if (dev->socks[i].sock && dev->socks[i].server) {
			sock_alive++;
		}
	}
	if (!sock || !sock->sock) {
		printk(KERN_WARNING "dnbd3: could not find socket to disconnect\n");
		return -EIO;
	}
	// the slot being disconnected no longer serves a hardware queue
	blk_mq_update_nr_hw_queues(&dev->tag_set, sock_alive - 1);
	if (sock_alive <= 1) {
		// no connection remains afterwards: stop the periodic timer and
		// the discovery worker
		printk(KERN_INFO "dnbd3: shutting down last socket and stopping discovery\n");
		del_timer_sync(&dev->timer);
		dev->timer_count = 0;
		cancel_work_sync(&dev->discovery_worker);
//		cancel_work_sync(&dev->panic_worker);  // do not wait for panic_worker, probably we are called from panic_worker

	}
	cancel_work_sync(&sock->keepalive_worker);

	printk(KERN_DEBUG "dnbd3: socket disconnect device %i\n", dev->minor);
	mutex_lock(&sock->lock);

	/*
	 * Important sequence to shut down socket
	 * 1. kernel_sock_shutdown
	 *      socket shutdown, receiver which hangs in kernel_recvmsg returns 0
	 * 2. cancel_work_sync(receiver)
	 *      wait for the receiver to finish, so the socket is not used anymore
	 * 3. sock_release
	 *      release the socket and set to NULL
	 */
	if (sock->sock) {
		kernel_sock_shutdown(sock->sock, SHUT_RDWR);
		sock->server = NULL;
	}
	mutex_unlock(&sock->lock);
	mutex_destroy(&sock->lock);

	printk(KERN_DEBUG "dnbd3: cancel receiver work device %i\n", dev->minor);
	cancel_work_sync(&sock->receive_worker);

	if (sock->sock) {
		sock_release(sock->sock);
		sock->sock = NULL;
	}
	// the slot is clean again; clear any panic flag set for it
	sock->panic = 0;
	return 0;
}

int dnbd3_net_disconnect(struct dnbd3_device *dev)
{
	int i;
	int result = 0;

	for (i = 0; i < NUMBER_CONNECTIONS; i++) {
		if (dev->socks[i].sock) {
			if (dnbd3_socket_disconnect(dev, NULL, &dev->socks[i])) {
				result = -EIO;
			}
		}
	}
	return result;
}


/*
 * Bring up the initial connection of @dev to alt_servers[0].
 *
 * On success the server list is printed, the discovery and panic work items
 * are initialised and the periodic timer is armed to fire after HZ jiffies;
 * returns 0. On failure imgname and the server of socks[0] are cleared and
 * -ENOENT is returned.
 */
int dnbd3_net_connect(struct dnbd3_device *dev)
{
	// TODO decide which socket to connect
	if (dnbd3_socket_connect(dev, &dev->alt_servers[0]) != 0) {
		printk(KERN_ERR "dnbd3: failed to connect to initial server\n");
		dev->imgname = NULL;
		dev->socks[0].server = NULL;
		return -ENOENT;
	}

	dnbd3_print_server_list(dev);

	INIT_WORK(&dev->discovery_worker, dnbd3_discovery_worker);
	INIT_WORK(&dev->panic_worker, dnbd3_panic_worker);

	timer_setup(&dev->timer, dnbd3_timer, 0);
	dev->timer.expires = jiffies + HZ;
	add_timer(&dev->timer);

	return 0;
}