/* -*- mode: c; c-basic-offset: 8; -*-
 * vim: noexpandtab sw=8 ts=8 sts=0:
 *
 * dlmmaster.c
 *
 * standalone DLM module
 *
 * Copyright (C) 2004 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 *
 */


#include <linux/module.h>
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/utsname.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/random.h>
#include <linux/blkdev.h>
#include <linux/socket.h>
#include <linux/inet.h>
#include <linux/spinlock.h>
#include <linux/delay.h>


#include "cluster/heartbeat.h"
#include "cluster/nodemanager.h"
#include "cluster/tcp.h"

#include "dlmapi.h"
#include "dlmcommon.h"
#include "dlmdebug.h"
#include "dlmdomain.h"

#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER)
#include "cluster/masklog.h"

enum dlm_mle_type {
	DLM_MLE_BLOCK,
	DLM_MLE_MASTER,
	DLM_MLE_MIGRATION
};

struct dlm_lock_name
{
	u8 len;
	u8 name[DLM_LOCKID_NAME_MAX];
};

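/* a master list entry (mle) tracks one in-flight mastery or migration
 * of a lock resource.  maybe_map records nodes that might own the lock,
 * vote_map the nodes expected to respond, response_map the nodes that
 * have responded, and node_map the nodes currently up. */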
struct dlm_master_list_entry
{
	struct list_head list;
	struct list_head hb_events;
	struct dlm_ctxt *dlm;
	spinlock_t spinlock;
	wait_queue_head_t wq;
	atomic_t woken;
	struct kref mle_refs;
	unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
	unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
	unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
	unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
	u8 master;
	u8 new_master;
	enum dlm_mle_type type;
	struct o2hb_callback_func mle_hb_up;
	struct o2hb_callback_func mle_hb_down;
	union {
		struct dlm_lock_resource *res;
		struct dlm_lock_name name;
	} u;
};

static void dlm_mle_node_down(struct dlm_ctxt *dlm,
			      struct dlm_master_list_entry *mle,
			      struct o2nm_node *node,
			      int idx);
static void dlm_mle_node_up(struct dlm_ctxt *dlm,
			    struct dlm_master_list_entry *mle,
			    struct o2nm_node *node,
			    int idx);

static void dlm_assert_master_worker(struct dlm_work_item *item, void *data);
static int dlm_do_assert_master(struct dlm_ctxt *dlm, const char *lockname,
				unsigned int namelen, void *nodemap,
				u32 flags);

static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
				struct dlm_master_list_entry *mle,
				const char *name,
				unsigned int namelen)
{
	struct dlm_lock_resource *res;

	if (dlm != mle->dlm)
		return 0;

	if (mle->type == DLM_MLE_BLOCK ||
	    mle->type == DLM_MLE_MIGRATION) {
		if (namelen != mle->u.name.len ||
		    memcmp(name, mle->u.name.name, namelen) != 0)
			return 0;
	} else {
		res = mle->u.res;
		if (namelen != res->lockname.len ||
		    memcmp(res->lockname.name, name, namelen) != 0)
			return 0;
	}
	return 1;
}

#if 0
/* Code here is included but defined out as it aids debugging */

void dlm_print_one_mle(struct dlm_master_list_entry *mle)
{
	int i = 0, refs;
	char *type;
	char attached;
	u8 master;
	unsigned int namelen;
	const char *name;
	struct kref *k;

	k = &mle->mle_refs;
	if (mle->type == DLM_MLE_BLOCK)
		type = "BLK";
	else if (mle->type == DLM_MLE_MASTER)
		type = "MAS";
	else
		type = "MIG";
	refs = atomic_read(&k->refcount);
	master = mle->master;
	attached = (list_empty(&mle->hb_events) ? 'N' : 'Y');

	if (mle->type != DLM_MLE_MASTER) {
		namelen = mle->u.name.len;
		name = mle->u.name.name;
	} else {
		namelen = mle->u.res->lockname.len;
		name = mle->u.res->lockname.name;
	}

	mlog(ML_NOTICE, "  #%3d: %3s  %3d  %3u   %3u %c    (%d)%.*s\n",
		  i, type, refs, master, mle->new_master, attached,
		  namelen, namelen, name);
}

static void dlm_dump_mles(struct dlm_ctxt *dlm)
{
	struct dlm_master_list_entry *mle;
	struct list_head *iter;
	
	mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name);
	mlog(ML_NOTICE, "  ####: type refs owner new events? lockname nodemap votemap respmap maybemap\n");
	spin_lock(&dlm->master_lock);
	list_for_each(iter, &dlm->master_list) {
		mle = list_entry(iter, struct dlm_master_list_entry, list);
		dlm_print_one_mle(mle);
	}
	spin_unlock(&dlm->master_lock);
}

int dlm_dump_all_mles(const char __user *data, unsigned int len)
{
	struct list_head *iter;
	struct dlm_ctxt *dlm;

	spin_lock(&dlm_domain_lock);
	list_for_each(iter, &dlm_domains) {
		dlm = list_entry (iter, struct dlm_ctxt, list);
		mlog(ML_NOTICE, "found dlm: %p, name=%s\n", dlm, dlm->name);
		dlm_dump_mles(dlm);
	}
	spin_unlock(&dlm_domain_lock);
	return len;
}
EXPORT_SYMBOL_GPL(dlm_dump_all_mles);

#endif  /*  0  */


static kmem_cache_t *dlm_mle_cache = NULL;


static void dlm_mle_release(struct kref *kref);
static void dlm_init_mle(struct dlm_master_list_entry *mle,
			enum dlm_mle_type type,
			struct dlm_ctxt *dlm,
			struct dlm_lock_resource *res,
			const char *name,
			unsigned int namelen);
static void dlm_put_mle(struct dlm_master_list_entry *mle);
static void __dlm_put_mle(struct dlm_master_list_entry *mle);
static int dlm_find_mle(struct dlm_ctxt *dlm,
			struct dlm_master_list_entry **mle,
			char *name, unsigned int namelen);

static int dlm_do_master_request(struct dlm_master_list_entry *mle, int to);


static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
				     struct dlm_lock_resource *res,
				     struct dlm_master_list_entry *mle,
				     int *blocked);
static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
				    struct dlm_lock_resource *res,
				    struct dlm_master_list_entry *mle,
				    int blocked);
static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
				 struct dlm_lock_resource *res,
				 struct dlm_master_list_entry *mle,
				 struct dlm_master_list_entry **oldmle,
				 const char *name, unsigned int namelen,
				 u8 new_master, u8 master);

static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
				    struct dlm_lock_resource *res);
static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
				      struct dlm_lock_resource *res);
static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,
				       struct dlm_lock_resource *res,
				       u8 target);
static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
				       struct dlm_lock_resource *res);


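/* classify an -errno from the o2net layer: returns 1 for errors that
 * mean the remote node is unreachable and should be treated as dead,
 * 0 otherwise. */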
int dlm_is_host_down(int errno)
{
	switch (errno) {
		case -EBADF:
		case -ECONNREFUSED:
		case -ENOTCONN:
		case -ECONNRESET:
		case -EPIPE:
		case -EHOSTDOWN:
		case -EHOSTUNREACH:
		case -ETIMEDOUT:
		case -ECONNABORTED:
		case -ENETDOWN:
		case -ENETUNREACH:
		case -ENETRESET:
		case -ESHUTDOWN:
		case -ENOPROTOOPT:
		case -EINVAL:   /* if returned from our tcp code,
				   this means there is no socket */
			return 1;
	}
	return 0;
}


/*
 * MASTER LIST FUNCTIONS
 */


/*
 * regarding master list entries and heartbeat callbacks:
 *
 * in order to avoid sleeping and allocation that occurs in
 * heartbeat, master list entries are simply attached to the
 * dlm's established heartbeat callbacks.  the mle is attached
 * when it is created, and since the dlm->spinlock is held at
 * that time, any heartbeat event will be properly discovered
 * by the mle.  the mle needs to be detached from the
 * dlm->mle_hb_events list as soon as heartbeat events are no
 * longer useful to the mle, and before the mle is freed.
 *
 * as a general rule, heartbeat events are no longer needed by
 * the mle once an "answer" regarding the lock master has been
 * received.
 */
static inline void __dlm_mle_attach_hb_events(struct dlm_ctxt *dlm,
					      struct dlm_master_list_entry *mle)
{
	assert_spin_locked(&dlm->spinlock);

	list_add_tail(&mle->hb_events, &dlm->mle_hb_events);
}


static inline void __dlm_mle_detach_hb_events(struct dlm_ctxt *dlm,
					      struct dlm_master_list_entry *mle)
{
	if (!list_empty(&mle->hb_events))
		list_del_init(&mle->hb_events);
}


static inline void dlm_mle_detach_hb_events(struct dlm_ctxt *dlm,
					    struct dlm_master_list_entry *mle)
{
	spin_lock(&dlm->spinlock);
	__dlm_mle_detach_hb_events(dlm, mle);
	spin_unlock(&dlm->spinlock);
}

/* remove from list and free */
static void __dlm_put_mle(struct dlm_master_list_entry *mle)
{
	struct dlm_ctxt *dlm;
	dlm = mle->dlm;

	assert_spin_locked(&dlm->spinlock);
	assert_spin_locked(&dlm->master_lock);
	BUG_ON(!atomic_read(&mle->mle_refs.refcount));

	kref_put(&mle->mle_refs, dlm_mle_release);
}


/* must not have any spinlocks coming in */
static void dlm_put_mle(struct dlm_master_list_entry *mle)
{
	struct dlm_ctxt *dlm;
	dlm = mle->dlm;

	spin_lock(&dlm->spinlock);
	spin_lock(&dlm->master_lock);
	__dlm_put_mle(mle);
	spin_unlock(&dlm->master_lock);
	spin_unlock(&dlm->spinlock);
}

static inline void dlm_get_mle(struct dlm_master_list_entry *mle)
{
	kref_get(&mle->mle_refs);
}

static void dlm_init_mle(struct dlm_master_list_entry *mle,
			enum dlm_mle_type type,
			struct dlm_ctxt *dlm,
			struct dlm_lock_resource *res,
			const char *name,
			unsigned int namelen)
{
	assert_spin_locked(&dlm->spinlock);

	mle->dlm = dlm;
	mle->type = type;
	INIT_LIST_HEAD(&mle->list);
	INIT_LIST_HEAD(&mle->hb_events);
	memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
	spin_lock_init(&mle->spinlock);
	init_waitqueue_head(&mle->wq);
	atomic_set(&mle->woken, 0);
	kref_init(&mle->mle_refs);
	memset(mle->response_map, 0, sizeof(mle->response_map));
	mle->master = O2NM_MAX_NODES;
	mle->new_master = O2NM_MAX_NODES;

	if (mle->type == DLM_MLE_MASTER) {
		BUG_ON(!res);
		mle->u.res = res;
	} else if (mle->type == DLM_MLE_BLOCK) {
		BUG_ON(!name);
		memcpy(mle->u.name.name, name, namelen);
		mle->u.name.len = namelen;
	} else /* DLM_MLE_MIGRATION */ {
		BUG_ON(!name);
		memcpy(mle->u.name.name, name, namelen);
		mle->u.name.len = namelen;
	}

	/* copy off the node_map and register hb callbacks on our copy */
	memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map));
	memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map));
	clear_bit(dlm->node_num, mle->vote_map);
	clear_bit(dlm->node_num, mle->node_map);

	/* attach the mle to the domain node up/down events */
	__dlm_mle_attach_hb_events(dlm, mle);
}


/* returns 1 if found, 0 if not */
static int dlm_find_mle(struct dlm_ctxt *dlm,
			struct dlm_master_list_entry **mle,
			char *name, unsigned int namelen)
{
	struct dlm_master_list_entry *tmpmle;
	struct list_head *iter;

	assert_spin_locked(&dlm->master_lock);

	list_for_each(iter, &dlm->master_list) {
		tmpmle = list_entry(iter, struct dlm_master_list_entry, list);
		if (!dlm_mle_equal(dlm, tmpmle, name, namelen))
			continue;
		dlm_get_mle(tmpmle);
		*mle = tmpmle;
		return 1;
	}
	return 0;
}

void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up)
{
	struct dlm_master_list_entry *mle;
	struct list_head *iter;

	assert_spin_locked(&dlm->spinlock);
	
	list_for_each(iter, &dlm->mle_hb_events) {
		mle = list_entry(iter, struct dlm_master_list_entry, 
				 hb_events);
		if (node_up)
			dlm_mle_node_up(dlm, mle, NULL, idx);
		else
			dlm_mle_node_down(dlm, mle, NULL, idx);
	}
}

static void dlm_mle_node_down(struct dlm_ctxt *dlm,
			      struct dlm_master_list_entry *mle,
			      struct o2nm_node *node, int idx)
{
	spin_lock(&mle->spinlock);

	if (!test_bit(idx, mle->node_map))
		mlog(0, "node %u already removed from nodemap!\n", idx);
	else
		clear_bit(idx, mle->node_map);

	spin_unlock(&mle->spinlock);
}

static void dlm_mle_node_up(struct dlm_ctxt *dlm,
			    struct dlm_master_list_entry *mle,
			    struct o2nm_node *node, int idx)
{
	spin_lock(&mle->spinlock);

	if (test_bit(idx, mle->node_map))
		mlog(0, "node %u already in node map!\n", idx);
	else
		set_bit(idx, mle->node_map);

	spin_unlock(&mle->spinlock);
}


int dlm_init_mle_cache(void)
{
	dlm_mle_cache = kmem_cache_create("dlm_mle_cache",
					  sizeof(struct dlm_master_list_entry),
					  0, SLAB_HWCACHE_ALIGN,
					  NULL, NULL);
	if (dlm_mle_cache == NULL)
		return -ENOMEM;
	return 0;
}

void dlm_destroy_mle_cache(void)
{
	if (dlm_mle_cache)
		kmem_cache_destroy(dlm_mle_cache);
}

static void dlm_mle_release(struct kref *kref)
{
	struct dlm_master_list_entry *mle;
	struct dlm_ctxt *dlm;

	mlog_entry_void();

	mle = container_of(kref, struct dlm_master_list_entry, mle_refs);
	dlm = mle->dlm;

	if (mle->type != DLM_MLE_MASTER) {
		mlog(0, "calling mle_release for %.*s, type %d\n",
		     mle->u.name.len, mle->u.name.name, mle->type);
	} else {
		mlog(0, "calling mle_release for %.*s, type %d\n",
		     mle->u.res->lockname.len,
		     mle->u.res->lockname.name, mle->type);
	}
	assert_spin_locked(&dlm->spinlock);
	assert_spin_locked(&dlm->master_lock);

	/* remove from list if not already */
	if (!list_empty(&mle->list))
		list_del_init(&mle->list);

	/* detach the mle from the domain node up/down events */
	__dlm_mle_detach_hb_events(dlm, mle);

	/* NOTE: kfree under spinlock here.
	 * if this is bad, we can move this to a freelist. */
	kmem_cache_free(dlm_mle_cache, mle);
}


/*
 * LOCK RESOURCE FUNCTIONS
 */

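/* set the lockres owner and keep the per-domain local/unknown/remote
 * resource counters in sync.  caller must hold res->spinlock. */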
static void dlm_set_lockres_owner(struct dlm_ctxt *dlm,
				  struct dlm_lock_resource *res,
				  u8 owner)
{
	assert_spin_locked(&res->spinlock);

	mlog_entry("%.*s, %u\n", res->lockname.len, res->lockname.name, owner);

	if (owner == dlm->node_num)
		atomic_inc(&dlm->local_resources);
	else if (owner == DLM_LOCK_RES_OWNER_UNKNOWN)
		atomic_inc(&dlm->unknown_resources);
	else
		atomic_inc(&dlm->remote_resources);

	res->owner = owner;
}

void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
			      struct dlm_lock_resource *res, u8 owner)
{
	assert_spin_locked(&res->spinlock);

	if (owner == res->owner)
		return;

	if (res->owner == dlm->node_num)
		atomic_dec(&dlm->local_resources);
	else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN)
		atomic_dec(&dlm->unknown_resources);
	else
		atomic_dec(&dlm->remote_resources);

	dlm_set_lockres_owner(dlm, res, owner);
}


static void dlm_lockres_release(struct kref *kref)
{
	struct dlm_lock_resource *res;

	res = container_of(kref, struct dlm_lock_resource, refs);

	/* This should not happen -- all lockres' have a name
	 * associated with them at init time. */
	BUG_ON(!res->lockname.name);

	mlog(0, "destroying lockres %.*s\n", res->lockname.len,
	     res->lockname.name);

	/* By the time we're ready to blow this guy away, we shouldn't
	 * be on any lists. */
	BUG_ON(!hlist_unhashed(&res->hash_node));
	BUG_ON(!list_empty(&res->granted));
	BUG_ON(!list_empty(&res->converting));
	BUG_ON(!list_empty(&res->blocked));
	BUG_ON(!list_empty(&res->dirty));
	BUG_ON(!list_empty(&res->recovering));
	BUG_ON(!list_empty(&res->purge));

	kfree(res->lockname.name);

	kfree(res);
}

void dlm_lockres_get(struct dlm_lock_resource *res)
{
	kref_get(&res->refs);
}

void dlm_lockres_put(struct dlm_lock_resource *res)
{
	kref_put(&res->refs, dlm_lockres_release);
}

static void dlm_init_lockres(struct dlm_ctxt *dlm,
			     struct dlm_lock_resource *res,
			     const char *name, unsigned int namelen)
{
	char *qname;

	/* If we memset here, we lose our reference to the kmalloc'd
	 * res->lockname.name, so be sure to init every field
	 * correctly! */

	qname = (char *) res->lockname.name;
	memcpy(qname, name, namelen);

	res->lockname.len = namelen;
	res->lockname.hash = full_name_hash(name, namelen);

	init_waitqueue_head(&res->wq);
	spin_lock_init(&res->spinlock);
	INIT_HLIST_NODE(&res->hash_node);
	INIT_LIST_HEAD(&res->granted);
	INIT_LIST_HEAD(&res->converting);
	INIT_LIST_HEAD(&res->blocked);
	INIT_LIST_HEAD(&res->dirty);
	INIT_LIST_HEAD(&res->recovering);
	INIT_LIST_HEAD(&res->purge);
	atomic_set(&res->asts_reserved, 0);
	res->migration_pending = 0;

	kref_init(&res->refs);

	/* just for consistency */
	spin_lock(&res->spinlock);
	dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN);
	spin_unlock(&res->spinlock);

	res->state = DLM_LOCK_RES_IN_PROGRESS;

	res->last_used = 0;

	memset(res->lvb, 0, DLM_LVB_LEN);
}

struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
				   const char *name,
				   unsigned int namelen)
{
	struct dlm_lock_resource *res;

	res = kmalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL);
	if (!res)
		return NULL;

	res->lockname.name = kmalloc(namelen, GFP_KERNEL);
	if (!res->lockname.name) {
		kfree(res);
		return NULL;
	}

	dlm_init_lockres(dlm, res, name, namelen);
	return res;
}

/*
 * lookup a lock resource by name.
 * may already exist in the hashtable.
 * lockid is null terminated
 *
 * if not, allocate enough for the lockres and for
 * the temporary structure used in doing the mastering.
 *
 * also, do a lookup in the dlm->master_list to see
 * if another node has begun mastering the same lock.
 * if so, there should be a block entry in there
 * for this name, and we should *not* attempt to master
 * the lock here.   need to wait around for that node
 * to assert_master (or die).
 *
 */
struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
					  const char *lockid,
					  int flags)
{
	struct dlm_lock_resource *tmpres=NULL, *res=NULL;
	struct dlm_master_list_entry *mle = NULL;
	struct dlm_master_list_entry *alloc_mle = NULL;
	int blocked = 0;
	int ret, nodenum;
	struct dlm_node_iter iter;
	unsigned int namelen;
	int tries = 0;
	int bit, wait_on_recovery = 0;

	BUG_ON(!lockid);

	namelen = strlen(lockid);

	mlog(0, "get lockres %s (len %d)\n", lockid, namelen);

lookup:
	spin_lock(&dlm->spinlock);
	tmpres = __dlm_lookup_lockres(dlm, lockid, namelen);
	if (tmpres) {
		spin_unlock(&dlm->spinlock);
		mlog(0, "found in hash!\n");
		if (res)
			dlm_lockres_put(res);
		res = tmpres;
		goto leave;
	}

	if (!res) {
		spin_unlock(&dlm->spinlock);
		mlog(0, "allocating a new resource\n");
		/* nothing found and we need to allocate one. */
		alloc_mle = (struct dlm_master_list_entry *)
			kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL);
		if (!alloc_mle)
			goto leave;
		res = dlm_new_lockres(dlm, lockid, namelen);
		if (!res)
			goto leave;
		goto lookup;
	}

	mlog(0, "no lockres found, allocated our own: %p\n", res);

	if (flags & LKM_LOCAL) {
		/* caller knows it's safe to assume it's not mastered elsewhere
		 * DONE!  return right away */
		spin_lock(&res->spinlock);
		dlm_change_lockres_owner(dlm, res, dlm->node_num);
		__dlm_insert_lockres(dlm, res);
		spin_unlock(&res->spinlock);
		spin_unlock(&dlm->spinlock);
		/* lockres still marked IN_PROGRESS */
		goto wake_waiters;
	}

	/* check master list to see if another node has started mastering it */
	spin_lock(&dlm->master_lock);

	/* if we found a block, wait for lock to be mastered by another node */
	blocked = dlm_find_mle(dlm, &mle, (char *)lockid, namelen);
	if (blocked) {
		if (mle->type == DLM_MLE_MASTER) {
			mlog(ML_ERROR, "master entry for nonexistent lock!\n");
			BUG();
		} else if (mle->type == DLM_MLE_MIGRATION) {
			/* migration is in progress! */
			/* the good news is that we now know the
			 * "current" master (mle->master). */

			spin_unlock(&dlm->master_lock);
			assert_spin_locked(&dlm->spinlock);

			/* set the lockres owner and hash it */
			spin_lock(&res->spinlock);
			dlm_set_lockres_owner(dlm, res, mle->master);
			__dlm_insert_lockres(dlm, res);
			spin_unlock(&res->spinlock);
			spin_unlock(&dlm->spinlock);

			/* master is known, detach */
			dlm_mle_detach_hb_events(dlm, mle);
			dlm_put_mle(mle);
			mle = NULL;
			goto wake_waiters;
		}
	} else {
		/* go ahead and try to master lock on this node */
		mle = alloc_mle;
		/* make sure this does not get freed below */
		alloc_mle = NULL;
		dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0);
		set_bit(dlm->node_num, mle->maybe_map);
		list_add(&mle->list, &dlm->master_list);

		/* still holding the dlm spinlock, check the recovery map
		 * to see if there are any nodes that still need to be 
		 * considered.  these will not appear in the mle nodemap
		 * but they might own this lockres.  wait on them. */
		bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
		if (bit < O2NM_MAX_NODES) {
			mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to"
			     "recover before lock mastery can begin\n",
			     dlm->name, namelen, (char *)lockid, bit);
			wait_on_recovery = 1;
		}
	}

	/* at this point there is either a DLM_MLE_BLOCK or a
	 * DLM_MLE_MASTER on the master list, so it's safe to add the
	 * lockres to the hashtable.  anyone who finds the lock will
	 * still have to wait on the IN_PROGRESS. */

	/* finally add the lockres to its hash bucket */
	__dlm_insert_lockres(dlm, res);
	/* get an extra ref on the mle in case this is a BLOCK
	 * if so, the creator of the BLOCK may try to put the last
	 * ref at this time in the assert master handler, so we
	 * need an extra one to keep from a bad ptr deref. */
	dlm_get_mle(mle);
	spin_unlock(&dlm->master_lock);
	spin_unlock(&dlm->spinlock);

	while (wait_on_recovery) {
		/* any cluster changes that occurred after dropping the
		 * dlm spinlock would be detectable by a change on the mle,
		 * so we only need to clear out the recovery map once. */
		if (dlm_is_recovery_lock(lockid, namelen)) {
			mlog(ML_NOTICE, "%s: recovery map is not empty, but "
			     "must master $RECOVERY lock now\n", dlm->name);
			if (!dlm_pre_master_reco_lockres(dlm, res))
				wait_on_recovery = 0;
			else {
				mlog(0, "%s: waiting 500ms for heartbeat state "
				    "change\n", dlm->name);
				msleep(500);
			}
			continue;
		} 

		dlm_kick_recovery_thread(dlm);
		msleep(100);
		dlm_wait_for_recovery(dlm);

		spin_lock(&dlm->spinlock);
		bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
		if (bit < O2NM_MAX_NODES) {
			mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to"
			     "recover before lock mastery can begin\n",
			     dlm->name, namelen, (char *)lockid, bit);
			wait_on_recovery = 1;
		} else
			wait_on_recovery = 0;
		spin_unlock(&dlm->spinlock);
	}

	/* must wait for lock to be mastered elsewhere */
	if (blocked)
		goto wait;

redo_request:
	ret = -EINVAL;
	dlm_node_iter_init(mle->vote_map, &iter);
	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
		ret = dlm_do_master_request(mle, nodenum);
		if (ret < 0)
			mlog_errno(ret);
		if (mle->master != O2NM_MAX_NODES) {
			/* found a master ! */
			if (mle->master <= nodenum)
				break;
			/* if our master request has not reached the master
			 * yet, keep going until it does.  this is how the
			 * master will know that asserts are needed back to
			 * the lower nodes. */
			mlog(0, "%s:%.*s: requests only up to %u but master "
			     "is %u, keep going\n", dlm->name, namelen,
			     lockid, nodenum, mle->master);
		}
	}

wait:
	/* keep going until the response map includes all nodes */
	ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked);
	if (ret < 0) {
		mlog(0, "%s:%.*s: node map changed, redo the "
		     "master request now, blocked=%d\n",
		     dlm->name, res->lockname.len,
		     res->lockname.name, blocked);
		if (++tries > 20) {
			mlog(ML_ERROR, "%s:%.*s: spinning on "
			     "dlm_wait_for_lock_mastery, blocked=%d\n", 
			     dlm->name, res->lockname.len, 
			     res->lockname.name, blocked);
			dlm_print_one_lock_resource(res);
			/* dlm_print_one_mle(mle); */
			tries = 0;
		}
		goto redo_request;
	}

	mlog(0, "lockres mastered by %u\n", res->owner);
	/* make sure we never continue without this */
	BUG_ON(res->owner == O2NM_MAX_NODES);

	/* master is known, detach if not already detached */
	dlm_mle_detach_hb_events(dlm, mle);
	dlm_put_mle(mle);
	/* put the extra ref */
	dlm_put_mle(mle);

wake_waiters:
	spin_lock(&res->spinlock);
	res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
	spin_unlock(&res->spinlock);
	wake_up(&res->wq);

leave:
	/* need to free the unused mle */
	if (alloc_mle)
		kmem_cache_free(dlm_mle_cache, alloc_mle);

	return res;
}


#define DLM_MASTERY_TIMEOUT_MS   5000

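/* poll until mastery is resolved: either another node asserts master,
 * or all nodes in the vote_map have responded and this node, as the
 * lowest-numbered candidate in maybe_map, asserts master itself.
 * sleeps up to DLM_MASTERY_TIMEOUT_MS between rechecks and restarts
 * mastery whenever the node map changes. */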
static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
				     struct dlm_lock_resource *res,
				     struct dlm_master_list_entry *mle,
				     int *blocked)
{
	u8 m;
	int ret, bit;
	int map_changed, voting_done;
	int assert, sleep;

recheck:
	ret = 0;
	assert = 0;

	/* check if another node has already become the owner */
	spin_lock(&res->spinlock);
	if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
		mlog(0, "%s:%.*s: owner is suddenly %u\n", dlm->name,
		     res->lockname.len, res->lockname.name, res->owner);
		spin_unlock(&res->spinlock);
		/* this will cause the master to re-assert across
		 * the whole cluster, freeing up mles */
		ret = dlm_do_master_request(mle, res->owner);
		if (ret < 0) {
			/* give recovery a chance to run */
			mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret);
			msleep(500);
			goto recheck;
		}
		ret = 0;
		goto leave;
	}
	spin_unlock(&res->spinlock);

	spin_lock(&mle->spinlock);
	m = mle->master;
	map_changed = (memcmp(mle->vote_map, mle->node_map,
			      sizeof(mle->vote_map)) != 0);
	voting_done = (memcmp(mle->vote_map, mle->response_map,
			     sizeof(mle->vote_map)) == 0);

	/* restart if we hit any errors */
	if (map_changed) {
		int b;
		mlog(0, "%s: %.*s: node map changed, restarting\n",
		     dlm->name, res->lockname.len, res->lockname.name);
		ret = dlm_restart_lock_mastery(dlm, res, mle, *blocked);
		b = (mle->type == DLM_MLE_BLOCK);
		if ((*blocked && !b) || (!*blocked && b)) {
			mlog(0, "%s:%.*s: status change: old=%d new=%d\n", 
			     dlm->name, res->lockname.len, res->lockname.name,
			     *blocked, b);
			*blocked = b;
		}
		spin_unlock(&mle->spinlock);
		if (ret < 0) {
			mlog_errno(ret);
			goto leave;
		}
		mlog(0, "%s:%.*s: restart lock mastery succeeded, "
		     "rechecking now\n", dlm->name, res->lockname.len,
		     res->lockname.name);
		goto recheck;
	}

	if (m != O2NM_MAX_NODES) {
		/* another node has done an assert!
		 * all done! */
		sleep = 0;
	} else {
		sleep = 1;
		/* have all nodes responded? */
		if (voting_done && !*blocked) {
			bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
			if (dlm->node_num <= bit) {
				/* my node number is lowest.
			 	 * now tell other nodes that I am
				 * mastering this. */
				mle->master = dlm->node_num;
				assert = 1;
				sleep = 0;
			}
			/* if voting is done, but we have not received
			 * an assert master yet, we must sleep */
		}
	}

	spin_unlock(&mle->spinlock);

	/* sleep if we haven't finished voting yet */
	if (sleep) {
		unsigned long timeo = msecs_to_jiffies(DLM_MASTERY_TIMEOUT_MS);

		/*
		if (atomic_read(&mle->mle_refs.refcount) < 2)
			mlog(ML_ERROR, "mle (%p) refs=%d, name=%.*s\n", mle,
			atomic_read(&mle->mle_refs.refcount),
			res->lockname.len, res->lockname.name);
		*/
		atomic_set(&mle->woken, 0);
		(void)wait_event_timeout(mle->wq,
					 (atomic_read(&mle->woken) == 1),
					 timeo);
		if (res->owner == O2NM_MAX_NODES) {
			mlog(0, "waiting again\n");
			goto recheck;
		}
		mlog(0, "done waiting, master is %u\n", res->owner);
		ret = 0;
		goto leave;
	}

	ret = 0;   /* done */
	if (assert) {
		m = dlm->node_num;
		mlog(0, "about to master %.*s here, this=%u\n",
		     res->lockname.len, res->lockname.name, m);
		ret = dlm_do_assert_master(dlm, res->lockname.name,
					   res->lockname.len, mle->vote_map, 0);
		if (ret) {
			/* This is a failure in the network path,
			 * not in the response to the assert_master
			 * (any nonzero response is a BUG on this node).
			 * Most likely a socket just got disconnected
			 * due to node death. */
			mlog_errno(ret);
		}
		/* no longer need to restart lock mastery.
		 * all living nodes have been contacted. */
		ret = 0;
	}

	/* set the lockres owner */
	spin_lock(&res->spinlock);
	dlm_change_lockres_owner(dlm, res, m);
	spin_unlock(&res->spinlock);

leave:
	return ret;
}

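/* iterator over the symmetric difference of two node bitmaps; each bit
 * returned is a node that either came up or went down between the two
 * snapshots. */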
struct dlm_bitmap_diff_iter
{
	int curnode;
	unsigned long *orig_bm;
	unsigned long *cur_bm;
	unsigned long diff_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
};

enum dlm_node_state_change
{
	NODE_DOWN = -1,
	NODE_NO_CHANGE = 0,
	NODE_UP
};

static void dlm_bitmap_diff_iter_init(struct dlm_bitmap_diff_iter *iter,
				      unsigned long *orig_bm,
				      unsigned long *cur_bm)
{
	unsigned long p1, p2;
	int i;

	iter->curnode = -1;
	iter->orig_bm = orig_bm;
	iter->cur_bm = cur_bm;

	for (i = 0; i < BITS_TO_LONGS(O2NM_MAX_NODES); i++) {
		p1 = *(iter->orig_bm + i);
		p2 = *(iter->cur_bm + i);
		iter->diff_bm[i] = (p1 & ~p2) | (p2 & ~p1);
	}
}

static int dlm_bitmap_diff_iter_next(struct dlm_bitmap_diff_iter *iter,
				     enum dlm_node_state_change *state)
{
	int bit;

	if (iter->curnode >= O2NM_MAX_NODES)
		return -ENOENT;

	bit = find_next_bit(iter->diff_bm, O2NM_MAX_NODES,
			    iter->curnode+1);
	if (bit >= O2NM_MAX_NODES) {
		iter->curnode = O2NM_MAX_NODES;
		return -ENOENT;
	}

	/* if it was there in the original then this node died */
	if (test_bit(bit, iter->orig_bm))
		*state = NODE_DOWN;
	else
		*state = NODE_UP;

	iter->curnode = bit;
	return bit;
}


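/* a node came up or went down in the middle of mastery.  walk the diff
 * between the original vote_map and the current node_map, fix up the
 * vote/response/maybe maps, and return -EAGAIN if the caller must redo
 * its master requests (0 if it can simply keep waiting). */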
static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
				    struct dlm_lock_resource *res,
				    struct dlm_master_list_entry *mle,
				    int blocked)
{
	struct dlm_bitmap_diff_iter bdi;
	enum dlm_node_state_change sc;
	int node;
	int ret = 0;

	mlog(0, "something happened such that the "
	     "master process may need to be restarted!\n");

	assert_spin_locked(&mle->spinlock);

	dlm_bitmap_diff_iter_init(&bdi, mle->vote_map, mle->node_map);
	node = dlm_bitmap_diff_iter_next(&bdi, &sc);
	while (node >= 0) {
		if (sc == NODE_UP) {
			/* a node came up.  clear any old vote from
			 * the response map and set it in the vote map
			 * then restart the mastery. */
			mlog(ML_NOTICE, "node %d up while restarting\n", node);

			/* redo the master request, but only for the new node */
			mlog(0, "sending request to new node\n");
			clear_bit(node, mle->response_map);
			set_bit(node, mle->vote_map);
		} else {
			mlog(ML_ERROR, "node down! %d\n", node);

			/* if the node wasn't involved in mastery skip it,
			 * but clear it out from the maps so that it will
			 * not affect mastery of this lockres */
			clear_bit(node, mle->response_map);
			clear_bit(node, mle->vote_map);
			if (!test_bit(node, mle->maybe_map))
				goto next;

			/* if we're already blocked on lock mastery, and the
			 * dead node wasn't the expected master, or there is
			 * another node in the maybe_map, keep waiting */
			if (blocked) {
				int lowest = find_next_bit(mle->maybe_map,
						       O2NM_MAX_NODES, 0);

				/* act like it was never there */
				clear_bit(node, mle->maybe_map);

				if (node != lowest)
					goto next;

				mlog(ML_ERROR, "expected master %u died while "
				     "this node was blocked waiting on it!\n",
				     node);
				lowest = find_next_bit(mle->maybe_map,
						       O2NM_MAX_NODES,
						       lowest+1);
				if (lowest < O2NM_MAX_NODES) {
					mlog(0, "still blocked. waiting "
					     "on %u now\n", lowest);
					goto next;
				}

				/* mle is an MLE_BLOCK, but there is now
				 * nothing left to block on.  we need to return
				 * all the way back out and try again with
				 * an MLE_MASTER. dlm_do_local_recovery_cleanup
				 * has already run, so the mle refcount is ok */
				mlog(0, "no longer blocking. we can "
				     "try to master this here\n");
				mle->type = DLM_MLE_MASTER;
				memset(mle->maybe_map, 0,
				       sizeof(mle->maybe_map));
				memset(mle->response_map, 0,
				       sizeof(mle->response_map));
				memcpy(mle->vote_map, mle->node_map,
				       sizeof(mle->node_map));
				mle->u.res = res;
				set_bit(dlm->node_num, mle->maybe_map);

				ret = -EAGAIN;
				goto next;
			}

			clear_bit(node, mle->maybe_map);
			if (node > dlm->node_num)
				goto next;

			mlog(0, "dead node in map!\n");
			/* yuck. go back and re-contact all nodes
			 * in the vote_map, removing this node. */
			memset(mle->response_map, 0,
			       sizeof(mle->response_map));
		}
		ret = -EAGAIN;
next:
		node = dlm_bitmap_diff_iter_next(&bdi, &sc);
	}
	return ret;
}


/*
 * DLM_MASTER_REQUEST_MSG
 *
 * returns: 0 on success,
 *          -errno on a network error
 *
 * on error, the caller should assume the target node is "dead"
 *
 */

static int dlm_do_master_request(struct dlm_master_list_entry *mle, int to)
{
	struct dlm_ctxt *dlm = mle->dlm;
	struct dlm_master_request request;
	int ret, response=0, resend;

	memset(&request, 0, sizeof(request));
	request.node_idx = dlm->node_num;

	BUG_ON(mle->type == DLM_MLE_MIGRATION);

	if (mle->type != DLM_MLE_MASTER) {
		request.namelen = mle->u.name.len;
		memcpy(request.name, mle->u.name.name, request.namelen);
	} else {
		request.namelen = mle->u.res->lockname.len;
		memcpy(request.name, mle->u.res->lockname.name,
			request.namelen);
	}

again:
	ret = o2net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request,
				 sizeof(request), to, &response);
	if (ret < 0)  {
		if (ret == -ESRCH) {
			/* should never happen */
			mlog(ML_ERROR, "TCP stack not ready!\n");
			BUG();
		} else if (ret == -EINVAL) {
			mlog(ML_ERROR, "bad args passed to o2net!\n");
			BUG();
		} else if (ret == -ENOMEM) {
			mlog(ML_ERROR, "out of memory while trying to send "
			     "network message!  retrying\n");
			/* this is totally crude */
			msleep(50);
			goto again;
		} else if (!dlm_is_host_down(ret)) {
			/* not a network error. bad. */
			mlog_errno(ret);
			mlog(ML_ERROR, "unhandled error!");
			BUG();
		}
		/* all other errors should be network errors,
		 * and likely indicate node death */
		mlog(ML_ERROR, "link to %d went down!\n", to);
		goto out;
	}

	ret = 0;
	resend = 0;
	spin_lock(&mle->spinlock);
	switch (response) {
		case DLM_MASTER_RESP_YES:
			set_bit(to, mle->response_map);
			mlog(0, "node %u is the master, response=YES\n", to);
			mle->master = to;
			break;
		case DLM_MASTER_RESP_NO:
			mlog(0, "node %u not master, response=NO\n", to);
			set_bit(to, mle->response_map);
			break;
		case DLM_MASTER_RESP_MAYBE:
			mlog(0, "node %u not master, response=MAYBE\n", to);
			set_bit(to, mle->response_map);
			set_bit(to, mle->maybe_map);
			break;
		case DLM_MASTER_RESP_ERROR:
			mlog(0, "node %u hit an error, resending\n", to);
			resend = 1;
			response = 0;
			break;
		default:
			mlog(ML_ERROR, "bad response! %u\n", response);
			BUG();
	}
	spin_unlock(&mle->spinlock);
	if (resend) {
		/* this is also totally crude */
		msleep(50);
		goto again;
	}

out:
	return ret;
}

/*
 * locks that can be taken here:
 * dlm->spinlock
 * res->spinlock
 * mle->spinlock
 * dlm->master_list
 *
 * if possible, TRIM THIS DOWN!!!
 */
int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data)
{
	u8 response = DLM_MASTER_RESP_MAYBE;
	struct dlm_ctxt *dlm = data;
	struct dlm_lock_resource *res = NULL;
	struct dlm_master_request *request = (struct dlm_master_request *) msg->buf;
	struct dlm_master_list_entry *mle = NULL, *tmpmle = NULL;
	char *name;
	unsigned int namelen;
	int found, ret;
	int set_maybe;
	int dispatch_assert = 0;

	if (!dlm_grab(dlm))
		return DLM_MASTER_RESP_NO;

	if (!dlm_domain_fully_joined(dlm)) {
		response = DLM_MASTER_RESP_NO;
		goto send_response;
	}

	name = request->name;
	namelen = request->namelen;

	if (namelen > DLM_LOCKID_NAME_MAX) {
		response = DLM_IVBUFLEN;
		goto send_response;
	}

way_up_top:
	spin_lock(&dlm->spinlock);
	res = __dlm_lookup_lockres(dlm, name, namelen);
	if (res) {
		spin_unlock(&dlm->spinlock);

		/* take care of the easy cases up front */
		spin_lock(&res->spinlock);
		if (res->state & DLM_LOCK_RES_RECOVERING) {
			spin_unlock(&res->spinlock);
			mlog(0, "returning DLM_MASTER_RESP_ERROR since res is "
			     "being recovered\n");
			response = DLM_MASTER_RESP_ERROR;
			if (mle)
				kmem_cache_free(dlm_mle_cache, mle);
			goto send_response;
		}

		if (res->owner == dlm->node_num) {
			spin_unlock(&res->spinlock);
			// mlog(0, "this node is the master\n");
			response = DLM_MASTER_RESP_YES;
			if (mle)
				kmem_cache_free(dlm_mle_cache, mle);

			/* this node is the owner.
			 * there is some extra work that needs to
			 * happen now.  the requesting node has
			 * caused all nodes up to this one to
			 * create mles.  this node now needs to
			 * go back and clean those up. */
			dispatch_assert = 1;
			goto send_response;
		} else if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
			spin_unlock(&res->spinlock);
			// mlog(0, "node %u is the master\n", res->owner);
			response = DLM_MASTER_RESP_NO;
			if (mle)
				kmem_cache_free(dlm_mle_cache, mle);
			goto send_response;
		}

		/* ok, there is no owner.  either this node is
		 * being blocked, or it is actively trying to
		 * master this lock. */
		if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) {
			mlog(ML_ERROR, "lock with no owner should be "
			     "in-progress!\n");
			BUG();
		}

		// mlog(0, "lockres is in progress...\n");
		spin_lock(&dlm->master_lock);
		found = dlm_find_mle(dlm, &tmpmle, name, namelen);
		if (!found) {
			mlog(ML_ERROR, "no mle found for this lock!\n");
			BUG();
		}
		set_maybe = 1;
		spin_lock(&tmpmle->spinlock);
		if (tmpmle->type == DLM_MLE_BLOCK) {
			// mlog(0, "this node is waiting for "
			// "lockres to be mastered\n");
			response = DLM_MASTER_RESP_NO;
		} else if (tmpmle->type == DLM_MLE_MIGRATION) {
			mlog(0, "node %u is master, but trying to migrate to "
			     "node %u.\n", tmpmle->master, tmpmle->new_master);
			if (tmpmle->master == dlm->node_num) {
				response = DLM_MASTER_RESP_YES;
				mlog(ML_ERROR, "no owner on lockres, but this "
				     "node is trying to migrate it to %u?!\n",
				     tmpmle->new_master);
				BUG();
			} else {
				/* the real master can respond on its own */
				response = DLM_MASTER_RESP_NO;
			}
		} else if (tmpmle->master != DLM_LOCK_RES_OWNER_UNKNOWN) {
			set_maybe = 0;
			if (tmpmle->master == dlm->node_num) {
				response = DLM_MASTER_RESP_YES;
				/* this node will be the owner.
				 * go back and clean the mles on any
				 * other nodes */
				dispatch_assert = 1;
			} else
				response = DLM_MASTER_RESP_NO;
		} else {
			// mlog(0, "this node is attempting to "
			// "master lockres\n");
			response = DLM_MASTER_RESP_MAYBE;
		}
		if (set_maybe)
			set_bit(request->node_idx, tmpmle->maybe_map);
		spin_unlock(&tmpmle->spinlock);

		spin_unlock(&dlm->master_lock);
		spin_unlock(&res->spinlock);

		/* keep the mle attached to heartbeat events */
		dlm_put_mle(tmpmle);
		if (mle)
			kmem_cache_free(dlm_mle_cache, mle);
		goto send_response;
	}

	/*
	 * lockres doesn't exist on this node
	 * if there is an MLE_BLOCK, return NO
	 * if there is an MLE_MASTER, return MAYBE
	 * otherwise, add an MLE_BLOCK, return NO
	 */
	spin_lock(&dlm->master_lock);
	found = dlm_find_mle(dlm, &tmpmle, name, namelen);
	if (!found) {
		/* this lockid has never been seen on this node yet */
		// mlog(0, "no mle found\n");
		if (!mle) {
			spin_unlock(&dlm->master_lock);
			spin_unlock(&dlm->spinlock);

			mle = (struct dlm_master_list_entry *)
				kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL);
			if (!mle) {
				response = DLM_MASTER_RESP_ERROR;
				mlog_errno(-ENOMEM);
				goto send_response;
			}
			spin_lock(&dlm->spinlock);
			dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL,
					 name, namelen);
			spin_unlock(&dlm->spinlock);
			goto way_up_top;
		}

		// mlog(0, "this is second time thru, already allocated, "
		// "add the block.\n");
		set_bit(request->node_idx, mle->maybe_map);
		list_add(&mle->list, &dlm->master_list);
		response = DLM_MASTER_RESP_NO;
	} else {
		// mlog(0, "mle was found\n");
		set_maybe = 1;
		spin_lock(&tmpmle->spinlock);
		if (tmpmle->master == dlm->node_num) {
			mlog(ML_ERROR, "no lockres, but an mle with this node as master!\n");
			BUG();
		}
		if (tmpmle->type == DLM_MLE_BLOCK)
			response = DLM_MASTER_RESP_NO;
		else if (tmpmle->type == DLM_MLE_MIGRATION) {
			mlog(0, "migration mle was found (%u->%u)\n",
			     tmpmle->master, tmpmle->new_master);
			/* real master can respond on its own */
			response = DLM_MASTER_RESP_NO;
		} else
			response = DLM_MASTER_RESP_MAYBE;
		if (set_maybe)
			set_bit(request->node_idx, tmpmle->maybe_map);
		spin_unlock(&tmpmle->spinlock);
	}
	spin_unlock(&dlm->master_lock);
	spin_unlock(&dlm->spinlock);

	if (found) {
		/* keep the mle attached to heartbeat events */
		dlm_put_mle(tmpmle);
	}
send_response:

	if (dispatch_assert) {
		if (response != DLM_MASTER_RESP_YES)
			mlog(ML_ERROR, "invalid response %d\n", response);
		if (!res) {
			mlog(ML_ERROR, "bad lockres while trying to assert!\n");
			BUG();
		}
		mlog(0, "%u is the owner of %.*s, cleaning everyone else\n",
			     dlm->node_num, res->lockname.len, res->lockname.name);
		ret = dlm_dispatch_assert_master(dlm, res, 0, request->node_idx, 
						 DLM_ASSERT_MASTER_MLE_CLEANUP);
		if (ret < 0) {
			mlog(ML_ERROR, "failed to dispatch assert master work\n");
			response = DLM_MASTER_RESP_ERROR;
		}
	}

	dlm_put(dlm);
	return response;
}

/*
 * DLM_ASSERT_MASTER_MSG
 */


/*
 * NOTE: this can be used for debugging
 * can periodically run all locks owned by this node
 * and re-assert across the cluster...
 */
static int dlm_do_assert_master(struct dlm_ctxt *dlm, const char *lockname,
				unsigned int namelen, void *nodemap,
				u32 flags)
{
	struct dlm_assert_master assert;
	int to, tmpret;
	struct dlm_node_iter iter;
	int ret = 0;
	int reassert;

	BUG_ON(namelen > O2NM_MAX_NAME_LEN);
again:
	reassert = 0;

	/* note that if this nodemap is empty, it returns 0 */
	dlm_node_iter_init(nodemap, &iter);
	while ((to = dlm_node_iter_next(&iter)) >= 0) {
		int r = 0;
		mlog(0, "sending assert master to %d (%.*s)\n", to,
		     namelen, lockname);
		memset(&assert, 0, sizeof(assert));
		assert.node_idx = dlm->node_num;
		assert.namelen = namelen;
		memcpy(assert.name, lockname, namelen);
		assert.flags = cpu_to_be32(flags);

		tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key,
					    &assert, sizeof(assert), to, &r);
		if (tmpret < 0) {
			mlog(ML_ERROR, "assert_master returned %d!\n", tmpret);
			if (!dlm_is_host_down(tmpret)) {
				mlog(ML_ERROR, "unhandled error!\n");
				BUG();
			}
			/* a node died.  finish out the rest of the nodes. */
			mlog(ML_ERROR, "link to %d went down!\n", to);
			/* any nonzero status return will do */
			ret = tmpret;
		} else if (r < 0) {
			/* ok, something horribly messed.  kill thyself. */
			mlog(ML_ERROR,"during assert master of %.*s to %u, "
			     "got %d.\n", namelen, lockname, to, r);
			dlm_dump_lock_resources(dlm);
			BUG();
		} else if (r == EAGAIN) {
			mlog(0, "%.*s: node %u create mles on other "
			     "nodes and requests a re-assert\n", 
			     namelen, lockname, to);
			reassert = 1;
		}
	}

	if (reassert)
		goto again;

	return ret;
}

/*
 * locks that can be taken here:
 * dlm->spinlock
 * res->spinlock
 * mle->spinlock
 * dlm->master_list
 *
 * if possible, TRIM THIS DOWN!!!
 */
int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data)
{
	struct dlm_ctxt *dlm = data;
	struct dlm_master_list_entry *mle = NULL;
	struct dlm_assert_master *assert = (struct dlm_assert_master *)msg->buf;
	struct dlm_lock_resource *res = NULL;
	char *name;
	unsigned int namelen;
	u32 flags;
	int master_request = 0;
	int ret = 0;

	if (!dlm_grab(dlm))
		return 0;

	name = assert->name;
	namelen = assert->namelen;
	flags = be32_to_cpu(assert->flags);

	if (namelen > DLM_LOCKID_NAME_MAX) {
		mlog(ML_ERROR, "Invalid name length!");
		goto done;
	}

	spin_lock(&dlm->spinlock);

	if (flags)
		mlog(0, "assert_master with flags: %u\n", flags);

	/* find the MLE */
	spin_lock(&dlm->master_lock);
	if (!dlm_find_mle(dlm, &mle, name, namelen)) {
		/* not an error, could be master just re-asserting */
		mlog(0, "just got an assert_master from %u, but no "
		     "MLE for it! (%.*s)\n", assert->node_idx,
		     namelen, name);
	} else {
		int bit = find_next_bit (mle->maybe_map, O2NM_MAX_NODES, 0);
		if (bit >= O2NM_MAX_NODES) {
			/* not necessarily an error, though less likely.
			 * could be master just re-asserting. */
			mlog(ML_ERROR, "no bits set in the maybe_map, but %u "
			     "is asserting! (%.*s)\n", assert->node_idx,
			     namelen, name);
		} else if (bit != assert->node_idx) {
			if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) {
				mlog(0, "master %u was found, %u should "
				     "back off\n", assert->node_idx, bit);
			} else {
				/* with the fix for bug 569, a higher node
				 * number winning the mastery will respond
				 * YES to mastery requests, but this node
				 * had no way of knowing.  let it pass. */
				mlog(ML_ERROR, "%u is the lowest node, "
				     "%u is asserting. (%.*s)  %u must "
				     "have begun after %u won.\n", bit,
				     assert->node_idx, namelen, name, bit,
				     assert->node_idx);
			}
		}
	}
	spin_unlock(&dlm->master_lock);

	/* ok everything checks out with the MLE
	 * now check to see if there is a lockres */
	res = __dlm_lookup_lockres(dlm, name, namelen);
	if (res) {
		spin_lock(&res->spinlock);
		if (res->state & DLM_LOCK_RES_RECOVERING)  {
			mlog(ML_ERROR, "%u asserting but %.*s is "
			     "RECOVERING!\n", assert->node_idx, namelen, name);
			goto kill;
		}
		if (!mle) {
			if (res->owner != assert->node_idx) {
				mlog(ML_ERROR, "assert_master from "
					  "%u, but current owner is "
					  "%u! (%.*s)\n",
				       assert->node_idx, res->owner,
				       namelen, name);
				goto kill;
			}
		} else if (mle->type != DLM_MLE_MIGRATION) {
			if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
				/* owner is just re-asserting */
				if (res->owner == assert->node_idx) {
					mlog(0, "owner %u re-asserting on "
					     "lock %.*s\n", assert->node_idx,
					     namelen, name);
					goto ok;
				}
				mlog(ML_ERROR, "got assert_master from "
				     "node %u, but %u is the owner! "
				     "(%.*s)\n", assert->node_idx,
				     res->owner, namelen, name);
				goto kill;
			}
			if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) {
				mlog(ML_ERROR, "got assert from %u, but lock "
				     "with no owner should be "
				     "in-progress! (%.*s)\n",
				     assert->node_idx,
				     namelen, name);
				goto kill;
			}
		} else /* mle->type == DLM_MLE_MIGRATION */ {
			/* should only be getting an assert from new master */
			if (assert->node_idx != mle->new_master) {
				mlog(ML_ERROR, "got assert from %u, but "
				     "new master is %u, and old master "
				     "was %u (%.*s)\n",
				     assert->node_idx, mle->new_master,
				     mle->master, namelen, name);
				goto kill;
			}

		}
ok:
		spin_unlock(&res->spinlock);
	}
	spin_unlock(&dlm->spinlock);

	// mlog(0, "woo!  got an assert_master from node %u!\n",
	// 	     assert->node_idx);
	if (mle) {
		int extra_ref = 0;
		int nn = -1;
		
		spin_lock(&mle->spinlock);
		if (mle->type == DLM_MLE_BLOCK ||
		    mle->type == DLM_MLE_MIGRATION)
			extra_ref = 1;
		else {
			/* MASTER mle: if any bits set in the response map
			 * then the calling node needs to re-assert to clear
			 * up nodes that this node contacted */
			while ((nn = find_next_bit(mle->response_map, O2NM_MAX_NODES,
						   nn+1)) < O2NM_MAX_NODES) {
				if (nn != dlm->node_num && nn != assert->node_idx)
					master_request = 1;
			}
		}
		mle->master = assert->node_idx;
		atomic_set(&mle->woken, 1);
		wake_up(&mle->wq);
		spin_unlock(&mle->spinlock);

		if (mle->type == DLM_MLE_MIGRATION && res) {
			mlog(0, "finishing off migration of lockres %.*s, "
			     "from %u to %u\n",
			       res->lockname.len, res->lockname.name,
			       dlm->node_num, mle->new_master);
			spin_lock(&res->spinlock);
			res->state &= ~DLM_LOCK_RES_MIGRATING;
			dlm_change_lockres_owner(dlm, res, mle->new_master);
			BUG_ON(res->state & DLM_LOCK_RES_DIRTY);
			spin_unlock(&res->spinlock);
		}
		/* master is known, detach if not already detached */
		dlm_mle_detach_hb_events(dlm, mle);
		dlm_put_mle(mle);
		
		if (extra_ref) {
			/* the assert master message now balances the extra
			 * ref given by the master / migration request message.
			 * if this is the last put, it will be removed
			 * from the list. */
			dlm_put_mle(mle);
		}
	}

done:
	ret = 0;
	if (res)
		dlm_lockres_put(res);
	dlm_put(dlm);
	if (master_request) {
		mlog(0, "need to tell master to reassert\n");
		ret = EAGAIN;  /* positive.  negative would shoot down the node. */
	}
	return ret;

kill:
	/* kill the caller! */
	spin_unlock(&res->spinlock);
	spin_unlock(&dlm->spinlock);
	dlm_lockres_put(res);
	mlog(ML_ERROR, "Bad message received from another node.  Dumping state "
	     "and killing the other node now!  This node is OK and can continue.\n");
	dlm_dump_lock_resources(dlm);
	dlm_put(dlm);
	return -EINVAL;
}
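
/*
 * Illustrative note, not taken from this file: the positive EAGAIN status
 * above travels back through o2net as the handler's return value, so a
 * hypothetical sender could react to it roughly like this (the local
 * variable names are made up for the sketch):
 *
 *	tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key,
 *				    &assert, sizeof(assert), to, &status);
 *	if (tmpret >= 0 && status == EAGAIN)
 *		reassert = 1;	<-- some contacted node still needs cleanup
 */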

int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
			       struct dlm_lock_resource *res,
			       int ignore_higher, u8 request_from, u32 flags)
{
	struct dlm_work_item *item;
	item = kcalloc(1, sizeof(*item), GFP_KERNEL);
	if (!item)
		return -ENOMEM;


	/* queue up work for dlm_assert_master_worker */
	dlm_grab(dlm);  /* get an extra ref for the work item */
	dlm_init_work_item(dlm, item, dlm_assert_master_worker, NULL);
	item->u.am.lockres = res; /* already have a ref */
	/* can optionally ignore node numbers higher than this node */
	item->u.am.ignore_higher = ignore_higher;
	item->u.am.request_from = request_from;
	item->u.am.flags = flags;

	if (ignore_higher)
		mlog(0, "IGNORE HIGHER: %.*s\n", res->lockname.len,
		     res->lockname.name);

	spin_lock(&dlm->work_lock);
	list_add_tail(&item->list, &dlm->work_list);
	spin_unlock(&dlm->work_lock);

	schedule_work(&dlm->dispatched_work);
	return 0;
}
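
/*
 * Hypothetical caller sketch, for illustration only (the real callers are
 * elsewhere in this file): the work item stores the lockres pointer, so a
 * caller is expected to hand over a reference together with it, and to drop
 * that reference again if the dispatch fails:
 *
 *	dlm_lockres_get(res);
 *	if (dlm_dispatch_assert_master(dlm, res, 0, request_from, 0) < 0)
 *		dlm_lockres_put(res);
 */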

static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
{
	struct dlm_ctxt *dlm;
	int ret = 0;
	struct dlm_lock_resource *res;
	unsigned long nodemap[BITS_TO_LONGS(O2NM_MAX_NODES)];
	int ignore_higher;
	int bit;
	u8 request_from;
	u32 flags;

	dlm = item->dlm;
	res = item->u.am.lockres;
	ignore_higher = item->u.am.ignore_higher;
	request_from = item->u.am.request_from;
	flags = item->u.am.flags;

	spin_lock(&dlm->spinlock);
	memcpy(nodemap, dlm->domain_map, sizeof(nodemap));
	spin_unlock(&dlm->spinlock);

	clear_bit(dlm->node_num, nodemap);
	if (ignore_higher) {
		/* if this is just to clear up mles for nodes below
		 * this node, do not send the message to the original
		 * caller or any node number higher than this */
		clear_bit(request_from, nodemap);
		bit = dlm->node_num;
		while (1) {
			bit = find_next_bit(nodemap, O2NM_MAX_NODES,
					    bit+1);
			if (bit >= O2NM_MAX_NODES)
				break;
			clear_bit(bit, nodemap);
		}
	}

	/* this call now finishes out the nodemap
	 * even if one or more nodes die */
	mlog(0, "worker about to master %.*s here, this=%u\n",
		     res->lockname.len, res->lockname.name, dlm->node_num);
	ret = dlm_do_assert_master(dlm, res->lockname.name,
				   res->lockname.len,
				   nodemap, flags);
	if (ret < 0) {
		/* no need to restart, we are done */
		mlog_errno(ret);
	}

	dlm_lockres_put(res);

	mlog(0, "finished with dlm_assert_master_worker\n");
}

/* SPECIAL CASE for the $RECOVERY lock used by the recovery thread.
 * We cannot wait for node recovery to complete to begin mastering this
 * lockres because this lockres is used to kick off recovery! ;-)
 * So, do a pre-check on all living nodes to see if any of those nodes
 * think that $RECOVERY is currently mastered by a dead node.  If so,
 * we wait a short time to allow that node to get notified by its own
 * heartbeat stack, then check again.  All $RECOVERY lock resources
 * mastered by dead nodes are purged when the heartbeat callback is
 * fired, so we know it is safe to continue once the query returns
 * either a live node or no node at all.  */
static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
				       struct dlm_lock_resource *res)
{
	struct dlm_node_iter iter;
	int nodenum;
	int ret = 0;
	u8 master = DLM_LOCK_RES_OWNER_UNKNOWN;

	spin_lock(&dlm->spinlock);
	dlm_node_iter_init(dlm->domain_map, &iter);
	spin_unlock(&dlm->spinlock);

	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
		/* do not send to self */
		if (nodenum == dlm->node_num)
			continue;
		ret = dlm_do_master_requery(dlm, res, nodenum, &master);
		if (ret < 0) {
			mlog_errno(ret);
			if (!dlm_is_host_down(ret))
				BUG();
			/* host is down, so answer for that node would be
			 * DLM_LOCK_RES_OWNER_UNKNOWN.  continue. */
		}

		if (master != DLM_LOCK_RES_OWNER_UNKNOWN) {
			/* check to see if this master is in the recovery map */
			spin_lock(&dlm->spinlock);
			if (test_bit(master, dlm->recovery_map)) {
				mlog(ML_NOTICE, "%s: node %u has not seen "
				     "node %u go down yet, and thinks the "
				     "dead node is mastering the recovery "
				     "lock.  must wait.\n", dlm->name,
				     nodenum, master);
				ret = -EAGAIN;
			}
			spin_unlock(&dlm->spinlock);
			mlog(0, "%s: reco lock master is %u\n", dlm->name, 
			     master);
			break;
		}
	}
	return ret;
}
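
/*
 * Rough usage sketch (an assumption, not copied from the actual caller):
 * the -EAGAIN return above is meant to be retried after a short sleep,
 * until every live node agrees that no dead node masters $RECOVERY:
 *
 *	while (dlm_pre_master_reco_lockres(dlm, res) == -EAGAIN)
 *		msleep(100);
 */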


/*
 * DLM_MIGRATE_LOCKRES
 */


int dlm_migrate_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
			u8 target)
{
	struct dlm_master_list_entry *mle = NULL;
	struct dlm_master_list_entry *oldmle = NULL;
	struct dlm_migratable_lockres *mres = NULL;
	int ret = -EINVAL;
	const char *name;
	unsigned int namelen;
	int mle_added = 0;
	struct list_head *queue, *iter;
	int i;
	struct dlm_lock *lock;
	int empty = 1;

	if (!dlm_grab(dlm))
		return -EINVAL;

	name = res->lockname.name;
	namelen = res->lockname.len;

	mlog(0, "migrating %.*s to %u\n", namelen, name, target);

	/*
	 * ensure this lockres is a proper candidate for migration
	 */
	spin_lock(&res->spinlock);
	if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
		mlog(0, "cannot migrate lockres with unknown owner!\n");
		spin_unlock(&res->spinlock);
		goto leave;
	}
	if (res->owner != dlm->node_num) {
		mlog(0, "cannot migrate lockres this node doesn't own!\n");
		spin_unlock(&res->spinlock);
		goto leave;
	}
	mlog(0, "checking queues...\n");
	queue = &res->granted;
	for (i = 0; i < 3; i++) {
		list_for_each(iter, queue) {
			lock = list_entry(iter, struct dlm_lock, list);
			empty = 0;
			if (lock->ml.node == dlm->node_num) {
				mlog(0, "found a lock owned by this node "
				     "still on the %s queue!  will not "
				     "migrate this lockres\n",
				     i==0 ? "granted" :
				     (i==1 ? "converting" : "blocked"));
				spin_unlock(&res->spinlock);
				ret = -ENOTEMPTY;
				goto leave;
			}
		}
		queue++;
	}
	mlog(0, "all locks on this lockres are nonlocal.  continuing\n");
	spin_unlock(&res->spinlock);

	/* no work to do */
	if (empty) {
		mlog(0, "no locks were found on this lockres! done!\n");
		ret = 0;
		goto leave;
	}

	/*
	 * preallocate up front
	 * if this fails, abort
	 */

	ret = -ENOMEM;
	mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_KERNEL);
	if (!mres) {
		mlog_errno(ret);
		goto leave;
	}

	mle = kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL);
	if (!mle) {
		mlog_errno(ret);
		goto leave;
	}
	ret = 0;

	/*
	 * find a node to migrate the lockres to
	 */

	mlog(0, "picking a migration node\n");
	spin_lock(&dlm->spinlock);
	/* pick a new node */
	/* check the range before touching the bitmap */
	if (target >= O2NM_MAX_NODES ||
	    !test_bit(target, dlm->domain_map)) {
		target = dlm_pick_migration_target(dlm, res);
	}
	mlog(0, "node %u chosen for migration\n", target);

	if (target >= O2NM_MAX_NODES ||
	    !test_bit(target, dlm->domain_map)) {
		/* target chosen is not alive */
		ret = -EINVAL;
	}

	if (ret) {
		spin_unlock(&dlm->spinlock);
		goto fail;
	}

	mlog(0, "continuing with target = %u\n", target);

	/*
	 * clear any existing master requests and
	 * add the migration mle to the list
	 */
	spin_lock(&dlm->master_lock);
	ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name,
				    namelen, target, dlm->node_num);
	spin_unlock(&dlm->master_lock);
	spin_unlock(&dlm->spinlock);

	if (ret == -EEXIST) {
		mlog(0, "another process is already migrating it\n");
		goto fail;
	}
	mle_added = 1;

	/*
	 * set the MIGRATING flag and flush asts
	 * if we fail after this we need to re-dirty the lockres
	 */
	if (dlm_mark_lockres_migrating(dlm, res, target) < 0) {
		mlog(ML_ERROR, "tried to migrate %.*s to %u, but "
		     "the target went down.\n", res->lockname.len,
		     res->lockname.name, target);
		spin_lock(&res->spinlock);
		res->state &= ~DLM_LOCK_RES_MIGRATING;
		spin_unlock(&res->spinlock);
		ret = -EINVAL;
	}

fail:
	if (oldmle) {
		/* master is known, detach if not already detached */
		dlm_mle_detach_hb_events(dlm, oldmle);
		dlm_put_mle(oldmle);
	}

	if (ret < 0) {
		if (mle_added) {
			dlm_mle_detach_hb_events(dlm, mle);
			dlm_put_mle(mle);
		} else if (mle) {
			kmem_cache_free(dlm_mle_cache, mle);
		}
		goto leave;
	}

	/*
	 * at this point, we have a migration target, an mle
	 * in the master list, and the MIGRATING flag set on
	 * the lockres
	 */


	/* get an extra reference on the mle.
	 * otherwise the assert_master from the new
	 * master will destroy this.
	 * also, make sure that all callers of dlm_get_mle
	 * take both dlm->spinlock and dlm->master_lock */
	spin_lock(&dlm->spinlock);
	spin_lock(&dlm->master_lock);
	dlm_get_mle(mle);
	spin_unlock(&dlm->master_lock);
	spin_unlock(&dlm->spinlock);

	/* notify new node and send all lock state */
	/* call send_one_lockres with migration flag.
	 * this serves as notice to the target node that a
	 * migration is starting. */
	ret = dlm_send_one_lockres(dlm, res, mres, target,
				   DLM_MRES_MIGRATION);

	if (ret < 0) {
		mlog(0, "migration to node %u failed with %d\n",
		     target, ret);
		/* migration failed, detach and clean up mle */
		dlm_mle_detach_hb_events(dlm, mle);
		dlm_put_mle(mle);
		dlm_put_mle(mle);
		goto leave;
	}

	/* at this point, the target sends a message to all nodes,
	 * (using dlm_do_migrate_request).  this node is skipped since
	 * we had to put an mle in the list to begin the process.  this
	 * node now waits for target to do an assert master.  this node
	 * will be the last one notified, ensuring that the migration
	 * is complete everywhere.  if the target dies while this is
	 * going on, some nodes could potentially see the target as the
	 * master, so it is important that this node's recovery finds
	 * the migration mle and sets the master to UNKNOWN. */
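
	/* rough sketch of the remaining flow, pieced together from the
	 * comment above and from dlm_finish_migration() further down
	 * (O = this/old master, T = target/new master, X = other nodes):
	 *
	 *   O --dlm_send_one_lockres(MIGRATION)--> T         (done above)
	 *   T --dlm_do_migrate_request()---------> X         (skips O and T)
	 *   T --dlm_do_assert_master()-----------> X, then O (O is last)
	 *   O waits on mle->wq below until T's assert arrives
	 */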


	/* wait for new node to assert master */
	while (1) {
		ret = wait_event_interruptible_timeout(mle->wq,
					(atomic_read(&mle->woken) == 1),
					msecs_to_jiffies(5000));

		if (ret >= 0) {
			if (atomic_read(&mle->woken) == 1 ||
			    res->owner == target)
				break;

			mlog(0, "timed out during migration\n");
			/* avoid hang during shutdown when migrating lockres 
			 * to a node which also goes down */
			if (dlm_is_node_dead(dlm, target)) {
				mlog(0, "%s:%.*s: expected migration target %u "
				     "is no longer up.  restarting.\n",
				     dlm->name, res->lockname.len,
				     res->lockname.name, target);
				ret = -ERESTARTSYS;
			}
		}
		if (ret == -ERESTARTSYS) {
			/* migration failed, detach and clean up mle */
			dlm_mle_detach_hb_events(dlm, mle);
			dlm_put_mle(mle);
			dlm_put_mle(mle);
			goto leave;
		}
		/* TODO: if node died: stop, clean up, return error */
	}

	/* all done, set the owner, clear the flag */
	spin_lock(&res->spinlock);
	dlm_set_lockres_owner(dlm, res, target);
	res->state &= ~DLM_LOCK_RES_MIGRATING;
	dlm_remove_nonlocal_locks(dlm, res);
	spin_unlock(&res->spinlock);
	wake_up(&res->wq);

	/* master is known, detach if not already detached */
	dlm_mle_detach_hb_events(dlm, mle);
	dlm_put_mle(mle);
	ret = 0;

	dlm_lockres_calc_usage(dlm, res);

leave:
	/* re-dirty the lockres if we failed */
	if (ret < 0)
		dlm_kick_thread(dlm, res);

	/* TODO: cleanup */
	if (mres)
		free_page((unsigned long)mres);

	dlm_put(dlm);

	mlog(0, "returning %d\n", ret);
	return ret;
}
EXPORT_SYMBOL_GPL(dlm_migrate_lockres);
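
/*
 * Hypothetical usage sketch for the exported function above (not taken from
 * an in-tree caller; the error handling is only illustrative):
 *
 *	ret = dlm_migrate_lockres(dlm, res, preferred_target);
 *	if (ret == -ENOTEMPTY)
 *		mlog(0, "local locks still attached, cannot migrate yet\n");
 *	else if (ret < 0)
 *		mlog_errno(ret);
 */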

int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock)
{
	int ret;
	spin_lock(&dlm->ast_lock);
	spin_lock(&lock->spinlock);
	ret = (list_empty(&lock->bast_list) && !lock->bast_pending);
	spin_unlock(&lock->spinlock);
	spin_unlock(&dlm->ast_lock);
	return ret;
}

static int dlm_migration_can_proceed(struct dlm_ctxt *dlm,
				     struct dlm_lock_resource *res,
				     u8 mig_target)
{
	int can_proceed;
	spin_lock(&res->spinlock);
	can_proceed = !!(res->state & DLM_LOCK_RES_MIGRATING);
	spin_unlock(&res->spinlock);

	/* target has died, so make the caller break out of the 
	 * wait_event, but caller must recheck the domain_map */
	spin_lock(&dlm->spinlock);
	if (!test_bit(mig_target, dlm->domain_map))
		can_proceed = 1;
	spin_unlock(&dlm->spinlock);
	return can_proceed;
}

int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
{
	int ret;
	spin_lock(&res->spinlock);
	ret = !!(res->state & DLM_LOCK_RES_DIRTY);
	spin_unlock(&res->spinlock);
	return ret;
}


static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,
				       struct dlm_lock_resource *res,
				       u8 target)
{
	int ret = 0;

	mlog(0, "dlm_mark_lockres_migrating: %.*s, from %u to %u\n",
	       res->lockname.len, res->lockname.name, dlm->node_num,
	       target);
	/* need to set MIGRATING flag on lockres.  this is done by
	 * ensuring that all asts have been flushed for this lockres. */
	spin_lock(&res->spinlock);
	BUG_ON(res->migration_pending);
	res->migration_pending = 1;
	/* strategy is to reserve an extra ast then release
	 * it below, letting the release do all of the work */
	__dlm_lockres_reserve_ast(res);
	spin_unlock(&res->spinlock);

	/* now flush all the pending asts.. hang out for a bit */
	dlm_kick_thread(dlm, res);
	wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res));
	dlm_lockres_release_ast(dlm, res);

	mlog(0, "about to wait on migration_wq, dirty=%s\n",
	       res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no");
	/* if the extra ref we just put was the final one, this
	 * will pass thru immediately.  otherwise, we need to wait
	 * for the last ast to finish. */
again:
	ret = wait_event_interruptible_timeout(dlm->migration_wq,
		   dlm_migration_can_proceed(dlm, res, target),
		   msecs_to_jiffies(1000));
	if (ret < 0) {
		mlog(0, "woken again: migrating? %s, dead? %s\n",
		       res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no",
		       test_bit(target, dlm->domain_map) ? "no":"yes");
	} else {
		mlog(0, "all is well: migrating? %s, dead? %s\n",
		       res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no",
		       test_bit(target, dlm->domain_map) ? "no":"yes");
	}
	if (!dlm_migration_can_proceed(dlm, res, target)) {
		mlog(0, "trying again...\n");
		goto again;
	}

	/* did the target go down or die? */
	spin_lock(&dlm->spinlock);
	if (!test_bit(target, dlm->domain_map)) {
		mlog(ML_ERROR, "aha. migration target %u just went down\n",
		     target);
		ret = -EHOSTDOWN;
	}
	spin_unlock(&dlm->spinlock);

	/*
	 * at this point:
	 *
	 *   o the DLM_LOCK_RES_MIGRATING flag is set
	 *   o there are no pending asts on this lockres
	 *   o all processes trying to reserve an ast on this
	 *     lockres must wait for the MIGRATING flag to clear
	 */
	return ret;
}

/* last step in the migration process.
 * original master calls this to free all of the dlm_lock
 * structures that used to be for other nodes. */
static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
				      struct dlm_lock_resource *res)
{
	struct list_head *iter, *iter2;
	struct list_head *queue = &res->granted;
	int i;
	struct dlm_lock *lock;

	assert_spin_locked(&res->spinlock);

	BUG_ON(res->owner == dlm->node_num);

	for (i = 0; i < 3; i++) {
		list_for_each_safe(iter, iter2, queue) {
			lock = list_entry(iter, struct dlm_lock, list);
			if (lock->ml.node != dlm->node_num) {
				mlog(0, "putting lock for node %u\n",
				     lock->ml.node);
				/* be extra careful */
				BUG_ON(!list_empty(&lock->ast_list));
				BUG_ON(!list_empty(&lock->bast_list));
				BUG_ON(lock->ast_pending);
				BUG_ON(lock->bast_pending);
				list_del_init(&lock->list);
				dlm_lock_put(lock);
			}
		}
		queue++;
	}
}

/* for now this is not too intelligent.  we will
 * need stats to make this do the right thing.
 * this just finds the first lock on one of the
 * queues and uses that node as the target. */
static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
				    struct dlm_lock_resource *res)
{
	int i;
	struct list_head *queue = &res->granted;
	struct list_head *iter;
	struct dlm_lock *lock;
	int nodenum;

	assert_spin_locked(&dlm->spinlock);

	spin_lock(&res->spinlock);
	for (i = 0; i < 3; i++) {
		list_for_each(iter, queue) {
			/* up to the caller to make sure this node
			 * is alive */
			lock = list_entry(iter, struct dlm_lock, list);
			if (lock->ml.node != dlm->node_num) {
				spin_unlock(&res->spinlock);
				return lock->ml.node;
			}
		}
		queue++;
	}
	spin_unlock(&res->spinlock);
	mlog(0, "have not found a suitable target yet! checking domain map\n");

	/* ok now we're getting desperate.  pick anyone alive. */
	nodenum = -1;
	while (1) {
		nodenum = find_next_bit(dlm->domain_map,
					O2NM_MAX_NODES, nodenum+1);
		mlog(0, "found %d in domain map\n", nodenum);
		if (nodenum >= O2NM_MAX_NODES)
			break;
		if (nodenum != dlm->node_num) {
			mlog(0, "picking %d\n", nodenum);
			return nodenum;
		}
	}

	mlog(0, "giving up.  no master to migrate to\n");
	return DLM_LOCK_RES_OWNER_UNKNOWN;
}



/* this is called by the new master once all lockres
 * data has been received */
static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
				  struct dlm_lock_resource *res,
				  u8 master, u8 new_master,
				  struct dlm_node_iter *iter)
{
	struct dlm_migrate_request migrate;
	int ret, status = 0;
	int nodenum;

	memset(&migrate, 0, sizeof(migrate));
	migrate.namelen = res->lockname.len;
	memcpy(migrate.name, res->lockname.name, migrate.namelen);
	migrate.new_master = new_master;
	migrate.master = master;

	ret = 0;

	/* send message to all nodes, except the master and myself */
	while ((nodenum = dlm_node_iter_next(iter)) >= 0) {
		if (nodenum == master ||
		    nodenum == new_master)
			continue;

		ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key,
					 &migrate, sizeof(migrate), nodenum,
					 &status);
		if (ret < 0) {
			mlog_errno(ret);
		} else if (status < 0) {
			mlog(0, "migrate request (node %u) returned %d!\n",
			     nodenum, status);
			ret = status;
		}
	}

	if (ret < 0)
		mlog_errno(ret);

	mlog(0, "returning ret=%d\n", ret);
	return ret;
}


/* if there is an existing mle for this lockres, we now know who the master
 * is: the node that sent us *this* message.  so we can clean it up right
 * away: since the process that put the mle on the list still holds a
 * reference, we can unhash it now, set the master and wake the process.
 * that leaves no stale mle in the list, so the migration mle we add next
 * should be the only one found by anyone scanning the list.  */
int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data)
{
	struct dlm_ctxt *dlm = data;
	struct dlm_lock_resource *res = NULL;
	struct dlm_migrate_request *migrate = (struct dlm_migrate_request *) msg->buf;
	struct dlm_master_list_entry *mle = NULL, *oldmle = NULL;
	const char *name;
	unsigned int namelen;
	int ret = 0;

	if (!dlm_grab(dlm))
		return -EINVAL;

	name = migrate->name;
	namelen = migrate->namelen;

	/* preallocate.. if this fails, abort */
	mle = kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL);

	if (!mle) {
		ret = -ENOMEM;
		goto leave;
	}

	/* check for pre-existing lock */
	spin_lock(&dlm->spinlock);
	res = __dlm_lookup_lockres(dlm, name, namelen);
	spin_lock(&dlm->master_lock);

	if (res) {
		spin_lock(&res->spinlock);
		if (res->state & DLM_LOCK_RES_RECOVERING) {
			/* if all is working ok, this can only mean that we got
			 * a migrate request from a node that we now see as
			 * dead.  what can we do here?  drop it to the floor? */
			spin_unlock(&res->spinlock);
			mlog(ML_ERROR, "Got a migrate request, but the "
			     "lockres is marked as recovering!\n");
			kmem_cache_free(dlm_mle_cache, mle);
			ret = -EINVAL; /* need a better solution */
			goto unlock;
		}
		res->state |= DLM_LOCK_RES_MIGRATING;
		spin_unlock(&res->spinlock);
	}

	/* ignore status.  only nonzero status would BUG. */
	ret = dlm_add_migration_mle(dlm, res, mle, &oldmle,
				    name, namelen,
				    migrate->new_master,
				    migrate->master);

unlock:
	spin_unlock(&dlm->master_lock);
	spin_unlock(&dlm->spinlock);

	if (oldmle) {
		/* master is known, detach if not already detached */
		dlm_mle_detach_hb_events(dlm, oldmle);
		dlm_put_mle(oldmle);
	}

	if (res)
		dlm_lockres_put(res);
leave:
	dlm_put(dlm);
	return ret;
}

/* must be holding dlm->spinlock and dlm->master_lock.
 * when adding a migration mle, we can clear any other mles
 * in the master list because we know with certainty that
 * the master is "master".  so we remove any old mle from
 * the list after setting its master field, and then add
 * the new migration mle.  this way we hold to the rule
 * of having only one mle for a given lock name at all times. */
static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
				 struct dlm_lock_resource *res,
				 struct dlm_master_list_entry *mle,
				 struct dlm_master_list_entry **oldmle,
				 const char *name, unsigned int namelen,
				 u8 new_master, u8 master)
{
	int found;
	int ret = 0;

	*oldmle = NULL;

	mlog_entry_void();

	assert_spin_locked(&dlm->spinlock);
	assert_spin_locked(&dlm->master_lock);

	/* caller is responsible for any ref taken here on oldmle */
	found = dlm_find_mle(dlm, oldmle, (char *)name, namelen);
	if (found) {
		struct dlm_master_list_entry *tmp = *oldmle;
		spin_lock(&tmp->spinlock);
		if (tmp->type == DLM_MLE_MIGRATION) {
			if (master == dlm->node_num) {
				/* ah another process raced me to it */
				mlog(0, "tried to migrate %.*s, but some "
				     "process beat me to it\n",
				     namelen, name);
				ret = -EEXIST;
			} else {
				/* bad.  2 NODES are trying to migrate! */
				mlog(ML_ERROR, "migration error  mle: "
				     "master=%u new_master=%u // request: "
				     "master=%u new_master=%u // "
				     "lockres=%.*s\n",
				     tmp->master, tmp->new_master,
				     master, new_master,
				     namelen, name);
				BUG();
			}
		} else {
			/* this is essentially what assert_master does */
			tmp->master = master;
			atomic_set(&tmp->woken, 1);
			wake_up(&tmp->wq);
			/* remove it from the list so that only one
			 * mle will be found */
			list_del_init(&tmp->list);
		}
		spin_unlock(&tmp->spinlock);
	}

	/* now add the migration mle to the list */
	dlm_init_mle(mle, DLM_MLE_MIGRATION, dlm, res, name, namelen);
	mle->new_master = new_master;
	mle->master = master;
	/* do this for consistency with other mle types */
	set_bit(new_master, mle->maybe_map);
	list_add(&mle->list, &dlm->master_list);

	return ret;
}


void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
{
	struct list_head *iter, *iter2;
	struct dlm_master_list_entry *mle;
	struct dlm_lock_resource *res;

	mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node);
top:
	assert_spin_locked(&dlm->spinlock);

	/* clean the master list */
	spin_lock(&dlm->master_lock);
	list_for_each_safe(iter, iter2, &dlm->master_list) {
		mle = list_entry(iter, struct dlm_master_list_entry, list);

		BUG_ON(mle->type != DLM_MLE_BLOCK &&
		       mle->type != DLM_MLE_MASTER &&
		       mle->type != DLM_MLE_MIGRATION);

		/* MASTER mles are initiated locally.  the waiting
		 * process will notice the node map change
		 * shortly.  let that happen as normal. */
		if (mle->type == DLM_MLE_MASTER)
			continue;


		/* BLOCK mles are initiated by other nodes.
		 * need to clean up if the dead node would have
		 * been the master. */
		if (mle->type == DLM_MLE_BLOCK) {
			int bit;

			spin_lock(&mle->spinlock);
			bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
			if (bit != dead_node) {
				mlog(0, "mle found, but dead node %u would "
				     "not have been master\n", dead_node);
				spin_unlock(&mle->spinlock);
			} else {
				/* must drop the refcount by one since the
				 * assert_master will never arrive.  this
				 * may result in the mle being unlinked and
				 * freed, but there may still be a process
				 * waiting in the dlmlock path which is fine. */
				mlog(ML_ERROR, "node %u was expected master\n",
				     dead_node);
				atomic_set(&mle->woken, 1);
				spin_unlock(&mle->spinlock);
				wake_up(&mle->wq);
				/* do not need events any longer, so detach 
				 * from heartbeat */
				__dlm_mle_detach_hb_events(dlm, mle);
				__dlm_put_mle(mle);
			}
			continue;
		}

		/* everything else is a MIGRATION mle */

		/* the rule for MIGRATION mles is that the master
		 * becomes UNKNOWN if *either* the original or
		 * the new master dies.  all UNKNOWN lockreses
		 * are sent to whichever node becomes the recovery
		 * master.  the new master is responsible for
		 * determining if there is still a master for
		 * this lockres, or if it needs to take over
		 * mastery.  either way, this node should expect
		 * another message to resolve this. */
		if (mle->master != dead_node &&
		    mle->new_master != dead_node)
			continue;

		/* if we have reached this point, this mle needs to
		 * be removed from the list and freed. */

		/* remove from the list early.  NOTE: unlinking
		 * list_head while in list_for_each_safe */
		spin_lock(&mle->spinlock);
		list_del_init(&mle->list);
		atomic_set(&mle->woken, 1);
		spin_unlock(&mle->spinlock);
		wake_up(&mle->wq);

		mlog(0, "node %u died during migration from "
		     "%u to %u!\n", dead_node,
		     mle->master, mle->new_master);
		/* if there is a lockres associated with this
		 * mle, find it and set its owner to UNKNOWN */
		res = __dlm_lookup_lockres(dlm, mle->u.name.name,
					mle->u.name.len);
		if (res) {
			/* unfortunately if we hit this rare case, our
			 * lock ordering is messed up.  we need to drop
			 * the master lock so that we can take the
			 * lockres lock, meaning that we will have to
			 * restart from the head of the list. */
			spin_unlock(&dlm->master_lock);

			/* move lockres onto recovery list */
			spin_lock(&res->spinlock);
			dlm_set_lockres_owner(dlm, res,
					      DLM_LOCK_RES_OWNER_UNKNOWN);
			dlm_move_lockres_to_recovery_list(dlm, res);
			spin_unlock(&res->spinlock);
			dlm_lockres_put(res);

			/* about to get rid of mle, detach from heartbeat */
			__dlm_mle_detach_hb_events(dlm, mle);

			/* dump the mle */
			spin_lock(&dlm->master_lock);
			__dlm_put_mle(mle);
			spin_unlock(&dlm->master_lock);

			/* restart */
			goto top;
		}

		/* this may be the last reference */
		__dlm_put_mle(mle);
	}
	spin_unlock(&dlm->master_lock);
}
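
/*
 * Quick summary of dlm_clean_master_list(), restated from the logic above:
 *
 *	MASTER mle    - left alone; the local waiter notices the node map
 *	                change on its own
 *	BLOCK mle     - if the dead node would have been the master, drop the
 *	                reference that its assert_master would have balanced
 *	MIGRATION mle - if either the old or the new master died, unlink the
 *	                mle and hand the lockres to recovery with the owner
 *	                set to UNKNOWN
 */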


int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
			 u8 old_master)
{
	struct dlm_node_iter iter;
	int ret = 0;

	spin_lock(&dlm->spinlock);
	dlm_node_iter_init(dlm->domain_map, &iter);
	clear_bit(old_master, iter.node_map);
	clear_bit(dlm->node_num, iter.node_map);
	spin_unlock(&dlm->spinlock);

	mlog(0, "now time to do a migrate request to other nodes\n");
	ret = dlm_do_migrate_request(dlm, res, old_master,
				     dlm->node_num, &iter);
	if (ret < 0) {
		mlog_errno(ret);
		goto leave;
	}

	mlog(0, "doing assert master of %.*s to all except the original node\n",
	     res->lockname.len, res->lockname.name);
	/* this call now finishes out the nodemap
	 * even if one or more nodes die */
	ret = dlm_do_assert_master(dlm, res->lockname.name,
				   res->lockname.len, iter.node_map,
				   DLM_ASSERT_MASTER_FINISH_MIGRATION);
	if (ret < 0) {
		/* no longer need to retry.  all living nodes contacted. */
		mlog_errno(ret);
		ret = 0;
	}

	memset(iter.node_map, 0, sizeof(iter.node_map));
	set_bit(old_master, iter.node_map);
	mlog(0, "doing assert master of %.*s back to %u\n",
	     res->lockname.len, res->lockname.name, old_master);
	ret = dlm_do_assert_master(dlm, res->lockname.name,
				   res->lockname.len, iter.node_map,
				   DLM_ASSERT_MASTER_FINISH_MIGRATION);
	if (ret < 0) {
		mlog(0, "assert master to original master failed "
		     "with %d.\n", ret);
		/* the only nonzero status here would be because of
		 * a dead original node.  we're done. */
		ret = 0;
	}

	/* all done, set the owner, clear the flag */
	spin_lock(&res->spinlock);
	dlm_set_lockres_owner(dlm, res, dlm->node_num);
	res->state &= ~DLM_LOCK_RES_MIGRATING;
	spin_unlock(&res->spinlock);
	/* re-dirty it on the new master */
	dlm_kick_thread(dlm, res);
	wake_up(&res->wq);
leave:
	return ret;
}

/*
 * LOCKRES AST REFCOUNT
 * this is integral to migration
 */

/* for future intent to call an ast, reserve one ahead of time.
 * this should be called only after waiting on the lockres
 * with dlm_wait_on_lockres, and while still holding the
 * spinlock after the call. */
void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res)
{
	assert_spin_locked(&res->spinlock);
	if (res->state & DLM_LOCK_RES_MIGRATING)
		__dlm_print_one_lock_resource(res);
	BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);

	atomic_inc(&res->asts_reserved);
}
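
/*
 * Rough sketch (an assumption, mirroring how dlm_mark_lockres_migrating()
 * above uses this pair) of the reserve/release protocol for asts:
 *
 *	spin_lock(&res->spinlock);
 *	__dlm_lockres_reserve_ast(res);
 *	spin_unlock(&res->spinlock);
 *	... queue and deliver the ast or bast ...
 *	dlm_lockres_release_ast(dlm, res);
 */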

/*
 * used to drop the reserved ast, either because it went unused,
 * or because the ast/bast was actually called.
 *
 * also, if there is a pending migration on this lockres,
 * and this was the last pending ast on the lockres,
 * atomically set the MIGRATING flag before we drop the lock.
 * this is how we ensure that migration can proceed with no
 * asts in progress.  note that it is ok if the state of the
 * queues is such that a lock should be granted in the future
 * or that a bast should be fired, because the new master will
 * shuffle the lists on this lockres as soon as it is migrated.
 */
void dlm_lockres_release_ast(struct dlm_ctxt *dlm,
			     struct dlm_lock_resource *res)
{
	if (!atomic_dec_and_lock(&res->asts_reserved, &res->spinlock))
		return;

	if (!res->migration_pending) {
		spin_unlock(&res->spinlock);
		return;
	}

	BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
	res->migration_pending = 0;
	res->state |= DLM_LOCK_RES_MIGRATING;
	spin_unlock(&res->spinlock);
	wake_up(&res->wq);
	wake_up(&dlm->migration_wq);
}