workqueue: reimplement idle worker rebinding

Currently rebind_workers() uses rebinds idle workers synchronously before proceeding to requesting busy workers to rebind. This is necessary because all workers on @worker_pool->idle_list must be bound before concurrency management local wake-ups from the busy workers take place. Unfortunately, the synchronous idle rebinding is quite complicated. This patch reimplements idle rebinding to simplify the code path. Rather than trying to make all idle workers bound before rebinding busy workers, we simply remove all to-be-bound idle workers from the idle list and let them add themselves back after completing rebinding (successful or not). As only workers which finished rebinding can on on the idle worker list, the idle worker list is guaranteed to have only bound workers unless CPU went down again and local wake-ups are safe. After the change, @worker_pool->nr_idle may deviate than the actual number of idle workers on @worker_pool->idle_list. More specifically, nr_idle may be non-zero while ->idle_list is empty. All users of ->nr_idle and ->idle_list are audited. The only affected one is too_many_workers() which is updated to check %false if ->idle_list is empty regardless of ->nr_idle. After this patch, rebind_workers() no longer performs the nasty idle-rebind retries which require temporary release of gcwq->lock, and both unbinding and rebinding are atomic w.r.t. global_cwq->lock. worker->idle_rebind and global_cwq->rebind_hold are now unnecessary and removed along with the definition of struct idle_rebind. Changed from V1: 1) remove unlikely from too_many_workers(), ->idle_list can be empty anytime, even before this patch, no reason to use unlikely. 2) fix a small rebasing mistake. (which is from rebasing the orignal fixing patch to for-next) 3) add a lot of comments. 4) clear WORKER_REBIND unconditionaly in idle_worker_rebind() tj: Updated comments and description. Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com> Signed-off-by: Tejun Heo <tj@kernel.org>
author: Lai Jiangshan 2012-09-18 18:59:22 +0200
committer: Tejun Heo 2012-09-18 18:59:22 +0200
commit: ea1abd6197d5805655da1bb589929762f4b4aa08 (patch)
tree: 6ba4ac400e9243622558b852583d1cdf3ef61b1c /kernel/workqueue.c
parent: Merge branch 'for-3.6-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git... (diff)
download: kernel-qcow2-linux-ea1abd6197d5805655da1bb589929762f4b4aa08.tar.gz
kernel-qcow2-linux-ea1abd6197d5805655da1bb589929762f4b4aa08.tar.xz
kernel-qcow2-linux-ea1abd6197d5805655da1bb589929762f4b4aa08.zip
1 files changed, 42 insertions, 99 deletions
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 31d8a4586d4c..770c1a8128bf 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -126,7 +126,6 @@ enum {
 
 struct global_cwq;
 struct worker_pool;
-struct idle_rebind;
 
 /*
  * The poor guys doing the actual heavy lifting.  All on-duty workers
@@ -150,7 +149,6 @@ struct worker {
 	int			id;		/* I: worker id */
 
 	/* for rebinding worker to CPU */
-	struct idle_rebind	*idle_rebind;	/* L: for idle worker */
 	struct work_struct	rebind_work;	/* L: for busy worker */
 };
 
@@ -160,6 +158,8 @@ struct worker_pool {
 
 	struct list_head	worklist;	/* L: list of pending works */
 	int			nr_workers;	/* L: total number of workers */
+
+	/* nr_idle includes the ones off idle_list for rebinding */
 	int			nr_idle;	/* L: currently idle ones */
 
 	struct list_head	idle_list;	/* X: list of idle workers */
@@ -186,8 +186,6 @@ struct global_cwq {
 
 	struct worker_pool	pools[NR_WORKER_POOLS];
 						/* normal and highpri pools */
-
-	wait_queue_head_t	rebind_hold;	/* rebind hold wait */
 } ____cacheline_aligned_in_smp;
 
 /*
@@ -687,6 +685,13 @@ static bool too_many_workers(struct worker_pool *pool)
 	int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
 	int nr_busy = pool->nr_workers - nr_idle;
 
+	/*
+	 * nr_idle and idle_list may disagree if idle rebinding is in
+	 * progress.  Never return %true if idle_list is empty.
+	 */
+	if (list_empty(&pool->idle_list))
+		return false;
+
 	return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
 }
 
@@ -1611,37 +1616,26 @@ __acquires(&gcwq->lock)
 	}
 }
 
-struct idle_rebind {
-	int			cnt;		/* # workers to be rebound */
-	struct completion	done;		/* all workers rebound */
-};
-
 /*
- * Rebind an idle @worker to its CPU.  During CPU onlining, this has to
- * happen synchronously for idle workers.  worker_thread() will test
+ * Rebind an idle @worker to its CPU.  worker_thread() will test
  * %WORKER_REBIND before leaving idle and call this function.
  */
 static void idle_worker_rebind(struct worker *worker)
 {
 	struct global_cwq *gcwq = worker->pool->gcwq;
 
-	/* CPU must be online at this point */
-	WARN_ON(!worker_maybe_bind_and_lock(worker));
-	if (!--worker->idle_rebind->cnt)
-		complete(&worker->idle_rebind->done);
-	spin_unlock_irq(&worker->pool->gcwq->lock);
-
-	/* we did our part, wait for rebind_workers() to finish up */
-	wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND));
-
 	/*
-	 * rebind_workers() shouldn't finish until all workers passed the
-	 * above WORKER_REBIND wait.  Tell it when done.
+	 * CPU may go down again inbetween.  If rebinding fails, reinstate
+	 * UNBOUND.  We're off idle_list and nobody else can do it for us.
 	 */
-	spin_lock_irq(&worker->pool->gcwq->lock);
-	if (!--worker->idle_rebind->cnt)
-		complete(&worker->idle_rebind->done);
-	spin_unlock_irq(&worker->pool->gcwq->lock);
+	if (!worker_maybe_bind_and_lock(worker))
+		worker->flags |= WORKER_UNBOUND;
+
+	worker_clr_flags(worker, WORKER_REBIND);
+
+	/* rebind complete, become available again */
+	list_add(&worker->entry, &worker->pool->idle_list);
+	spin_unlock_irq(&gcwq->lock);
 }
 
 /*
@@ -1676,29 +1670,25 @@ static void busy_worker_rebind_fn(struct work_struct *work)
  * @gcwq->cpu is coming online.  Rebind all workers to the CPU.  Rebinding
  * is different for idle and busy ones.
  *
- * The idle ones should be rebound synchronously and idle rebinding should
- * be complete before any worker starts executing work items with
- * concurrency management enabled; otherwise, scheduler may oops trying to
- * wake up non-local idle worker from wq_worker_sleeping().
- *
- * This is achieved by repeatedly requesting rebinding until all idle
- * workers are known to have been rebound under @gcwq->lock and holding all
- * idle workers from becoming busy until idle rebinding is complete.
+ * Idle ones will be removed from the idle_list and woken up.  They will
+ * add themselves back after completing rebind.  This ensures that the
+ * idle_list doesn't contain any unbound workers when re-bound busy workers
+ * try to perform local wake-ups for concurrency management.
  *
- * Once idle workers are rebound, busy workers can be rebound as they
- * finish executing their current work items.  Queueing the rebind work at
- * the head of their scheduled lists is enough.  Note that nr_running will
- * be properbly bumped as busy workers rebind.
+ * Busy workers can rebind after they finish their current work items.
+ * Queueing the rebind work item at the head of the scheduled list is
+ * enough.  Note that nr_running will be properly bumped as busy workers
+ * rebind.
  *
- * On return, all workers are guaranteed to either be bound or have rebind
- * work item scheduled.
+ * On return, all non-manager workers are scheduled for rebind - see
+ * manage_workers() for the manager special case.  Any idle worker
+ * including the manager will not appear on @idle_list until rebind is
+ * complete, making local wake-ups safe.
  */
 static void rebind_workers(struct global_cwq *gcwq)
-	__releases(&gcwq->lock) __acquires(&gcwq->lock)
 {
-	struct idle_rebind idle_rebind;
 	struct worker_pool *pool;
-	struct worker *worker;
+	struct worker *worker, *n;
 	struct hlist_node *pos;
 	int i;
 
@@ -1707,46 +1697,29 @@ static void rebind_workers(struct global_cwq *gcwq)
 	for_each_worker_pool(pool, gcwq)
 		lockdep_assert_held(&pool->manager_mutex);
 
-	/*
-	 * Rebind idle workers.  Interlocked both ways.  We wait for
-	 * workers to rebind via @idle_rebind.done.  Workers will wait for
-	 * us to finish up by watching %WORKER_REBIND.
-	 */
-	init_completion(&idle_rebind.done);
-retry:
-	idle_rebind.cnt = 1;
-	INIT_COMPLETION(idle_rebind.done);
-
-	/* set REBIND and kick idle ones, we'll wait for these later */
+	/* set REBIND and kick idle ones */
 	for_each_worker_pool(pool, gcwq) {
-		list_for_each_entry(worker, &pool->idle_list, entry) {
+		list_for_each_entry_safe(worker, n, &pool->idle_list, entry) {
 			unsigned long worker_flags = worker->flags;
 
-			if (worker->flags & WORKER_REBIND)
-				continue;
-
 			/* morph UNBOUND to REBIND atomically */
 			worker_flags &= ~WORKER_UNBOUND;
 			worker_flags |= WORKER_REBIND;
 			ACCESS_ONCE(worker->flags) = worker_flags;
 
-			idle_rebind.cnt++;
-			worker->idle_rebind = &idle_rebind;
+			/*
+			 * idle workers should be off @pool->idle_list
+			 * until rebind is complete to avoid receiving
+			 * premature local wake-ups.
+			 */
+			list_del_init(&worker->entry);
 
 			/* worker_thread() will call idle_worker_rebind() */
 			wake_up_process(worker->task);
 		}
 	}
 
-	if (--idle_rebind.cnt) {
-		spin_unlock_irq(&gcwq->lock);
-		wait_for_completion(&idle_rebind.done);
-		spin_lock_irq(&gcwq->lock);
-		/* busy ones might have become idle while waiting, retry */
-		goto retry;
-	}
-
-	/* all idle workers are rebound, rebind busy workers */
+	/* rebind busy workers */
 	for_each_busy_worker(worker, i, pos, gcwq) {
 		unsigned long worker_flags = worker->flags;
 		struct work_struct *rebind_work = &worker->rebind_work;
@@ -1776,34 +1749,6 @@ retry:
 			worker->scheduled.next,
 			work_color_to_flags(WORK_NO_COLOR));
 	}
-
-	/*
-	 * All idle workers are rebound and waiting for %WORKER_REBIND to
-	 * be cleared inside idle_worker_rebind().  Clear and release.
-	 * Clearing %WORKER_REBIND from this foreign context is safe
-	 * because these workers are still guaranteed to be idle.
-	 *
-	 * We need to make sure all idle workers passed WORKER_REBIND wait
-	 * in idle_worker_rebind() before returning; otherwise, workers can
-	 * get stuck at the wait if hotplug cycle repeats.
-	 */
-	idle_rebind.cnt = 1;
-	INIT_COMPLETION(idle_rebind.done);
-
-	for_each_worker_pool(pool, gcwq) {
-		list_for_each_entry(worker, &pool->idle_list, entry) {
-			worker->flags &= ~WORKER_REBIND;
-			idle_rebind.cnt++;
-		}
-	}
-
-	wake_up_all(&gcwq->rebind_hold);
-
-	if (--idle_rebind.cnt) {
-		spin_unlock_irq(&gcwq->lock);
-		wait_for_completion(&idle_rebind.done);
-		spin_lock_irq(&gcwq->lock);
-	}
 }
 
 static struct worker *alloc_worker(void)
@@ -3916,8 +3861,6 @@ static int __init init_workqueues(void)
 			mutex_init(&pool->manager_mutex);
 			ida_init(&pool->worker_ida);
 		}
-
-		init_waitqueue_head(&gcwq->rebind_hold);
 	}
 
 	/* create the initial worker */
author	Lai Jiangshan	2012-09-18 18:59:22 +0200
committer	Tejun Heo	2012-09-18 18:59:22 +0200
commit	ea1abd6197d5805655da1bb589929762f4b4aa08 (patch)
tree	6ba4ac400e9243622558b852583d1cdf3ef61b1c /kernel/workqueue.c
parent	Merge branch 'for-3.6-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git... (diff)
download	kernel-qcow2-linux-ea1abd6197d5805655da1bb589929762f4b4aa08.tar.gz kernel-qcow2-linux-ea1abd6197d5805655da1bb589929762f4b4aa08.tar.xz kernel-qcow2-linux-ea1abd6197d5805655da1bb589929762f4b4aa08.zip