6900839 taskq_ent_alloc: TQ_SLEEP performance cliff when tq_nalloc > tq_maxalloc
author	Chris Horne <Chris.Horne@Sun.COM>
Thu, 04 Mar 2010 12:58:34 -0700
changeset 11854 5351ddd19d45
parent 11853 2adca8ef2c23
child 11855 606f75cdda48
6900839 taskq_ent_alloc: TQ_SLEEP performance cliff when tq_nalloc > tq_maxalloc
6928985 taskq filled with taskq_bucket_extend calls will not help
6928184 Anago: On takeover, one node fails to rejoin cluster, appliance kit won't start on the other node
usr/src/lib/libzpool/common/taskq.c
usr/src/uts/common/os/damap.c
usr/src/uts/common/os/taskq.c
usr/src/uts/common/sys/taskq_impl.h
--- a/usr/src/lib/libzpool/common/taskq.c	Thu Mar 04 10:02:10 2010 -0800
+++ b/usr/src/lib/libzpool/common/taskq.c	Thu Mar 04 12:58:34 2010 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -49,6 +49,8 @@
 	int		tq_nalloc;
 	int		tq_minalloc;
 	int		tq_maxalloc;
+	kcondvar_t	tq_maxalloc_cv;
+	int		tq_maxalloc_wait;
 	task_t		*tq_freelist;
 	task_t		tq_task;
 };
@@ -57,26 +59,36 @@
 task_alloc(taskq_t *tq, int tqflags)
 {
 	task_t *t;
+	int rv;
 
-	if ((t = tq->tq_freelist) != NULL && tq->tq_nalloc >= tq->tq_minalloc) {
+again:	if ((t = tq->tq_freelist) != NULL && tq->tq_nalloc >= tq->tq_minalloc) {
 		tq->tq_freelist = t->task_next;
 	} else {
-		mutex_exit(&tq->tq_lock);
 		if (tq->tq_nalloc >= tq->tq_maxalloc) {
-			if (!(tqflags & KM_SLEEP)) {
-				mutex_enter(&tq->tq_lock);
+			if (!(tqflags & KM_SLEEP))
 				return (NULL);
-			}
+
 			/*
 			 * We don't want to exceed tq_maxalloc, but we can't
 			 * wait for other tasks to complete (and thus free up
 			 * task structures) without risking deadlock with
 			 * the caller.  So, we just delay for one second
-			 * to throttle the allocation rate.
+			 * to throttle the allocation rate. If tasks complete
+			 * before the one-second timeout expires, then
+			 * taskq_ent_free will signal us and we will
+			 * immediately retry the allocation.
 			 */
-			delay(hz);
+			tq->tq_maxalloc_wait++;
+			rv = cv_timedwait(&tq->tq_maxalloc_cv,
+			    &tq->tq_lock, ddi_get_lbolt() + hz);
+			tq->tq_maxalloc_wait--;
+			if (rv > 0)
+				goto again;		/* signaled */
 		}
+		mutex_exit(&tq->tq_lock);
+
 		t = kmem_alloc(sizeof (task_t), tqflags);
+
 		mutex_enter(&tq->tq_lock);
 		if (t != NULL)
 			tq->tq_nalloc++;
@@ -96,6 +108,9 @@
 		kmem_free(t, sizeof (task_t));
 		mutex_enter(&tq->tq_lock);
 	}
+
+	if (tq->tq_maxalloc_wait)
+		cv_signal(&tq->tq_maxalloc_cv);
 }
 
 taskqid_t
@@ -196,6 +211,7 @@
 	mutex_init(&tq->tq_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&tq->tq_dispatch_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&tq->tq_wait_cv, NULL, CV_DEFAULT, NULL);
+	cv_init(&tq->tq_maxalloc_cv, NULL, CV_DEFAULT, NULL);
 	tq->tq_flags = flags | TASKQ_ACTIVE;
 	tq->tq_active = nthreads;
 	tq->tq_nthreads = nthreads;
@@ -252,6 +268,7 @@
 	mutex_destroy(&tq->tq_lock);
 	cv_destroy(&tq->tq_dispatch_cv);
 	cv_destroy(&tq->tq_wait_cv);
+	cv_destroy(&tq->tq_maxalloc_cv);
 
 	kmem_free(tq, sizeof (taskq_t));
 }
--- a/usr/src/uts/common/os/damap.c	Thu Mar 04 10:02:10 2010 -0800
+++ b/usr/src/uts/common/os/damap.c	Thu Mar 04 12:58:34 2010 -0700
@@ -84,6 +84,8 @@
  */
 #define	DAM_SIZE_BUMP	64
 
+int	damap_taskq_dispatch_retry_usec = 1000;
+
 /*
  * config/unconfig taskq data
  */
@@ -1052,7 +1054,7 @@
 				tqd->tqd_mapp = mapp;
 				tqd->tqd_id = i;
 				(void) taskq_dispatch(tqp, dam_tq_config,
-				    tqd, KM_SLEEP);
+				    tqd, TQ_SLEEP);
 			}
 		}
 	}
@@ -1097,7 +1099,7 @@
 				tqd->tqd_mapp = mapp;
 				tqd->tqd_id = i;
 				(void) taskq_dispatch(tqp,
-				    dam_tq_unconfig, tqd, KM_SLEEP);
+				    dam_tq_unconfig, tqd, TQ_SLEEP);
 			}
 		}
 	}
@@ -1330,12 +1332,21 @@
 	 */
 	if (spend) {
 		if (taskq_dispatch(system_taskq, dam_stabilize_map,
-		    mapp, TQ_NOSLEEP)) {
+		    mapp, TQ_NOSLEEP | TQ_NOQUEUE)) {
 			mapp->dam_flags  |= DAM_SPEND;
 			DTRACE_PROBE2(damap__map__addr__stable__start, char *,
 			    mapp->dam_name, dam_t *, mapp);
 		} else {
 			tpend++;
+
+			/*
+			 * Avoid waiting the entire stabilization
+			 * time again if taskq_dispatch fails.
+			 */
+			tmo_delta = drv_usectohz(
+			    damap_taskq_dispatch_retry_usec);
+			if (tmo_delta < next_tmov)
+				next_tmov = tmo_delta;
 		}
 	}
 
@@ -1371,11 +1382,15 @@
 	 * else dispatch the task to configure the new stable set of
 	 * addresses.
 	 */
-	if ((mapp->dam_flags & DAM_SPEND) || (taskq_dispatch(system_taskq,
-	    dam_stabilize_map, mapp, TQ_NOSLEEP) == NULL)) {
+	if ((mapp->dam_flags & DAM_SPEND) ||
+	    (taskq_dispatch(system_taskq, dam_stabilize_map, mapp,
+	    TQ_NOSLEEP | TQ_NOQUEUE) == NULL)) {
 		DAM_INCR_STAT(mapp, dam_overrun);
 		mapp->dam_stable_overrun++;
-		dam_sched_tmo(mapp, mapp->dam_stabletmo, dam_addrset_stable_cb);
+		dam_sched_tmo(mapp,
+		    drv_usectohz(damap_taskq_dispatch_retry_usec),
+		    dam_addrset_stable_cb);
+
 		DTRACE_PROBE2(damap__map__addrset__stable__overrun, char *,
 		    mapp->dam_name, dam_t *, mapp);
 		mutex_exit(&mapp->dam_lock);
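
The damap change has two parts: stabilization work is now dispatched with TQ_NOSLEEP | TQ_NOQUEUE so that a full backing queue fails fast instead of piling on more work, and a failed dispatch re-arms the timer with the short damap_taskq_dispatch_retry_usec delay rather than another full stabilization interval. The sketch below shows only that retry shape; dispatch_work() and arm_timeout() are hypothetical stand-ins for taskq_dispatch() and dam_sched_tmo(), and the tick conversion is simplified.

	#include <stdbool.h>

	#define	DISPATCH_RETRY_USEC	1000	/* mirrors damap_taskq_dispatch_retry_usec */
	#define	USEC_PER_TICK		10000	/* assumes a 100Hz clock; illustration only */

	/* Trivial stand-ins so the sketch compiles; the real calls are kernel APIs. */
	static bool dispatch_work(void *arg) { (void) arg; return (false); }
	static void arm_timeout(void *arg, long ticks) { (void) arg; (void) ticks; }

	/*
	 * Timeout callback: try to hand the stable address set to a worker;
	 * if the dispatch fails, retry after a short delay rather than
	 * waiting out another full stabilization interval.
	 */
	static void
	stabilize_cb(void *arg, long stable_ticks)
	{
		long retry_ticks;

		if (dispatch_work(arg))
			return;		/* stabilization task is queued or running */

		retry_ticks = DISPATCH_RETRY_USEC / USEC_PER_TICK;
		if (retry_ticks < 1)
			retry_ticks = 1;
		if (retry_ticks > stable_ticks)
			retry_ticks = stable_ticks;
		arm_timeout(arg, retry_ticks);
	}
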
--- a/usr/src/uts/common/os/taskq.c	Thu Mar 04 10:02:10 2010 -0800
+++ b/usr/src/uts/common/os/taskq.c	Thu Mar 04 12:58:34 2010 -0700
@@ -564,6 +564,7 @@
 static void taskq_ent_destructor(void *, void *);
 static taskq_ent_t *taskq_ent_alloc(taskq_t *, int);
 static void taskq_ent_free(taskq_t *, taskq_ent_t *);
+static int taskq_ent_exists(taskq_t *, task_func_t, void *);
 static taskq_ent_t *taskq_bucket_dispatch(taskq_bucket_t *, task_func_t,
     void *);
 
@@ -763,6 +764,7 @@
 	cv_init(&tq->tq_dispatch_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&tq->tq_exit_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&tq->tq_wait_cv, NULL, CV_DEFAULT, NULL);
+	cv_init(&tq->tq_maxalloc_cv, NULL, CV_DEFAULT, NULL);
 
 	tq->tq_task.tqent_next = &tq->tq_task;
 	tq->tq_task.tqent_prev = &tq->tq_task;
@@ -786,6 +788,7 @@
 	cv_destroy(&tq->tq_dispatch_cv);
 	cv_destroy(&tq->tq_exit_cv);
 	cv_destroy(&tq->tq_wait_cv);
+	cv_destroy(&tq->tq_maxalloc_cv);
 }
 
 /*ARGSUSED*/
@@ -952,8 +955,9 @@
 taskq_ent_alloc(taskq_t *tq, int flags)
 {
 	int kmflags = (flags & TQ_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
-
 	taskq_ent_t *tqe;
+	clock_t wait_time;
+	clock_t	wait_rv;
 
 	ASSERT(MUTEX_HELD(&tq->tq_lock));
 
@@ -961,29 +965,44 @@
 	 * TQ_NOALLOC allocations are allowed to use the freelist, even if
 	 * we are below tq_minalloc.
 	 */
-	if ((tqe = tq->tq_freelist) != NULL &&
+again:	if ((tqe = tq->tq_freelist) != NULL &&
 	    ((flags & TQ_NOALLOC) || tq->tq_nalloc >= tq->tq_minalloc)) {
 		tq->tq_freelist = tqe->tqent_next;
 	} else {
 		if (flags & TQ_NOALLOC)
 			return (NULL);
 
-		mutex_exit(&tq->tq_lock);
 		if (tq->tq_nalloc >= tq->tq_maxalloc) {
-			if (kmflags & KM_NOSLEEP) {
-				mutex_enter(&tq->tq_lock);
+			if (kmflags & KM_NOSLEEP)
 				return (NULL);
-			}
+
 			/*
 			 * We don't want to exceed tq_maxalloc, but we can't
 			 * wait for other tasks to complete (and thus free up
 			 * task structures) without risking deadlock with
 			 * the caller.  So, we just delay for one second
-			 * to throttle the allocation rate.
+			 * to throttle the allocation rate. If tasks complete
+			 * before the one-second timeout expires, then
+			 * taskq_ent_free will signal us and we will
+			 * immediately retry the allocation (reap the freelist).
 			 */
-			delay(hz);
+			wait_time = ddi_get_lbolt() + hz;
+			while (tq->tq_freelist == NULL) {
+				tq->tq_maxalloc_wait++;
+				wait_rv = cv_timedwait(&tq->tq_maxalloc_cv,
+				    &tq->tq_lock, wait_time);
+				tq->tq_maxalloc_wait--;
+				if (wait_rv == -1)
+					break;
+			}
+			if (tq->tq_freelist)
+				goto again;		/* reap freelist */
+
 		}
+		mutex_exit(&tq->tq_lock);
+
 		tqe = kmem_cache_alloc(taskq_ent_cache, kmflags);
+
 		mutex_enter(&tq->tq_lock);
 		if (tqe != NULL)
 			tq->tq_nalloc++;
@@ -1013,6 +1032,30 @@
 		kmem_cache_free(taskq_ent_cache, tqe);
 		mutex_enter(&tq->tq_lock);
 	}
+
+	if (tq->tq_maxalloc_wait)
+		cv_signal(&tq->tq_maxalloc_cv);
+}
+
+/*
+ * taskq_ent_exists()
+ *
+ * Return 1 if the taskq already has an entry for calling 'func(arg)'.
+ *
+ * Assumes: tq->tq_lock is held.
+ */
+static int
+taskq_ent_exists(taskq_t *tq, task_func_t func, void *arg)
+{
+	taskq_ent_t	*tqe;
+
+	ASSERT(MUTEX_HELD(&tq->tq_lock));
+
+	for (tqe = tq->tq_task.tqent_next; tqe != &tq->tq_task;
+	    tqe = tqe->tqent_next)
+		if ((tqe->tqent_func == func) && (tqe->tqent_arg == arg))
+			return (1);
+	return (0);
 }
 
 /*
@@ -1199,15 +1242,19 @@
 	}
 
 	ASSERT(bucket != NULL);
+
 	/*
-	 * Since there are not enough free entries in the bucket, extend it
-	 * in the background using backing queue.
+	 * Since there are not enough free entries in the bucket, add a
+	 * taskq entry to extend it in the background using backing queue
+	 * (unless we already have a taskq entry to perform that extension).
 	 */
 	mutex_enter(&tq->tq_lock);
-	if ((tqe1 = taskq_ent_alloc(tq, TQ_NOSLEEP)) != NULL) {
-		TQ_ENQUEUE(tq, tqe1, taskq_bucket_extend, bucket);
-	} else {
-		TQ_STAT(bucket, tqs_nomem);
+	if (!taskq_ent_exists(tq, taskq_bucket_extend, bucket)) {
+		if ((tqe1 = taskq_ent_alloc(tq, TQ_NOSLEEP)) != NULL) {
+			TQ_ENQUEUE_FRONT(tq, tqe1, taskq_bucket_extend, bucket);
+		} else {
+			TQ_STAT(bucket, tqs_nomem);
+		}
 	}
 
 	/*
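
The last hunk addresses 6928985: before queueing another taskq_bucket_extend() task on the backing queue, the dispatcher now checks whether one is already pending for that bucket, and when it is not, queues it at the front with TQ_ENQUEUE_FRONT so the extension runs promptly. Below is a sketch of that duplicate-suppression scan over a circular, sentinel-headed work list; the work_ent_t type and work_exists() name are illustrative, not the kernel's.

	typedef void (*work_func_t)(void *);

	/* Circular doubly linked list entry; the list head is a sentinel node. */
	typedef struct work_ent {
		struct work_ent	*we_next;
		struct work_ent	*we_prev;
		work_func_t	we_func;
		void		*we_arg;
	} work_ent_t;

	/*
	 * Return 1 if an entry that will call func(arg) is already queued.
	 * The caller is assumed to hold whatever lock protects the list,
	 * just as taskq_ent_exists() asserts that tq_lock is held.
	 */
	static int
	work_exists(work_ent_t *head, work_func_t func, void *arg)
	{
		work_ent_t *we;

		for (we = head->we_next; we != head; we = we->we_next) {
			if (we->we_func == func && we->we_arg == arg)
				return (1);
		}
		return (0);
	}

With this check in place, at most one pending extension per bucket can accumulate, so a burst of failed bucket dispatches no longer fills the backing queue with redundant taskq_bucket_extend() entries.
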
--- a/usr/src/uts/common/sys/taskq_impl.h	Thu Mar 04 10:02:10 2010 -0800
+++ b/usr/src/uts/common/sys/taskq_impl.h	Thu Mar 04 12:58:34 2010 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -111,6 +111,8 @@
 	int		tq_nalloc;
 	int		tq_minalloc;
 	int		tq_maxalloc;
+	kcondvar_t	tq_maxalloc_cv;
+	int		tq_maxalloc_wait;
 	taskq_ent_t	*tq_freelist;
 	taskq_ent_t	tq_task;
 	int		tq_maxsize;