6900839 taskq_ent_alloc: TQ_SLEEP performance cliff when tq_nalloc > tq_maxalloc
6928985 taskq filled with taskq_bucket_extend calls will not help
6928184 Anago: On takeover, one node fails to rejoin cluster, appliance kit won't start on the other node
--- a/usr/src/lib/libzpool/common/taskq.c Thu Mar 04 10:02:10 2010 -0800
+++ b/usr/src/lib/libzpool/common/taskq.c Thu Mar 04 12:58:34 2010 -0700
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -49,6 +49,8 @@
int tq_nalloc;
int tq_minalloc;
int tq_maxalloc;
+ kcondvar_t tq_maxalloc_cv;
+ int tq_maxalloc_wait;
task_t *tq_freelist;
task_t tq_task;
};
@@ -57,26 +59,36 @@
task_alloc(taskq_t *tq, int tqflags)
{
task_t *t;
+ int rv;
- if ((t = tq->tq_freelist) != NULL && tq->tq_nalloc >= tq->tq_minalloc) {
+again: if ((t = tq->tq_freelist) != NULL && tq->tq_nalloc >= tq->tq_minalloc) {
tq->tq_freelist = t->task_next;
} else {
- mutex_exit(&tq->tq_lock);
if (tq->tq_nalloc >= tq->tq_maxalloc) {
- if (!(tqflags & KM_SLEEP)) {
- mutex_enter(&tq->tq_lock);
+ if (!(tqflags & KM_SLEEP))
return (NULL);
- }
+
/*
* We don't want to exceed tq_maxalloc, but we can't
* wait for other tasks to complete (and thus free up
* task structures) without risking deadlock with
* the caller. So, we just delay for one second
- * to throttle the allocation rate.
+ * to throttle the allocation rate. If tasks complete
+ * before the one-second timeout expires, task_free()
+ * will signal us and we will immediately retry the
+ * allocation.
*/
- delay(hz);
+ tq->tq_maxalloc_wait++;
+ rv = cv_timedwait(&tq->tq_maxalloc_cv,
+ &tq->tq_lock, ddi_get_lbolt() + hz);
+ tq->tq_maxalloc_wait--;
+ if (rv > 0)
+ goto again; /* signaled */
}
+ mutex_exit(&tq->tq_lock);
+
t = kmem_alloc(sizeof (task_t), tqflags);
+
mutex_enter(&tq->tq_lock);
if (t != NULL)
tq->tq_nalloc++;
@@ -96,6 +108,9 @@
kmem_free(t, sizeof (task_t));
mutex_enter(&tq->tq_lock);
}
+
+ if (tq->tq_maxalloc_wait)
+ cv_signal(&tq->tq_maxalloc_cv);
}
taskqid_t
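The heart of the fix, here and again in the kernel copy of taskq.c further below, is replacing the fixed delay(hz) with a condition variable that the free path signals. The following is a minimal userland sketch of the same pattern using POSIX threads; every name in it is illustrative, not from the patch:

/*
 * Sketch of the throttle: sleep at most one second, but wake up early
 * when a free is signaled, instead of always sleeping the full second
 * as delay(hz) did.
 */
#include <pthread.h>
#include <time.h>

struct throttle {
	pthread_mutex_t	lock;
	pthread_cond_t	cv;		/* plays the role of tq_maxalloc_cv */
	int		nalloc;
	int		maxalloc;
	int		nwaiters;	/* plays the role of tq_maxalloc_wait */
};

/* Called with t->lock held, like task_alloc(). */
static void
throttle_alloc(struct throttle *t)
{
	struct timespec deadline;
	int rv;

again:
	if (t->nalloc < t->maxalloc) {
		t->nalloc++;
		return;
	}
	clock_gettime(CLOCK_REALTIME, &deadline);
	deadline.tv_sec += 1;
	t->nwaiters++;
	rv = pthread_cond_timedwait(&t->cv, &t->lock, &deadline);
	t->nwaiters--;
	if (rv == 0)
		goto again;	/* signaled: recheck under the limit */
	t->nalloc++;		/* timed out: exceed the limit, as before */
}

/* Called with t->lock held, like task_free(). */
static void
throttle_free(struct throttle *t)
{
	t->nalloc--;
	if (t->nwaiters > 0)
		(void) pthread_cond_signal(&t->cv);
}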
@@ -196,6 +211,7 @@
mutex_init(&tq->tq_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&tq->tq_dispatch_cv, NULL, CV_DEFAULT, NULL);
cv_init(&tq->tq_wait_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&tq->tq_maxalloc_cv, NULL, CV_DEFAULT, NULL);
tq->tq_flags = flags | TASKQ_ACTIVE;
tq->tq_active = nthreads;
tq->tq_nthreads = nthreads;
@@ -252,6 +268,7 @@
mutex_destroy(&tq->tq_lock);
cv_destroy(&tq->tq_dispatch_cv);
cv_destroy(&tq->tq_wait_cv);
+ cv_destroy(&tq->tq_maxalloc_cv);
kmem_free(tq, sizeof (taskq_t));
}
--- a/usr/src/uts/common/os/damap.c Thu Mar 04 10:02:10 2010 -0800
+++ b/usr/src/uts/common/os/damap.c Thu Mar 04 12:58:34 2010 -0700
@@ -84,6 +84,8 @@
*/
#define DAM_SIZE_BUMP 64
+int damap_taskq_dispatch_retry_usec = 1000;
+
/*
* config/unconfig taskq data
*/
@@ -1052,7 +1054,7 @@
tqd->tqd_mapp = mapp;
tqd->tqd_id = i;
(void) taskq_dispatch(tqp, dam_tq_config,
- tqd, KM_SLEEP);
+ tqd, TQ_SLEEP);
}
}
}
@@ -1097,7 +1099,7 @@
tqd->tqd_mapp = mapp;
tqd->tqd_id = i;
(void) taskq_dispatch(tqp,
- dam_tq_unconfig, tqd, KM_SLEEP);
+ dam_tq_unconfig, tqd, TQ_SLEEP);
}
}
}
@@ -1330,12 +1332,21 @@
*/
if (spend) {
if (taskq_dispatch(system_taskq, dam_stabilize_map,
- mapp, TQ_NOSLEEP)) {
+ mapp, TQ_NOSLEEP | TQ_NOQUEUE)) {
mapp->dam_flags |= DAM_SPEND;
DTRACE_PROBE2(damap__map__addr__stable__start, char *,
mapp->dam_name, dam_t *, mapp);
} else {
tpend++;
+
+ /*
+ * Avoid waiting the entire stabilization
+ * time again if taskq_dispatch() fails.
+ */
+ tmo_delta = drv_usectohz(
+ damap_taskq_dispatch_retry_usec);
+ if (tmo_delta < next_tmov)
+ next_tmov = tmo_delta;
}
}
@@ -1371,11 +1382,15 @@
* else dispatch the task to configure the new stable set of
* addresses.
*/
- if ((mapp->dam_flags & DAM_SPEND) || (taskq_dispatch(system_taskq,
- dam_stabilize_map, mapp, TQ_NOSLEEP) == NULL)) {
+ if ((mapp->dam_flags & DAM_SPEND) ||
+ (taskq_dispatch(system_taskq, dam_stabilize_map, mapp,
+ TQ_NOSLEEP | TQ_NOQUEUE) == NULL)) {
DAM_INCR_STAT(mapp, dam_overrun);
mapp->dam_stable_overrun++;
- dam_sched_tmo(mapp, mapp->dam_stabletmo, dam_addrset_stable_cb);
+ dam_sched_tmo(mapp,
+ drv_usectohz(damap_taskq_dispatch_retry_usec),
+ dam_addrset_stable_cb);
+
DTRACE_PROBE2(damap__map__addrset__stable__overrun, char *,
mapp->dam_name, dam_t *, mapp);
mutex_exit(&mapp->dam_lock);
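Both damap.c hunks follow the same scheme: dispatch dam_stabilize_map() with TQ_NOQUEUE, so the dispatch fails immediately rather than piling another entry onto the backing queue, and on failure re-arm a short retry timeout instead of sitting out another full stabilization interval. A sketch of the clamp arithmetic, assuming (consistent with drv_usectohz(9F)) that the conversion rounds up and never returns zero ticks:

/*
 * Illustrative stand-ins for drv_usectohz() and the timer clamp.
 * With the traditional hz = 100, one tick is 10000 usec, so the
 * 1000 usec default still yields one full tick (10 ms).
 */
#define HZ	100			/* assumed tick rate */

static long
usec_to_ticks(long usec)		/* stand-in for drv_usectohz() */
{
	long ticks = usec * HZ / 1000000;
	return (ticks > 0 ? ticks : 1);	/* assumed minimum of one tick */
}

/* Make the pending timer fire no later than the retry interval. */
static long
clamp_next_timeout(long next_tmov, long retry_usec)
{
	long tmo_delta = usec_to_ticks(retry_usec);
	return (tmo_delta < next_tmov ? tmo_delta : next_tmov);
}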
--- a/usr/src/uts/common/os/taskq.c Thu Mar 04 10:02:10 2010 -0800
+++ b/usr/src/uts/common/os/taskq.c Thu Mar 04 12:58:34 2010 -0700
@@ -564,6 +564,7 @@
static void taskq_ent_destructor(void *, void *);
static taskq_ent_t *taskq_ent_alloc(taskq_t *, int);
static void taskq_ent_free(taskq_t *, taskq_ent_t *);
+static int taskq_ent_exists(taskq_t *, task_func_t, void *);
static taskq_ent_t *taskq_bucket_dispatch(taskq_bucket_t *, task_func_t,
void *);
@@ -763,6 +764,7 @@
cv_init(&tq->tq_dispatch_cv, NULL, CV_DEFAULT, NULL);
cv_init(&tq->tq_exit_cv, NULL, CV_DEFAULT, NULL);
cv_init(&tq->tq_wait_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&tq->tq_maxalloc_cv, NULL, CV_DEFAULT, NULL);
tq->tq_task.tqent_next = &tq->tq_task;
tq->tq_task.tqent_prev = &tq->tq_task;
@@ -786,6 +788,7 @@
cv_destroy(&tq->tq_dispatch_cv);
cv_destroy(&tq->tq_exit_cv);
cv_destroy(&tq->tq_wait_cv);
+ cv_destroy(&tq->tq_maxalloc_cv);
}
/*ARGSUSED*/
@@ -952,8 +955,9 @@
taskq_ent_alloc(taskq_t *tq, int flags)
{
int kmflags = (flags & TQ_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
-
taskq_ent_t *tqe;
+ clock_t wait_time;
+ clock_t wait_rv;
ASSERT(MUTEX_HELD(&tq->tq_lock));
@@ -961,29 +965,44 @@
* TQ_NOALLOC allocations are allowed to use the freelist, even if
* we are below tq_minalloc.
*/
- if ((tqe = tq->tq_freelist) != NULL &&
+again: if ((tqe = tq->tq_freelist) != NULL &&
((flags & TQ_NOALLOC) || tq->tq_nalloc >= tq->tq_minalloc)) {
tq->tq_freelist = tqe->tqent_next;
} else {
if (flags & TQ_NOALLOC)
return (NULL);
- mutex_exit(&tq->tq_lock);
if (tq->tq_nalloc >= tq->tq_maxalloc) {
- if (kmflags & KM_NOSLEEP) {
- mutex_enter(&tq->tq_lock);
+ if (kmflags & KM_NOSLEEP)
return (NULL);
- }
+
/*
* We don't want to exceed tq_maxalloc, but we can't
* wait for other tasks to complete (and thus free up
* task structures) without risking deadlock with
* the caller. So, we just delay for one second
- * to throttle the allocation rate.
+ * to throttle the allocation rate. If tasks complete
+ * before the one-second timeout expires,
+ * taskq_ent_free() will signal us and we will
+ * immediately retry the allocation (reap the freelist).
*/
- delay(hz);
+ wait_time = ddi_get_lbolt() + hz;
+ while (tq->tq_freelist == NULL) {
+ tq->tq_maxalloc_wait++;
+ wait_rv = cv_timedwait(&tq->tq_maxalloc_cv,
+ &tq->tq_lock, wait_time);
+ tq->tq_maxalloc_wait--;
+ if (wait_rv == -1)
+ break;
+ }
+ if (tq->tq_freelist)
+ goto again; /* reap freelist */
}
+ mutex_exit(&tq->tq_lock);
+
tqe = kmem_cache_alloc(taskq_ent_cache, kmflags);
+
mutex_enter(&tq->tq_lock);
if (tqe != NULL)
tq->tq_nalloc++;
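Note how the kernel version differs from the libzpool one: wait_time is computed once, before the loop, and cv_timedwait() takes an absolute lbolt deadline, so wakeups that find the freelist still empty cannot stretch the total wait past one second. The same bounded loop as a userland sketch (illustrative names; pthread_cond_timedwait() likewise takes an absolute deadline):

#include <errno.h>
#include <pthread.h>
#include <time.h>

struct tq_sketch {
	pthread_mutex_t	lock;
	pthread_cond_t	cv;		/* tq_maxalloc_cv */
	void		*freelist;	/* tq_freelist */
	int		nwaiters;	/* tq_maxalloc_wait */
};

/* Called, and returns, with tq->lock held. */
static void
wait_for_freelist(struct tq_sketch *tq)
{
	struct timespec deadline;
	int rv = 0;

	/* One absolute deadline bounds the whole loop, like wait_time. */
	clock_gettime(CLOCK_REALTIME, &deadline);
	deadline.tv_sec += 1;

	while (tq->freelist == NULL && rv != ETIMEDOUT) {
		tq->nwaiters++;
		rv = pthread_cond_timedwait(&tq->cv, &tq->lock, &deadline);
		tq->nwaiters--;
	}
}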
@@ -1013,6 +1032,30 @@
kmem_cache_free(taskq_ent_cache, tqe);
mutex_enter(&tq->tq_lock);
}
+
+ if (tq->tq_maxalloc_wait)
+ cv_signal(&tq->tq_maxalloc_cv);
+}
+
+/*
+ * taskq_ent_exists()
+ *
+ * Return 1 if the taskq already has an entry for calling 'func(arg)'.
+ *
+ * Assumes: tq->tq_lock is held.
+ */
+static int
+taskq_ent_exists(taskq_t *tq, task_func_t func, void *arg)
+{
+ taskq_ent_t *tqe;
+
+ ASSERT(MUTEX_HELD(&tq->tq_lock));
+
+ for (tqe = tq->tq_task.tqent_next; tqe != &tq->tq_task;
+ tqe = tqe->tqent_next)
+ if ((tqe->tqent_func == func) && (tqe->tqent_arg == arg))
+ return (1);
+ return (0);
}
/*
@@ -1199,15 +1242,19 @@
}
ASSERT(bucket != NULL);
+
/*
- * Since there are not enough free entries in the bucket, extend it
- * in the background using backing queue.
+ * Since there are not enough free entries in the bucket, add a
+ * taskq entry to extend it in the background using the backing
+ * queue (unless we already have a taskq entry to perform that
+ * extension).
*/
mutex_enter(&tq->tq_lock);
- if ((tqe1 = taskq_ent_alloc(tq, TQ_NOSLEEP)) != NULL) {
- TQ_ENQUEUE(tq, tqe1, taskq_bucket_extend, bucket);
- } else {
- TQ_STAT(bucket, tqs_nomem);
+ if (!taskq_ent_exists(tq, taskq_bucket_extend, bucket)) {
+ if ((tqe1 = taskq_ent_alloc(tq, TQ_NOSLEEP)) != NULL) {
+ TQ_ENQUEUE_FRONT(tq, tqe1, taskq_bucket_extend, bucket);
+ } else {
+ TQ_STAT(bucket, tqs_nomem);
+ }
}
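The same slow path, condensed, with the rationale spelled out in comments; the rationale wording is inferred from the bug synopses above, not taken from the patch:

	mutex_enter(&tq->tq_lock);
	/*
	 * Bug 6928985: without the exists-check, every dispatch that
	 * found the bucket empty queued yet another taskq_bucket_extend(),
	 * filling the backing queue with calls that will not help.
	 */
	if (!taskq_ent_exists(tq, taskq_bucket_extend, bucket)) {
		if ((tqe1 = taskq_ent_alloc(tq, TQ_NOSLEEP)) != NULL) {
			/*
			 * Enqueue at the front so the extension runs
			 * ahead of the backlog it is meant to relieve.
			 */
			TQ_ENQUEUE_FRONT(tq, tqe1, taskq_bucket_extend,
			    bucket);
		} else {
			TQ_STAT(bucket, tqs_nomem);
		}
	}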
/*
--- a/usr/src/uts/common/sys/taskq_impl.h Thu Mar 04 10:02:10 2010 -0800
+++ b/usr/src/uts/common/sys/taskq_impl.h Thu Mar 04 12:58:34 2010 -0700
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -111,6 +111,8 @@
int tq_nalloc;
int tq_minalloc;
int tq_maxalloc;
+ kcondvar_t tq_maxalloc_cv;
+ int tq_maxalloc_wait;
taskq_ent_t *tq_freelist;
taskq_ent_t tq_task;
int tq_maxsize;