usr/src/uts/common/fs/zfs/spa.c
changeset 11173 87f3734e64df
parent 11149 8bad7424e2c2
child 11422 42768837421d
--- a/usr/src/uts/common/fs/zfs/spa.c	Mon Nov 23 16:18:43 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/spa.c	Mon Nov 23 15:29:44 2009 -0800
@@ -62,24 +62,28 @@
 #include <sys/zfs_ioctl.h>
 
 #ifdef	_KERNEL
+#include <sys/bootprops.h>
+#include <sys/callb.h>
+#include <sys/cpupart.h>
+#include <sys/pool.h>
+#include <sys/sysdc.h>
 #include <sys/zone.h>
-#include <sys/bootprops.h>
 #endif	/* _KERNEL */
 
 #include "zfs_prop.h"
 #include "zfs_comutil.h"
 
-enum zti_modes {
+typedef enum zti_modes {
 	zti_mode_fixed,			/* value is # of threads (min 1) */
 	zti_mode_online_percent,	/* value is % of online CPUs */
-	zti_mode_tune,			/* fill from zio_taskq_tune_* */
+	zti_mode_batch,			/* cpu-intensive; value is ignored */
 	zti_mode_null,			/* don't create a taskq */
 	zti_nmodes
-};
+} zti_modes_t;
 
 #define	ZTI_FIX(n)	{ zti_mode_fixed, (n) }
 #define	ZTI_PCT(n)	{ zti_mode_online_percent, (n) }
-#define	ZTI_TUNE	{ zti_mode_tune, 0 }
+#define	ZTI_BATCH	{ zti_mode_batch, 0 }
 #define	ZTI_NULL	{ zti_mode_null, 0 }
 
 #define	ZTI_ONE		ZTI_FIX(1)
@@ -90,7 +94,7 @@
 } zio_taskq_info_t;
 
 static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
-		"issue", "issue_high", "intr", "intr_high"
+	"issue", "issue_high", "intr", "intr_high"
 };
 
 /*
@@ -100,19 +104,29 @@
 const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
 	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
 	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
-	{ ZTI_FIX(8),	ZTI_NULL,	ZTI_TUNE,	ZTI_NULL },
-	{ ZTI_TUNE,	ZTI_FIX(5),	ZTI_FIX(8),	ZTI_FIX(5) },
+	{ ZTI_FIX(8),	ZTI_NULL,	ZTI_BATCH,	ZTI_NULL },
+	{ ZTI_BATCH,	ZTI_FIX(5),	ZTI_FIX(8),	ZTI_FIX(5) },
 	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
 	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
 	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
 };
 
-enum zti_modes zio_taskq_tune_mode = zti_mode_online_percent;
-uint_t zio_taskq_tune_value = 80;	/* #threads = 80% of # online CPUs */
-
 static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx);
 static boolean_t spa_has_active_shared_spare(spa_t *spa);
 
+uint_t		zio_taskq_batch_pct = 100;	/* 1 thread per cpu in pset */
+id_t		zio_taskq_psrset_bind = PS_NONE;
+boolean_t	zio_taskq_sysdc = B_TRUE;	/* use SDC scheduling class */
+uint_t		zio_taskq_basedc = 80;		/* base duty cycle */
+
+boolean_t	spa_create_process = B_TRUE;	/* no process ==> no sysdc */
+
+/*
+ * This (illegal) pool name is used when temporarily importing a spa_t in order
+ * to get the vdev stats associated with the imported devices.
+ */
+#define	TRYIMPORT_NAME	"$import"
+
 /*
  * ==========================================================================
  * SPA properties routines
@@ -584,6 +598,139 @@
 	    offsetof(spa_error_entry_t, se_avl));
 }
 
+static taskq_t *
+spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode,
+    uint_t value)
+{
+	uint_t flags = TASKQ_PREPOPULATE;
+	boolean_t batch = B_FALSE;
+
+	switch (mode) {
+	case zti_mode_null:
+		return (NULL);		/* no taskq needed */
+
+	case zti_mode_fixed:
+		ASSERT3U(value, >=, 1);
+		value = MAX(value, 1);
+		break;
+
+	case zti_mode_batch:
+		batch = B_TRUE;
+		flags |= TASKQ_THREADS_CPU_PCT;
+		value = zio_taskq_batch_pct;
+		break;
+
+	case zti_mode_online_percent:
+		flags |= TASKQ_THREADS_CPU_PCT;
+		break;
+
+	default:
+		panic("unrecognized mode for %s taskq (%u:%u) in "
+		    "spa_activate()",
+		    name, mode, value);
+		break;
+	}
+
+	if (zio_taskq_sysdc && spa->spa_proc != &p0) {
+		if (batch)
+			flags |= TASKQ_DC_BATCH;
+
+		return (taskq_create_sysdc(name, value, 50, INT_MAX,
+		    spa->spa_proc, zio_taskq_basedc, flags));
+	}
+	return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX,
+	    spa->spa_proc, flags));
+}
+
+static void
+spa_create_zio_taskqs(spa_t *spa)
+{
+	for (int t = 0; t < ZIO_TYPES; t++) {
+		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
+			const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
+			enum zti_modes mode = ztip->zti_mode;
+			uint_t value = ztip->zti_value;
+			char name[32];
+
+			(void) snprintf(name, sizeof (name),
+			    "%s_%s", zio_type_name[t], zio_taskq_types[q]);
+
+			spa->spa_zio_taskq[t][q] =
+			    spa_taskq_create(spa, name, mode, value);
+		}
+	}
+}
+
+#ifdef _KERNEL
+static void
+spa_thread(void *arg)
+{
+	callb_cpr_t cprinfo;
+
+	spa_t *spa = arg;
+	user_t *pu = PTOU(curproc);
+
+	CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
+	    spa->spa_name);
+
+	ASSERT(curproc != &p0);
+	(void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
+	    "zpool-%s", spa->spa_name);
+	(void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));
+
+	/* bind this thread to the requested psrset */
+	if (zio_taskq_psrset_bind != PS_NONE) {
+		pool_lock();
+		mutex_enter(&cpu_lock);
+		mutex_enter(&pidlock);
+		mutex_enter(&curproc->p_lock);
+
+		if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
+		    0, NULL, NULL) == 0)  {
+			curthread->t_bind_pset = zio_taskq_psrset_bind;
+		} else {
+			cmn_err(CE_WARN,
+			    "Couldn't bind process for zfs pool \"%s\" to "
+			    "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
+		}
+
+		mutex_exit(&curproc->p_lock);
+		mutex_exit(&pidlock);
+		mutex_exit(&cpu_lock);
+		pool_unlock();
+	}
+
+	if (zio_taskq_sysdc) {
+		sysdc_thread_enter(curthread, 100, 0);
+	}
+
+	spa->spa_proc = curproc;
+	spa->spa_did = curthread->t_did;
+
+	spa_create_zio_taskqs(spa);
+
+	mutex_enter(&spa->spa_proc_lock);
+	ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);
+
+	spa->spa_proc_state = SPA_PROC_ACTIVE;
+	cv_broadcast(&spa->spa_proc_cv);
+
+	CALLB_CPR_SAFE_BEGIN(&cprinfo);
+	while (spa->spa_proc_state == SPA_PROC_ACTIVE)
+		cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
+	CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);
+
+	ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
+	spa->spa_proc_state = SPA_PROC_GONE;
+	spa->spa_proc = &p0;
+	cv_broadcast(&spa->spa_proc_cv);
+	CALLB_CPR_EXIT(&cprinfo);	/* drops spa_proc_lock */
+
+	mutex_enter(&curproc->p_lock);
+	lwp_exit();
+}
+#endif
+
 /*
  * Activate an uninitialized pool.
  */
@@ -598,53 +745,38 @@
 	spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
 	spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);
 
-	for (int t = 0; t < ZIO_TYPES; t++) {
-		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
-			const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
-			enum zti_modes mode = ztip->zti_mode;
-			uint_t value = ztip->zti_value;
-			char name[32];
-
-			(void) snprintf(name, sizeof (name),
-			    "%s_%s", zio_type_name[t], zio_taskq_types[q]);
-
-			if (mode == zti_mode_tune) {
-				mode = zio_taskq_tune_mode;
-				value = zio_taskq_tune_value;
-				if (mode == zti_mode_tune)
-					mode = zti_mode_online_percent;
+	/* Try to create a covering process */
+	mutex_enter(&spa->spa_proc_lock);
+	ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
+	ASSERT(spa->spa_proc == &p0);
+	spa->spa_did = 0;
+
+	/* Only create a process if we're going to be around a while. */
+	if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
+		if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
+		    NULL, 0) == 0) {
+			spa->spa_proc_state = SPA_PROC_CREATED;
+			while (spa->spa_proc_state == SPA_PROC_CREATED) {
+				cv_wait(&spa->spa_proc_cv,
+				    &spa->spa_proc_lock);
 			}
-
-			switch (mode) {
-			case zti_mode_fixed:
-				ASSERT3U(value, >=, 1);
-				value = MAX(value, 1);
-
-				spa->spa_zio_taskq[t][q] = taskq_create(name,
-				    value, maxclsyspri, 50, INT_MAX,
-				    TASKQ_PREPOPULATE);
-				break;
-
-			case zti_mode_online_percent:
-				spa->spa_zio_taskq[t][q] = taskq_create(name,
-				    value, maxclsyspri, 50, INT_MAX,
-				    TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT);
-				break;
-
-			case zti_mode_null:
-				spa->spa_zio_taskq[t][q] = NULL;
-				break;
-
-			case zti_mode_tune:
-			default:
-				panic("unrecognized mode for "
-				    "zio_taskqs[%u]->zti_nthreads[%u] (%u:%u) "
-				    "in spa_activate()",
-				    t, q, mode, value);
-				break;
-			}
+			ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
+			ASSERT(spa->spa_proc != &p0);
+			ASSERT(spa->spa_did != 0);
+		} else {
+#ifdef _KERNEL
+			cmn_err(CE_WARN,
+			    "Couldn't create process for zfs pool \"%s\"\n",
+			    spa->spa_name);
+#endif
 		}
 	}
+	mutex_exit(&spa->spa_proc_lock);
+
+	/* If we didn't create a process, we need to create our taskqs. */
+	if (spa->spa_proc == &p0) {
+		spa_create_zio_taskqs(spa);
+	}
 
 	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
 	    offsetof(vdev_t, vdev_config_dirty_node));
@@ -703,6 +835,31 @@
 	avl_destroy(&spa->spa_errlist_last);
 
 	spa->spa_state = POOL_STATE_UNINITIALIZED;
+
+	mutex_enter(&spa->spa_proc_lock);
+	if (spa->spa_proc_state != SPA_PROC_NONE) {
+		ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
+		spa->spa_proc_state = SPA_PROC_DEACTIVATE;
+		cv_broadcast(&spa->spa_proc_cv);
+		while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
+			ASSERT(spa->spa_proc != &p0);
+			cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
+		}
+		ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
+		spa->spa_proc_state = SPA_PROC_NONE;
+	}
+	ASSERT(spa->spa_proc == &p0);
+	mutex_exit(&spa->spa_proc_lock);
+
+	/*
+	 * We want to make sure spa_thread() has actually exited the ZFS
+	 * module, so that the module can't be unloaded out from underneath
+	 * it.
+	 */
+	if (spa->spa_did != 0) {
+		thread_join(spa->spa_did);
+		spa->spa_did = 0;
+	}
 }
 
 /*
@@ -2999,13 +3156,6 @@
 	return (0);
 }
 
-
-/*
- * This (illegal) pool name is used when temporarily importing a spa_t in order
- * to get the vdev stats associated with the imported devices.
- */
-#define	TRYIMPORT_NAME	"$import"
-
 nvlist_t *
 spa_tryimport(nvlist_t *tryconfig)
 {