usr/src/uts/common/fs/zfs/vdev.c
changeset 10922:e2081f502306
parent     10921:8aac17999e4d
child      10974:32d689ba6466
@@ -407,14 +407,11 @@
 	 */
 	nparity = -1ULL;
 	if (ops == &vdev_raidz_ops) {
 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
 		    &nparity) == 0) {
-			/*
-			 * Currently, we can only support 3 parity devices.
-			 */
-			if (nparity == 0 || nparity > 3)
+			if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
 				return (EINVAL);
 			/*
 			 * Previous versions could only support 1 or 2 parity
 			 * device.
 			 */
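
With the move to a named constant, the bound on nparity is VDEV_RAIDZ_MAXPARITY (3 at the time of this change, covering raidz1 through raidz3), so the comment restating the limit is dropped. A minimal standalone sketch of the check, using errno.h in place of the kernel's error codes and a plain loop in place of the nvlist lookup:

	#include <errno.h>
	#include <stdint.h>
	#include <stdio.h>

	#define	VDEV_RAIDZ_MAXPARITY	3	/* raidz1 through raidz3 */

	/* Reject a parity count of zero or anything beyond the maximum. */
	static int
	check_nparity(uint64_t nparity)
	{
		if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
			return (EINVAL);
		return (0);
	}

	int
	main(void)
	{
		for (uint64_t p = 0; p <= 4; p++)
			printf("nparity=%llu -> %d\n", (unsigned long long)p,
			    check_nparity(p));
		return (0);
	}
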
@@ -565,10 +562,11 @@
 	 * trying to ensure complicated semantics for all callers.
 	 */
 	vdev_close(vd);
 
 	ASSERT(!list_link_active(&vd->vdev_config_dirty_node));
+	ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
 
 	/*
 	 * Free all children.
 	 */
 	for (int c = 0; c < vd->vdev_children; c++)
@@ -814,13 +812,13 @@
 	    (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);
 
 	ASSERT(oldc <= newc);
 
 	if (vd->vdev_islog)
-		mc = spa->spa_log_class;
+		mc = spa_log_class(spa);
 	else
-		mc = spa->spa_normal_class;
+		mc = spa_normal_class(spa);
 
 	if (vd->vdev_mg == NULL)
 		vd->vdev_mg = metaslab_group_create(mc, vd);
 
 	mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
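
The direct reads of spa->spa_log_class and spa->spa_normal_class become calls to the spa_log_class() and spa_normal_class() accessors. A small model of what the accessors presumably reduce to, with a stand-in struct for the real spa_t:

	typedef struct metaslab_class metaslab_class_t;

	typedef struct spa {
		metaslab_class_t *spa_normal_class;	/* normal data class */
		metaslab_class_t *spa_log_class;	/* intent log class */
	} spa_t;

	/*
	 * Struct members and functions live in different C namespaces,
	 * so the accessor can share the member's name.
	 */
	metaslab_class_t *
	spa_normal_class(spa_t *spa)
	{
		return (spa->spa_normal_class);
	}

	metaslab_class_t *
	spa_log_class(spa_t *spa)
	{
		return (spa->spa_log_class);
	}

Routing through accessors keeps knowledge of spa_t's layout inside the spa code and gives one place to hook if class selection ever grows logic.
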
@@ -1571,11 +1569,11 @@
 
 	for (int c = 0; c < vd->vdev_children; c++)
 		vdev_dtl_reassess(vd->vdev_child[c], txg,
 		    scrub_txg, scrub_done);
 
-	if (vd == spa->spa_root_vdev || vd->vdev_ishole)
+	if (vd == spa->spa_root_vdev || vd->vdev_ishole || vd->vdev_aux)
 		return;
 
 	if (vd->vdev_ops->vdev_op_leaf) {
 		mutex_enter(&vd->vdev_dtl_lock);
 		if (scrub_txg != 0 &&
@@ -2169,15 +2167,14 @@
 		if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
 		    vdev_dtl_required(vd))
 			return (spa_vdev_state_exit(spa, NULL, EBUSY));
 
 		/*
-		 * If the top-level is a slog and it's had allocations
-		 * then proceed. We check that the vdev's metaslab
-		 * grop is not NULL since it's possible that we may
-		 * have just added this vdev and have not yet initialized
-		 * it's metaslabs.
+		 * If the top-level is a slog and it has had allocations
+		 * then proceed.  We check that the vdev's metaslab group
+		 * is not NULL since it's possible that we may have just
+		 * added this vdev but not yet initialized its metaslabs.
 		 */
 		if (tvd->vdev_islog && mg != NULL) {
 			/*
 			 * Prevent any future allocations.
 			 */
@@ -2494,18 +2491,21 @@
 		vs->vs_write_errors++;
 	mutex_exit(&vd->vdev_stat_lock);
 
 	if (type == ZIO_TYPE_WRITE && txg != 0 &&
 	    (!(flags & ZIO_FLAG_IO_REPAIR) ||
-	    (flags & ZIO_FLAG_SCRUB_THREAD))) {
+	    (flags & ZIO_FLAG_SCRUB_THREAD) ||
+	    spa->spa_claiming)) {
 		/*
-		 * This is either a normal write (not a repair), or it's a
-		 * repair induced by the scrub thread.  In the normal case,
-		 * we commit the DTL change in the same txg as the block
-		 * was born.  In the scrub-induced repair case, we know that
-		 * scrubs run in first-pass syncing context, so we commit
-		 * the DTL change in spa->spa_syncing_txg.
+		 * This is either a normal write (not a repair), or it's
+		 * a repair induced by the scrub thread, or it's a repair
+		 * made by zil_claim() during spa_load() in the first txg.
+		 * In the normal case, we commit the DTL change in the same
+		 * txg as the block was born.  In the scrub-induced repair
+		 * case, we know that scrubs run in first-pass syncing context,
+		 * so we commit the DTL change in spa_syncing_txg(spa).
+		 * In the zil_claim() case, we commit in spa_first_txg(spa).
 		 *
 		 * We currently do not make DTL entries for failed spontaneous
 		 * self-healing writes triggered by normal (non-scrubbing)
 		 * reads, because we have no transactional context in which to
 		 * do so -- and it's not clear that it'd be desirable anyway.
@@ -2514,13 +2514,16 @@
 			uint64_t commit_txg = txg;
 			if (flags & ZIO_FLAG_SCRUB_THREAD) {
 				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
 				ASSERT(spa_sync_pass(spa) == 1);
 				vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
-				commit_txg = spa->spa_syncing_txg;
+				commit_txg = spa_syncing_txg(spa);
+			} else if (spa->spa_claiming) {
+				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
+				commit_txg = spa_first_txg(spa);
 			}
-			ASSERT(commit_txg >= spa->spa_syncing_txg);
+			ASSERT(commit_txg >= spa_syncing_txg(spa));
 			if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
 				return;
 			for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
 				vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1);
 			vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg);
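
The comment above now describes three commit-txg cases. A standalone sketch of just that selection, with stand-in flag values and plain integers in place of the spa accessors (the real code also dirties the scrub DTL, which this omits):

	#include <stdint.h>
	#include <stdio.h>

	#define	ZIO_FLAG_IO_REPAIR	0x1	/* stand-in flag values */
	#define	ZIO_FLAG_SCRUB_THREAD	0x2

	/*
	 * Pick the txg in which to commit a DTL change: the block's birth
	 * txg for a normal write, the current syncing txg for a scrub
	 * repair, and the pool's first txg for a zil_claim() repair.
	 */
	static uint64_t
	dtl_commit_txg(uint64_t txg, int flags, int claiming,
	    uint64_t syncing_txg, uint64_t first_txg)
	{
		if (flags & ZIO_FLAG_SCRUB_THREAD)
			return (syncing_txg);
		if (claiming)
			return (first_txg);
		return (txg);
	}

	int
	main(void)
	{
		uint64_t syncing = 130, first = 4;

		/* Normal write: commit in the block's own txg. */
		printf("%llu\n", (unsigned long long)
		    dtl_commit_txg(120, 0, 0, syncing, first));
		/* Scrub-induced repair: commit in the syncing txg. */
		printf("%llu\n", (unsigned long long)dtl_commit_txg(120,
		    ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SCRUB_THREAD, 0,
		    syncing, first));
		/* zil_claim() repair: commit in the pool's first txg. */
		printf("%llu\n", (unsigned long long)
		    dtl_commit_txg(120, ZIO_FLAG_IO_REPAIR, 1,
		    syncing, first));
		return (0);
	}
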
@@ -2558,19 +2561,22 @@
 
 	mutex_exit(&vd->vdev_stat_lock);
 }
 
 /*
- * Update the in-core space usage stats for this vdev and the root vdev.
+ * Update the in-core space usage stats for this vdev, its metaslab class,
+ * and the root vdev.
  */
 void
-vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta,
-    int64_t defer_delta, boolean_t update_root)
+vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
+    int64_t space_delta)
 {
 	int64_t dspace_delta = space_delta;
 	spa_t *spa = vd->vdev_spa;
 	vdev_t *rvd = spa->spa_root_vdev;
+	metaslab_group_t *mg = vd->vdev_mg;
+	metaslab_class_t *mc = mg ? mg->mg_class : NULL;
 
 	ASSERT(vd == vd->vdev_top);
 
 	/*
 	 * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion
@@ -2582,33 +2588,29 @@
 	ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);
 	dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) *
 	    vd->vdev_deflate_ratio;
 
 	mutex_enter(&vd->vdev_stat_lock);
+	vd->vdev_stat.vs_alloc += alloc_delta;
 	vd->vdev_stat.vs_space += space_delta;
-	vd->vdev_stat.vs_alloc += alloc_delta;
 	vd->vdev_stat.vs_dspace += dspace_delta;
-	vd->vdev_stat.vs_defer += defer_delta;
 	mutex_exit(&vd->vdev_stat_lock);
 
-	if (update_root) {
+	if (mc == spa_normal_class(spa)) {
+		mutex_enter(&rvd->vdev_stat_lock);
+		rvd->vdev_stat.vs_alloc += alloc_delta;
+		rvd->vdev_stat.vs_space += space_delta;
+		rvd->vdev_stat.vs_dspace += dspace_delta;
+		mutex_exit(&rvd->vdev_stat_lock);
+	}
+
+	if (mc != NULL) {
 		ASSERT(rvd == vd->vdev_parent);
 		ASSERT(vd->vdev_ms_count != 0);
 
-		/*
-		 * Don't count non-normal (e.g. intent log) space as part of
-		 * the pool's capacity.
-		 */
-		if (vd->vdev_islog)
-			return;
-
-		mutex_enter(&rvd->vdev_stat_lock);
-		rvd->vdev_stat.vs_space += space_delta;
-		rvd->vdev_stat.vs_alloc += alloc_delta;
-		rvd->vdev_stat.vs_dspace += dspace_delta;
-		rvd->vdev_stat.vs_defer += defer_delta;
-		mutex_exit(&rvd->vdev_stat_lock);
+		metaslab_class_space_update(mc,
+		    alloc_delta, defer_delta, space_delta, dspace_delta);
 	}
 }
 
 /*
  * Mark a top-level vdev's config as dirty, placing it on the dirty list
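
Besides routing per-class accounting through metaslab_class_space_update() and updating the root vdev only for the normal class, the new prototype drops update_root and moves space_delta to the end. Since all three deltas are int64_t, a call site left in the old (space, alloc, defer) order would still compile silently; a stub sketch illustrating why every caller must be audited by hand:

	#include <stdint.h>
	#include <stdio.h>

	/* Stub of the new entry point that just echoes its arguments. */
	static void
	vdev_space_update(void *vd, int64_t alloc_delta, int64_t defer_delta,
	    int64_t space_delta)
	{
		(void) vd;
		printf("alloc %+lld defer %+lld space %+lld\n",
		    (long long)alloc_delta, (long long)defer_delta,
		    (long long)space_delta);
	}

	int
	main(void)
	{
		/*
		 * The old order was (vd, space_delta, alloc_delta,
		 * defer_delta, update_root).  All three deltas are int64_t,
		 * so a caller left in the old order still compiles; only
		 * the fifth argument's disappearance is caught by the
		 * compiler.
		 */
		vdev_space_update(NULL, 512, 0, 1024);
		return (0);
	}
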
@@ -2720,11 +2722,11 @@
 	 */
 	ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
 	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
 	    spa_config_held(spa, SCL_STATE, RW_READER)));
 
-	if (!list_link_active(&vd->vdev_state_dirty_node))
+	if (!list_link_active(&vd->vdev_state_dirty_node) && !vd->vdev_ishole)
 		list_insert_head(&spa->spa_state_dirty_list, vd);
 }
 
 void
 vdev_state_clean(vdev_t *vd)
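
A "hole" vdev is the inert placeholder left at a removed top-level slot, so it should never land on the per-pool state dirty list. A minimal model of the new guard, with booleans standing in for list_link_active() and vdev_ishole:

	#include <assert.h>
	#include <stdbool.h>

	/* Stand-in vdev with only the fields the guard consults. */
	typedef struct vdev {
		bool linked;	/* models list_link_active() on the node */
		bool ishole;	/* placeholder left by device removal */
	} vdev_t;

	/* Insert only vdevs that are not already listed and not holes. */
	static bool
	state_dirty_insert_ok(const vdev_t *vd)
	{
		return (!vd->linked && !vd->ishole);
	}

	int
	main(void)
	{
		vdev_t hole = { false, true };
		vdev_t disk = { false, false };

		assert(!state_dirty_insert_ok(&hole));
		assert(state_dirty_insert_ok(&disk));
		return (0);
	}
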