6538954 kmem_cache_alloc() doesn't scale for anon structure allocations when cache magazines are empty
author David Valin <David.Valin@Sun.COM>
Thu, 08 Apr 2010 07:11:19 -0700
changeset 12093 fa0c0f5bf466
parent 12092 d8c71dc8ec0d
child 12094 2a511308c79f
6538954 kmem_cache_alloc() doesn't scale for anon structure allocations when cache magazines are empty
usr/src/uts/common/os/kmem.c
usr/src/uts/common/sys/kmem.h
usr/src/uts/common/sys/kmem_impl.h
usr/src/uts/common/vm/vm_anon.c
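
With this change a kmem cache can opt in to slab prefill: when a new slab is created and the per-CPU magazine layer is active, the slab's buffers are allocated up front and pushed into the calling CPU's magazines, so subsequent kmem_cache_alloc() calls hit the magazine layer instead of contending on cache_lock. A client requests this with the new KMC_PREFILL cflag at cache creation time, as the anon_cache change below does; the flag is quietly cleared for caches with constructors, KMF_HASH or KMF_BUFTAG debugging, or metadata (kmem_msb_arena) caches. A minimal sketch of the opt-in, where my_cache, struct my_obj, and MY_ALIGN are hypothetical stand-ins:

	my_cache = kmem_cache_create("my_cache", sizeof (struct my_obj),
	    MY_ALIGN, NULL, NULL, NULL, NULL, NULL, KMC_PREFILL);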
--- a/usr/src/uts/common/os/kmem.c	Thu Apr 08 03:07:32 2010 -0700
+++ b/usr/src/uts/common/os/kmem.c	Thu Apr 08 07:11:19 2010 -0700
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*
@@ -1133,6 +1132,7 @@
 
 static void kmem_cache_scan(kmem_cache_t *);
 static void kmem_cache_defrag(kmem_cache_t *);
+static void kmem_slab_prefill(kmem_cache_t *, kmem_slab_t *);
 
 
 kmem_log_header_t	*kmem_transaction_log;
@@ -1654,18 +1654,19 @@
 }
 
 static void *
-kmem_slab_alloc_impl(kmem_cache_t *cp, kmem_slab_t *sp)
+kmem_slab_alloc_impl(kmem_cache_t *cp, kmem_slab_t *sp, boolean_t prefill)
 {
 	kmem_bufctl_t *bcp, **hash_bucket;
 	void *buf;
+	boolean_t new_slab = (sp->slab_refcnt == 0);
 
 	ASSERT(MUTEX_HELD(&cp->cache_lock));
 	/*
 	 * kmem_slab_alloc() drops cache_lock when it creates a new slab, so we
 	 * can't ASSERT(avl_is_empty(&cp->cache_partial_slabs)) here when the
-	 * slab is newly created (sp->slab_refcnt == 0).
+	 * slab is newly created.
 	 */
-	ASSERT((sp->slab_refcnt == 0) || (KMEM_SLAB_IS_PARTIAL(sp) &&
+	ASSERT(new_slab || (KMEM_SLAB_IS_PARTIAL(sp) &&
 	    (sp == avl_first(&cp->cache_partial_slabs))));
 	ASSERT(sp->slab_cache == cp);
 
@@ -1674,31 +1675,7 @@
 	sp->slab_refcnt++;
 
 	bcp = sp->slab_head;
-	if ((sp->slab_head = bcp->bc_next) == NULL) {
-		ASSERT(KMEM_SLAB_IS_ALL_USED(sp));
-		if (sp->slab_refcnt == 1) {
-			ASSERT(sp->slab_chunks == 1);
-		} else {
-			ASSERT(sp->slab_chunks > 1); /* the slab was partial */
-			avl_remove(&cp->cache_partial_slabs, sp);
-			sp->slab_later_count = 0; /* clear history */
-			sp->slab_flags &= ~KMEM_SLAB_NOMOVE;
-			sp->slab_stuck_offset = (uint32_t)-1;
-		}
-		list_insert_head(&cp->cache_complete_slabs, sp);
-		cp->cache_complete_slab_count++;
-	} else {
-		ASSERT(KMEM_SLAB_IS_PARTIAL(sp));
-		if (sp->slab_refcnt == 1) {
-			avl_add(&cp->cache_partial_slabs, sp);
-		} else {
-			/*
-			 * The slab is now more allocated than it was, so the
-			 * order remains unchanged.
-			 */
-			ASSERT(!avl_update(&cp->cache_partial_slabs, sp));
-		}
-	}
+	sp->slab_head = bcp->bc_next;
 
 	if (cp->cache_flags & KMF_HASH) {
 		/*
@@ -1716,6 +1693,45 @@
 	}
 
 	ASSERT(KMEM_SLAB_MEMBER(sp, buf));
+
+	if (sp->slab_head == NULL) {
+		ASSERT(KMEM_SLAB_IS_ALL_USED(sp));
+		if (new_slab) {
+			ASSERT(sp->slab_chunks == 1);
+		} else {
+			ASSERT(sp->slab_chunks > 1); /* the slab was partial */
+			avl_remove(&cp->cache_partial_slabs, sp);
+			sp->slab_later_count = 0; /* clear history */
+			sp->slab_flags &= ~KMEM_SLAB_NOMOVE;
+			sp->slab_stuck_offset = (uint32_t)-1;
+		}
+		list_insert_head(&cp->cache_complete_slabs, sp);
+		cp->cache_complete_slab_count++;
+		return (buf);
+	}
+
+	ASSERT(KMEM_SLAB_IS_PARTIAL(sp));
+	/*
+	 * Peek to see if the magazine layer is enabled before
+	 * we prefill.  We're not holding the cpu cache lock,
+	 * so the peek could be wrong, but there's no harm in it.
+	 */
+	if (new_slab && prefill && (cp->cache_flags & KMF_PREFILL) &&
+	    (KMEM_CPU_CACHE(cp)->cc_magsize != 0))  {
+		kmem_slab_prefill(cp, sp);
+		return (buf);
+	}
+
+	if (new_slab) {
+		avl_add(&cp->cache_partial_slabs, sp);
+		return (buf);
+	}
+
+	/*
+	 * The slab is now more allocated than it was, so the
+	 * order remains unchanged.
+	 */
+	ASSERT(!avl_update(&cp->cache_partial_slabs, sp));
 	return (buf);
 }
 
@@ -1749,7 +1765,7 @@
 		cp->cache_bufslab += sp->slab_chunks;
 	}
 
-	buf = kmem_slab_alloc_impl(cp, sp);
+	buf = kmem_slab_alloc_impl(cp, sp, B_TRUE);
 	ASSERT((cp->cache_slab_create - cp->cache_slab_destroy) ==
 	    (cp->cache_complete_slab_count +
 	    avl_numnodes(&cp->cache_partial_slabs) +
@@ -2627,14 +2643,79 @@
 }
 
 /*
+ * Used when there's no room to free a buffer to the per-CPU cache.
+ * Drops and re-acquires &ccp->cc_lock, and returns non-zero if the
+ * caller should try freeing to the per-CPU cache again.
+ * Note that we don't directly install the magazine in the cpu cache,
+ * since its state may have changed wildly while the lock was dropped.
+ */
+static int
+kmem_cpucache_magazine_alloc(kmem_cpu_cache_t *ccp, kmem_cache_t *cp)
+{
+	kmem_magazine_t *emp;
+	kmem_magtype_t *mtp;
+
+	ASSERT(MUTEX_HELD(&ccp->cc_lock));
+	ASSERT(((uint_t)ccp->cc_rounds == ccp->cc_magsize ||
+	    ((uint_t)ccp->cc_rounds == -1)) &&
+	    ((uint_t)ccp->cc_prounds == ccp->cc_magsize ||
+	    ((uint_t)ccp->cc_prounds == -1)));
+
+	emp = kmem_depot_alloc(cp, &cp->cache_empty);
+	if (emp != NULL) {
+		if (ccp->cc_ploaded != NULL)
+			kmem_depot_free(cp, &cp->cache_full,
+			    ccp->cc_ploaded);
+		kmem_cpu_reload(ccp, emp, 0);
+		return (1);
+	}
+	/*
+	 * There are no empty magazines in the depot,
+	 * so try to allocate a new one.  We must drop all locks
+	 * across kmem_cache_alloc() because lower layers may
+	 * attempt to allocate from this cache.
+	 */
+	mtp = cp->cache_magtype;
+	mutex_exit(&ccp->cc_lock);
+	emp = kmem_cache_alloc(mtp->mt_cache, KM_NOSLEEP);
+	mutex_enter(&ccp->cc_lock);
+
+	if (emp != NULL) {
+		/*
+		 * We successfully allocated an empty magazine.
+		 * However, we had to drop ccp->cc_lock to do it,
+		 * so the cache's magazine size may have changed.
+		 * If so, free the magazine and try again.
+		 */
+		if (ccp->cc_magsize != mtp->mt_magsize) {
+			mutex_exit(&ccp->cc_lock);
+			kmem_cache_free(mtp->mt_cache, emp);
+			mutex_enter(&ccp->cc_lock);
+			return (1);
+		}
+
+		/*
+		 * We got a magazine of the right size.  Add it to
+		 * the depot and try the whole dance again.
+		 */
+		kmem_depot_free(cp, &cp->cache_empty, emp);
+		return (1);
+	}
+
+	/*
+	 * We couldn't allocate an empty magazine,
+	 * so fall through to the slab layer.
+	 */
+	return (0);
+}
+
+/*
  * Free a constructed object to cache cp.
  */
 void
 kmem_cache_free(kmem_cache_t *cp, void *buf)
 {
 	kmem_cpu_cache_t *ccp = KMEM_CPU_CACHE(cp);
-	kmem_magazine_t *emp;
-	kmem_magtype_t *mtp;
 
 	/*
 	 * The client must not free either of the buffers passed to the move
@@ -2660,6 +2741,9 @@
 	}
 
 	mutex_enter(&ccp->cc_lock);
+	/*
+	 * Any changes to this logic should be reflected in kmem_slab_prefill()
+	 */
 	for (;;) {
 		/*
 		 * If there's a slot available in the current CPU's
@@ -2687,64 +2771,110 @@
 		if (ccp->cc_magsize == 0)
 			break;
 
+		if (!kmem_cpucache_magazine_alloc(ccp, cp)) {
+			/*
+			 * We couldn't free our constructed object to the
+			 * magazine layer, so apply its destructor and free it
+			 * to the slab layer.
+			 */
+			break;
+		}
+	}
+	mutex_exit(&ccp->cc_lock);
+	kmem_slab_free_constructed(cp, buf, B_TRUE);
+}
+
+static void
+kmem_slab_prefill(kmem_cache_t *cp, kmem_slab_t *sp)
+{
+	kmem_cpu_cache_t *ccp = KMEM_CPU_CACHE(cp);
+	int cache_flags = cp->cache_flags;
+
+	kmem_bufctl_t *next, *head;
+	size_t nbufs;
+
+	/*
+	 * Completely allocate the newly created slab and put the pre-allocated
+	 * buffers in magazines. Any of the buffers that cannot be put in
+	 * magazines must be returned to the slab.
+	 */
+	ASSERT(MUTEX_HELD(&cp->cache_lock));
+	ASSERT((cache_flags & (KMF_PREFILL|KMF_BUFTAG)) == KMF_PREFILL);
+	ASSERT(cp->cache_constructor == NULL);
+	ASSERT(sp->slab_cache == cp);
+	ASSERT(sp->slab_refcnt == 1);
+	ASSERT(sp->slab_head != NULL && sp->slab_chunks > sp->slab_refcnt);
+	ASSERT(avl_find(&cp->cache_partial_slabs, sp, NULL) == NULL);
+
+	head = sp->slab_head;
+	nbufs = (sp->slab_chunks - sp->slab_refcnt);
+	sp->slab_head = NULL;
+	sp->slab_refcnt += nbufs;
+	cp->cache_bufslab -= nbufs;
+	cp->cache_slab_alloc += nbufs;
+	list_insert_head(&cp->cache_complete_slabs, sp);
+	cp->cache_complete_slab_count++;
+	mutex_exit(&cp->cache_lock);
+	mutex_enter(&ccp->cc_lock);
+
+	while (head != NULL) {
+		void *buf = KMEM_BUF(cp, head);
 		/*
-		 * Try to get an empty magazine from the depot.
+		 * If there's a slot available in the current CPU's
+		 * loaded magazine, just put the object there and
+		 * continue.
 		 */
-		emp = kmem_depot_alloc(cp, &cp->cache_empty);
-		if (emp != NULL) {
-			if (ccp->cc_ploaded != NULL)
-				kmem_depot_free(cp, &cp->cache_full,
-				    ccp->cc_ploaded);
-			kmem_cpu_reload(ccp, emp, 0);
+		if ((uint_t)ccp->cc_rounds < ccp->cc_magsize) {
+			ccp->cc_loaded->mag_round[ccp->cc_rounds++] =
+			    buf;
+			ccp->cc_free++;
+			nbufs--;
+			head = head->bc_next;
 			continue;
 		}
 
 		/*
-		 * There are no empty magazines in the depot,
-		 * so try to allocate a new one.  We must drop all locks
-		 * across kmem_cache_alloc() because lower layers may
-		 * attempt to allocate from this cache.
+		 * The loaded magazine is full.  If the previously
+		 * loaded magazine was empty, exchange them and try
+		 * again.
 		 */
-		mtp = cp->cache_magtype;
-		mutex_exit(&ccp->cc_lock);
-		emp = kmem_cache_alloc(mtp->mt_cache, KM_NOSLEEP);
-		mutex_enter(&ccp->cc_lock);
-
-		if (emp != NULL) {
-			/*
-			 * We successfully allocated an empty magazine.
-			 * However, we had to drop ccp->cc_lock to do it,
-			 * so the cache's magazine size may have changed.
-			 * If so, free the magazine and try again.
-			 */
-			if (ccp->cc_magsize != mtp->mt_magsize) {
-				mutex_exit(&ccp->cc_lock);
-				kmem_cache_free(mtp->mt_cache, emp);
-				mutex_enter(&ccp->cc_lock);
-				continue;
-			}
-
-			/*
-			 * We got a magazine of the right size.  Add it to
-			 * the depot and try the whole dance again.
-			 */
-			kmem_depot_free(cp, &cp->cache_empty, emp);
+		if (ccp->cc_prounds == 0) {
+			kmem_cpu_reload(ccp, ccp->cc_ploaded,
+			    ccp->cc_prounds);
 			continue;
 		}
 
 		/*
-		 * We couldn't allocate an empty magazine,
-		 * so fall through to the slab layer.
+		 * If the magazine layer is disabled, break out now.
 		 */
-		break;
+
+		if (ccp->cc_magsize == 0) {
+			break;
+		}
+
+		if (!kmem_cpucache_magazine_alloc(ccp, cp))
+			break;
 	}
 	mutex_exit(&ccp->cc_lock);
-
-	/*
-	 * We couldn't free our constructed object to the magazine layer,
-	 * so apply its destructor and free it to the slab layer.
-	 */
-	kmem_slab_free_constructed(cp, buf, B_TRUE);
+	if (nbufs != 0) {
+		ASSERT(head != NULL);
+
+		/*
+		 * If there was a failure, return remaining objects to
+		 * the slab
+		 */
+		while (head != NULL) {
+			ASSERT(nbufs != 0);
+			next = head->bc_next;
+			head->bc_next = NULL;
+			kmem_slab_free(cp, KMEM_BUF(cp, head));
+			head = next;
+			nbufs--;
+		}
+	}
+	ASSERT(head == NULL);
+	ASSERT(nbufs == 0);
+	mutex_enter(&cp->cache_lock);
 }
 
 void *
@@ -3667,6 +3797,9 @@
 	if (cflags & KMC_NOTOUCH)
 		cp->cache_flags &= ~KMF_TOUCH;
 
+	if (cflags & KMC_PREFILL)
+		cp->cache_flags |= KMF_PREFILL;
+
 	if (cflags & KMC_NOHASH)
 		cp->cache_flags &= ~(KMF_AUDIT | KMF_FIREWALL);
 
@@ -3779,6 +3912,17 @@
 	cp->cache_maxchunks = (cp->cache_slabsize / cp->cache_chunksize);
 	cp->cache_partial_binshift = highbit(cp->cache_maxchunks / 16) + 1;
 
+	/*
+	 * Disallowing prefill when either the DEBUG or HASH flag is set or when
+	 * there is a constructor avoids some tricky issues with debug setup
+	 * that may be revisited later. We cannot allow prefill in a
+	 * metadata cache because of potential recursion.
+	 */
+	if (vmp == kmem_msb_arena ||
+	    cp->cache_flags & (KMF_HASH | KMF_BUFTAG) ||
+	    cp->cache_constructor != NULL)
+		cp->cache_flags &= ~KMF_PREFILL;
+
 	if (cp->cache_flags & KMF_HASH) {
 		ASSERT(!(cflags & KMC_NOHASH));
 		cp->cache_bufctl_cache = (cp->cache_flags & KMF_AUDIT) ?
@@ -4873,7 +5017,8 @@
 		return (B_TRUE);
 	}
 
-	to_buf = kmem_slab_alloc_impl(cp, avl_first(&cp->cache_partial_slabs));
+	to_buf = kmem_slab_alloc_impl(cp, avl_first(&cp->cache_partial_slabs),
+	    B_FALSE);
 	callback->kmm_to_buf = to_buf;
 	avl_insert(&cp->cache_defrag->kmd_moves_pending, callback, index);
 
--- a/usr/src/uts/common/sys/kmem.h	Thu Apr 08 03:07:32 2010 -0700
+++ b/usr/src/uts/common/sys/kmem.h	Thu Apr 08 07:11:19 2010 -0700
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -78,6 +77,7 @@
 #define	KMC_QCACHE	0x00100000
 #define	KMC_KMEM_ALLOC	0x00200000	/* internal use only */
 #define	KMC_IDENTIFIER	0x00400000	/* internal use only */
+#define	KMC_PREFILL	0x00800000
 
 struct kmem_cache;		/* cache structure is opaque to kmem clients */
 
--- a/usr/src/uts/common/sys/kmem_impl.h	Thu Apr 08 03:07:32 2010 -0700
+++ b/usr/src/uts/common/sys/kmem_impl.h	Thu Apr 08 07:11:19 2010 -0700
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef _SYS_KMEM_IMPL_H
@@ -69,6 +68,7 @@
 
 #define	KMF_DUMPDIVERT	0x00001000	/* use alternate memory at dump time */
 #define	KMF_DUMPUNSAFE	0x00002000	/* flag caches used at dump time */
+#define	KMF_PREFILL	0x00004000	/* Prefill the slab when created. */
 
 #define	KMF_BUFTAG	(KMF_DEADBEEF | KMF_REDZONE)
 #define	KMF_TOUCH	(KMF_BUFTAG | KMF_LITE | KMF_CONTENTS)
--- a/usr/src/uts/common/vm/vm_anon.c	Thu Apr 08 03:07:32 2010 -0700
+++ b/usr/src/uts/common/vm/vm_anon.c	Thu Apr 08 07:11:19 2010 -0700
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
@@ -204,7 +203,7 @@
 	anon_hash = (struct anon **)
 	    kmem_zalloc(sizeof (struct anon *) * anon_hash_size, KM_SLEEP);
 	anon_cache = kmem_cache_create("anon_cache", sizeof (struct anon),
-	    AN_CACHE_ALIGN, NULL, NULL, NULL, NULL, NULL, 0);
+	    AN_CACHE_ALIGN, NULL, NULL, NULL, NULL, NULL, KMC_PREFILL);
 	anonmap_cache = kmem_cache_create("anonmap_cache",
 	    sizeof (struct anon_map), 0,
 	    anonmap_cache_constructor, anonmap_cache_destructor, NULL,
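
For reference, a condensed sketch of the prefill path added above. This is not the committed code: locking against cache_lock, the bookkeeping of cache_bufslab/cache_slab_alloc, and the depot refill via kmem_cpucache_magazine_alloc() are elided; only the freelist walk into the loaded magazine and the leftover return path are shown.

	static void
	kmem_slab_prefill_sketch(kmem_cache_t *cp, kmem_slab_t *sp)
	{
		kmem_cpu_cache_t *ccp = KMEM_CPU_CACHE(cp);
		kmem_bufctl_t *head = sp->slab_head;

		/* Claim every remaining buffer on the freshly created slab. */
		sp->slab_refcnt = sp->slab_chunks;
		sp->slab_head = NULL;

		/* Push buffers into the loaded magazine while there is room. */
		mutex_enter(&ccp->cc_lock);
		while (head != NULL && (uint_t)ccp->cc_rounds < ccp->cc_magsize) {
			ccp->cc_loaded->mag_round[ccp->cc_rounds++] =
			    KMEM_BUF(cp, head);
			ccp->cc_free++;
			head = head->bc_next;
		}
		mutex_exit(&ccp->cc_lock);

		/* Any leftovers (magazines full) go back to the slab layer. */
		while (head != NULL) {
			kmem_bufctl_t *next = head->bc_next;
			head->bc_next = NULL;
			kmem_slab_free(cp, KMEM_BUF(cp, head));
			head = next;
		}
	}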