# HG changeset patch
# User David Valin
# Date 1270735879 25200
# Node ID fa0c0f5bf466f9a91bf8aef70634e89c1867d99a
# Parent  d8c71dc8ec0dd9c9b493d095684cd0411448c2be
6538954 kmem_cache_alloc() doesn't scale for anon structure allocations when cache magazines are empty

diff -r d8c71dc8ec0d -r fa0c0f5bf466 usr/src/uts/common/os/kmem.c
--- a/usr/src/uts/common/os/kmem.c	Thu Apr 08 03:07:32 2010 -0700
+++ b/usr/src/uts/common/os/kmem.c	Thu Apr 08 07:11:19 2010 -0700
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*
@@ -1133,6 +1132,7 @@
 static void kmem_cache_scan(kmem_cache_t *);
 
 static void kmem_cache_defrag(kmem_cache_t *);
+static void kmem_slab_prefill(kmem_cache_t *, kmem_slab_t *);
 
 kmem_log_header_t *kmem_transaction_log;
@@ -1654,18 +1654,19 @@
 }
 
 static void *
-kmem_slab_alloc_impl(kmem_cache_t *cp, kmem_slab_t *sp)
+kmem_slab_alloc_impl(kmem_cache_t *cp, kmem_slab_t *sp, boolean_t prefill)
 {
 	kmem_bufctl_t *bcp, **hash_bucket;
 	void *buf;
+	boolean_t new_slab = (sp->slab_refcnt == 0);
 
 	ASSERT(MUTEX_HELD(&cp->cache_lock));
 	/*
 	 * kmem_slab_alloc() drops cache_lock when it creates a new slab, so we
 	 * can't ASSERT(avl_is_empty(&cp->cache_partial_slabs)) here when the
-	 * slab is newly created (sp->slab_refcnt == 0).
+	 * slab is newly created.
 	 */
-	ASSERT((sp->slab_refcnt == 0) || (KMEM_SLAB_IS_PARTIAL(sp) &&
+	ASSERT(new_slab || (KMEM_SLAB_IS_PARTIAL(sp) &&
 	    (sp == avl_first(&cp->cache_partial_slabs))));
 	ASSERT(sp->slab_cache == cp);
@@ -1674,31 +1675,7 @@
 	sp->slab_refcnt++;
 
 	bcp = sp->slab_head;
-	if ((sp->slab_head = bcp->bc_next) == NULL) {
-		ASSERT(KMEM_SLAB_IS_ALL_USED(sp));
-		if (sp->slab_refcnt == 1) {
-			ASSERT(sp->slab_chunks == 1);
-		} else {
-			ASSERT(sp->slab_chunks > 1); /* the slab was partial */
-			avl_remove(&cp->cache_partial_slabs, sp);
-			sp->slab_later_count = 0; /* clear history */
-			sp->slab_flags &= ~KMEM_SLAB_NOMOVE;
-			sp->slab_stuck_offset = (uint32_t)-1;
-		}
-		list_insert_head(&cp->cache_complete_slabs, sp);
-		cp->cache_complete_slab_count++;
-	} else {
-		ASSERT(KMEM_SLAB_IS_PARTIAL(sp));
-		if (sp->slab_refcnt == 1) {
-			avl_add(&cp->cache_partial_slabs, sp);
-		} else {
-			/*
-			 * The slab is now more allocated than it was, so the
-			 * order remains unchanged.
-			 */
-			ASSERT(!avl_update(&cp->cache_partial_slabs, sp));
-		}
-	}
+	sp->slab_head = bcp->bc_next;
 
 	if (cp->cache_flags & KMF_HASH) {
 		/*
@@ -1716,6 +1693,45 @@
 	}
 
 	ASSERT(KMEM_SLAB_MEMBER(sp, buf));
+
+	if (sp->slab_head == NULL) {
+		ASSERT(KMEM_SLAB_IS_ALL_USED(sp));
+		if (new_slab) {
+			ASSERT(sp->slab_chunks == 1);
+		} else {
+			ASSERT(sp->slab_chunks > 1); /* the slab was partial */
+			avl_remove(&cp->cache_partial_slabs, sp);
+			sp->slab_later_count = 0; /* clear history */
+			sp->slab_flags &= ~KMEM_SLAB_NOMOVE;
+			sp->slab_stuck_offset = (uint32_t)-1;
+		}
+		list_insert_head(&cp->cache_complete_slabs, sp);
+		cp->cache_complete_slab_count++;
+		return (buf);
+	}
+
+	ASSERT(KMEM_SLAB_IS_PARTIAL(sp));
+	/*
+	 * Peek to see if the magazine layer is enabled before
+	 * we prefill. We're not holding the cpu cache lock,
+	 * so the peek could be wrong, but there's no harm in it.
+	 */
+	if (new_slab && prefill && (cp->cache_flags & KMF_PREFILL) &&
+	    (KMEM_CPU_CACHE(cp)->cc_magsize != 0)) {
+		kmem_slab_prefill(cp, sp);
+		return (buf);
+	}
+
+	if (new_slab) {
+		avl_add(&cp->cache_partial_slabs, sp);
+		return (buf);
+	}
+
+	/*
+	 * The slab is now more allocated than it was, so the
+	 * order remains unchanged.
+	 */
+	ASSERT(!avl_update(&cp->cache_partial_slabs, sp));
 	return (buf);
 }
 
@@ -1749,7 +1765,7 @@
 		cp->cache_bufslab += sp->slab_chunks;
 	}
 
-	buf = kmem_slab_alloc_impl(cp, sp);
+	buf = kmem_slab_alloc_impl(cp, sp, B_TRUE);
 	ASSERT((cp->cache_slab_create - cp->cache_slab_destroy) ==
 	    (cp->cache_complete_slab_count +
 	    avl_numnodes(&cp->cache_partial_slabs) +
@@ -2627,14 +2643,79 @@
 }
 
 /*
+ * Used when there's no room to free a buffer to the per-CPU cache.
+ * Drops and re-acquires &ccp->cc_lock, and returns non-zero if the
+ * caller should try freeing to the per-CPU cache again.
+ * Note that we don't directly install the magazine in the cpu cache,
+ * since its state may have changed wildly while the lock was dropped.
+ */
+static int
+kmem_cpucache_magazine_alloc(kmem_cpu_cache_t *ccp, kmem_cache_t *cp)
+{
+	kmem_magazine_t *emp;
+	kmem_magtype_t *mtp;
+
+	ASSERT(MUTEX_HELD(&ccp->cc_lock));
+	ASSERT(((uint_t)ccp->cc_rounds == ccp->cc_magsize ||
+	    ((uint_t)ccp->cc_rounds == -1)) &&
+	    ((uint_t)ccp->cc_prounds == ccp->cc_magsize ||
+	    ((uint_t)ccp->cc_prounds == -1)));
+
+	emp = kmem_depot_alloc(cp, &cp->cache_empty);
+	if (emp != NULL) {
+		if (ccp->cc_ploaded != NULL)
+			kmem_depot_free(cp, &cp->cache_full,
+			    ccp->cc_ploaded);
+		kmem_cpu_reload(ccp, emp, 0);
+		return (1);
+	}
+	/*
+	 * There are no empty magazines in the depot,
+	 * so try to allocate a new one. We must drop all locks
+	 * across kmem_cache_alloc() because lower layers may
+	 * attempt to allocate from this cache.
+	 */
+	mtp = cp->cache_magtype;
+	mutex_exit(&ccp->cc_lock);
+	emp = kmem_cache_alloc(mtp->mt_cache, KM_NOSLEEP);
+	mutex_enter(&ccp->cc_lock);
+
+	if (emp != NULL) {
+		/*
+		 * We successfully allocated an empty magazine.
+		 * However, we had to drop ccp->cc_lock to do it,
+		 * so the cache's magazine size may have changed.
+		 * If so, free the magazine and try again.
+		 */
+		if (ccp->cc_magsize != mtp->mt_magsize) {
+			mutex_exit(&ccp->cc_lock);
+			kmem_cache_free(mtp->mt_cache, emp);
+			mutex_enter(&ccp->cc_lock);
+			return (1);
+		}
+
+		/*
+		 * We got a magazine of the right size. Add it to
+		 * the depot and try the whole dance again.
+		 */
+		kmem_depot_free(cp, &cp->cache_empty, emp);
+		return (1);
+	}
+
+	/*
+	 * We couldn't allocate an empty magazine,
+	 * so fall through to the slab layer.
+	 */
+	return (0);
+}
+
+/*
  * Free a constructed object to cache cp.
  */
 void
 kmem_cache_free(kmem_cache_t *cp, void *buf)
 {
 	kmem_cpu_cache_t *ccp = KMEM_CPU_CACHE(cp);
-	kmem_magazine_t *emp;
-	kmem_magtype_t *mtp;
 
 	/*
 	 * The client must not free either of the buffers passed to the move
@@ -2660,6 +2741,9 @@
 	}
 
 	mutex_enter(&ccp->cc_lock);
+	/*
+	 * Any changes to this logic should be reflected in kmem_slab_prefill()
+	 */
 	for (;;) {
 		/*
 		 * If there's a slot available in the current CPU's
@@ -2687,64 +2771,110 @@
 		if (ccp->cc_magsize == 0)
 			break;
 
+		if (!kmem_cpucache_magazine_alloc(ccp, cp)) {
+			/*
+			 * We couldn't free our constructed object to the
+			 * magazine layer, so apply its destructor and free it
+			 * to the slab layer.
+			 */
+			break;
+		}
+	}
+	mutex_exit(&ccp->cc_lock);
+	kmem_slab_free_constructed(cp, buf, B_TRUE);
+}
+
+static void
+kmem_slab_prefill(kmem_cache_t *cp, kmem_slab_t *sp)
+{
+	kmem_cpu_cache_t *ccp = KMEM_CPU_CACHE(cp);
+	int cache_flags = cp->cache_flags;
+
+	kmem_bufctl_t *next, *head;
+	size_t nbufs;
+
+	/*
+	 * Completely allocate the newly created slab and put the pre-allocated
+	 * buffers in magazines. Any of the buffers that cannot be put in
+	 * magazines must be returned to the slab.
+	 */
+	ASSERT(MUTEX_HELD(&cp->cache_lock));
+	ASSERT((cache_flags & (KMF_PREFILL|KMF_BUFTAG)) == KMF_PREFILL);
+	ASSERT(cp->cache_constructor == NULL);
+	ASSERT(sp->slab_cache == cp);
+	ASSERT(sp->slab_refcnt == 1);
+	ASSERT(sp->slab_head != NULL && sp->slab_chunks > sp->slab_refcnt);
+	ASSERT(avl_find(&cp->cache_partial_slabs, sp, NULL) == NULL);
+
+	head = sp->slab_head;
+	nbufs = (sp->slab_chunks - sp->slab_refcnt);
+	sp->slab_head = NULL;
+	sp->slab_refcnt += nbufs;
+	cp->cache_bufslab -= nbufs;
+	cp->cache_slab_alloc += nbufs;
+	list_insert_head(&cp->cache_complete_slabs, sp);
+	cp->cache_complete_slab_count++;
+	mutex_exit(&cp->cache_lock);
+	mutex_enter(&ccp->cc_lock);
+
+	while (head != NULL) {
+		void *buf = KMEM_BUF(cp, head);
 		/*
-		 * Try to get an empty magazine from the depot.
+		 * If there's a slot available in the current CPU's
+		 * loaded magazine, just put the object there and
+		 * continue.
 		 */
-		emp = kmem_depot_alloc(cp, &cp->cache_empty);
-		if (emp != NULL) {
-			if (ccp->cc_ploaded != NULL)
-				kmem_depot_free(cp, &cp->cache_full,
-				    ccp->cc_ploaded);
-			kmem_cpu_reload(ccp, emp, 0);
+		if ((uint_t)ccp->cc_rounds < ccp->cc_magsize) {
+			ccp->cc_loaded->mag_round[ccp->cc_rounds++] =
+			    buf;
+			ccp->cc_free++;
+			nbufs--;
+			head = head->bc_next;
 			continue;
 		}
 
 		/*
-		 * There are no empty magazines in the depot,
-		 * so try to allocate a new one. We must drop all locks
-		 * across kmem_cache_alloc() because lower layers may
-		 * attempt to allocate from this cache.
+		 * The loaded magazine is full. If the previously
+		 * loaded magazine was empty, exchange them and try
+		 * again.
 		 */
-		mtp = cp->cache_magtype;
-		mutex_exit(&ccp->cc_lock);
-		emp = kmem_cache_alloc(mtp->mt_cache, KM_NOSLEEP);
-		mutex_enter(&ccp->cc_lock);
-
-		if (emp != NULL) {
-			/*
-			 * We successfully allocated an empty magazine.
-			 * However, we had to drop ccp->cc_lock to do it,
-			 * so the cache's magazine size may have changed.
-			 * If so, free the magazine and try again.
-			 */
-			if (ccp->cc_magsize != mtp->mt_magsize) {
-				mutex_exit(&ccp->cc_lock);
-				kmem_cache_free(mtp->mt_cache, emp);
-				mutex_enter(&ccp->cc_lock);
-				continue;
-			}
-
-			/*
-			 * We got a magazine of the right size. Add it to
-			 * the depot and try the whole dance again.
-			 */
-			kmem_depot_free(cp, &cp->cache_empty, emp);
+		if (ccp->cc_prounds == 0) {
+			kmem_cpu_reload(ccp, ccp->cc_ploaded,
+			    ccp->cc_prounds);
 			continue;
 		}
 
 		/*
-		 * We couldn't allocate an empty magazine,
-		 * so fall through to the slab layer.
+		 * If the magazine layer is disabled, break out now.
 		 */
-		break;
+
+		if (ccp->cc_magsize == 0) {
+			break;
+		}
+
+		if (!kmem_cpucache_magazine_alloc(ccp, cp))
+			break;
 	}
 	mutex_exit(&ccp->cc_lock);
-
-	/*
-	 * We couldn't free our constructed object to the magazine layer,
-	 * so apply its destructor and free it to the slab layer.
-	 */
-	kmem_slab_free_constructed(cp, buf, B_TRUE);
+	if (nbufs != 0) {
+		ASSERT(head != NULL);
+
+		/*
+		 * If there was a failure, return remaining objects to
+		 * the slab
+		 */
+		while (head != NULL) {
+			ASSERT(nbufs != 0);
+			next = head->bc_next;
+			head->bc_next = NULL;
+			kmem_slab_free(cp, KMEM_BUF(cp, head));
+			head = next;
+			nbufs--;
+		}
+	}
+	ASSERT(head == NULL);
+	ASSERT(nbufs == 0);
+	mutex_enter(&cp->cache_lock);
 }
 
 void *
@@ -3667,6 +3797,9 @@
 	if (cflags & KMC_NOTOUCH)
 		cp->cache_flags &= ~KMF_TOUCH;
 
+	if (cflags & KMC_PREFILL)
+		cp->cache_flags |= KMF_PREFILL;
+
 	if (cflags & KMC_NOHASH)
 		cp->cache_flags &= ~(KMF_AUDIT | KMF_FIREWALL);
 
@@ -3779,6 +3912,17 @@
 	cp->cache_maxchunks = (cp->cache_slabsize / cp->cache_chunksize);
 	cp->cache_partial_binshift = highbit(cp->cache_maxchunks / 16) + 1;
 
+	/*
+	 * Disallowing prefill when either the DEBUG or HASH flag is set or when
+	 * there is a constructor avoids some tricky issues with debug setup
+	 * that may be revisited later. We cannot allow prefill in a
+	 * metadata cache because of potential recursion.
+	 */
+	if (vmp == kmem_msb_arena ||
+	    cp->cache_flags & (KMF_HASH | KMF_BUFTAG) ||
+	    cp->cache_constructor != NULL)
+		cp->cache_flags &= ~KMF_PREFILL;
+
 	if (cp->cache_flags & KMF_HASH) {
 		ASSERT(!(cflags & KMC_NOHASH));
 		cp->cache_bufctl_cache = (cp->cache_flags & KMF_AUDIT) ?
@@ -4873,7 +5017,8 @@
 		return (B_TRUE);
 	}
 
-	to_buf = kmem_slab_alloc_impl(cp, avl_first(&cp->cache_partial_slabs));
+	to_buf = kmem_slab_alloc_impl(cp, avl_first(&cp->cache_partial_slabs),
+	    B_FALSE);
 	callback->kmm_to_buf = to_buf;
 	avl_insert(&cp->cache_defrag->kmd_moves_pending, callback, index);
 
diff -r d8c71dc8ec0d -r fa0c0f5bf466 usr/src/uts/common/sys/kmem.h
--- a/usr/src/uts/common/sys/kmem.h	Thu Apr 08 03:07:32 2010 -0700
+++ b/usr/src/uts/common/sys/kmem.h	Thu Apr 08 07:11:19 2010 -0700
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
@@ -78,6 +77,7 @@
 #define	KMC_QCACHE	0x00100000
 #define	KMC_KMEM_ALLOC	0x00200000	/* internal use only */
 #define	KMC_IDENTIFIER	0x00400000	/* internal use only */
+#define	KMC_PREFILL	0x00800000
 
 struct kmem_cache;	/* cache structure is opaque to kmem clients */
 
diff -r d8c71dc8ec0d -r fa0c0f5bf466 usr/src/uts/common/sys/kmem_impl.h
--- a/usr/src/uts/common/sys/kmem_impl.h	Thu Apr 08 03:07:32 2010 -0700
+++ b/usr/src/uts/common/sys/kmem_impl.h	Thu Apr 08 07:11:19 2010 -0700
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
 */
 
 #ifndef	_SYS_KMEM_IMPL_H
@@ -69,6 +68,7 @@
 
 #define	KMF_DUMPDIVERT	0x00001000	/* use alternate memory at dump time */
 #define	KMF_DUMPUNSAFE	0x00002000	/* flag caches used at dump time */
+#define	KMF_PREFILL	0x00004000	/* Prefill the slab when created. */
 
 #define	KMF_BUFTAG	(KMF_DEADBEEF | KMF_REDZONE)
 #define	KMF_TOUCH	(KMF_BUFTAG | KMF_LITE | KMF_CONTENTS)
 
diff -r d8c71dc8ec0d -r fa0c0f5bf466 usr/src/uts/common/vm/vm_anon.c
--- a/usr/src/uts/common/vm/vm_anon.c	Thu Apr 08 03:07:32 2010 -0700
+++ b/usr/src/uts/common/vm/vm_anon.c	Thu Apr 08 07:11:19 2010 -0700
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
@@ -204,7 +203,7 @@
 	anon_hash = (struct anon **)
 	    kmem_zalloc(sizeof (struct anon *) * anon_hash_size, KM_SLEEP);
 	anon_cache = kmem_cache_create("anon_cache", sizeof (struct anon),
-	    AN_CACHE_ALIGN, NULL, NULL, NULL, NULL, NULL, 0);
+	    AN_CACHE_ALIGN, NULL, NULL, NULL, NULL, NULL, KMC_PREFILL);
 	anonmap_cache = kmem_cache_create("anonmap_cache",
 	    sizeof (struct anon_map), 0, anonmap_cache_constructor,
 	    anonmap_cache_destructor, NULL,
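For readers following the change, the vm_anon.c hunk above is the whole consumer-side story: a cache opts into prefill by passing KMC_PREFILL to kmem_cache_create(). The sketch below is illustrative only and not part of the changeset; foo_cache and foo_t are invented names, while the kmem_cache_create() signature and the flag come from the hunks above. Note that kmem_cache_create() quietly clears KMF_PREFILL for hashed caches, KMF_BUFTAG (debug) caches, caches with constructors, and kmem_msb_arena metadata caches, so the flag is a request rather than a guarantee.

#include <sys/types.h>
#include <sys/kmem.h>

/* Hypothetical client structure; any small, constructor-free type works. */
typedef struct foo {
	uint64_t	foo_id;
	uint64_t	foo_gen;
} foo_t;

static kmem_cache_t *foo_cache;

void
foo_init(void)
{
	/*
	 * Mirrors the anon_cache change above: no constructor, destructor,
	 * reclaim callback, private data, or backing arena, plus
	 * KMC_PREFILL so each new slab is drained into CPU magazines.
	 */
	foo_cache = kmem_cache_create("foo_cache", sizeof (foo_t),
	    0, NULL, NULL, NULL, NULL, NULL, KMC_PREFILL);
}

void
foo_fini(void)
{
	kmem_cache_destroy(foo_cache);
}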
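The heart of kmem_slab_prefill() is a single idea: rather than threading each buffer of a brand-new slab through the slab bookkeeping one kmem_cache_alloc() at a time, drain the slab's free list straight into the current CPU's loaded magazine, and hand anything that does not fit back to the slab. A minimal userland model of that loop follows; the struct names and the fixed-size magazine are stand-ins invented for this sketch, not kmem types.

#include <stdio.h>
#include <stddef.h>

#define	MAGSIZE	4			/* stand-in for ccp->cc_magsize */

struct buf { struct buf *next; };	/* stand-in for kmem_bufctl_t */

struct magazine {
	void	*rounds[MAGSIZE];
	int	nrounds;		/* stand-in for ccp->cc_rounds */
};

/*
 * Drain the slab free list into the magazine; return the unconsumed
 * tail, which the caller must give back to the slab (the failure path
 * at the end of kmem_slab_prefill() above).
 */
static struct buf *
prefill(struct buf *head, struct magazine *mag)
{
	while (head != NULL && mag->nrounds < MAGSIZE) {
		struct buf *b = head;
		head = b->next;
		b->next = NULL;
		mag->rounds[mag->nrounds++] = b;
	}
	return (head);
}

int
main(void)
{
	struct buf bufs[6];
	struct magazine mag = { { NULL }, 0 };
	struct buf *leftover;
	int nleft = 0;

	for (int i = 0; i < 5; i++)	/* build a 6-buffer "slab" free list */
		bufs[i].next = &bufs[i + 1];
	bufs[5].next = NULL;

	leftover = prefill(&bufs[0], &mag);
	for (struct buf *b = leftover; b != NULL; b = b->next)
		nleft++;

	/* 4 rounds fit in the magazine; 2 buffers go back to the slab. */
	printf("magazine rounds: %d, returned to slab: %d\n",
	    mag.nrounds, nleft);
	return (0);
}

Compiled with cc -std=c99, this prints "magazine rounds: 4, returned to slab: 2", mirroring the nbufs accounting in the patch's failure path.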