--- a/usr/src/uts/common/vm/seg_vn.c Fri Jul 21 17:36:06 2006 -0700
+++ b/usr/src/uts/common/vm/seg_vn.c Fri Jul 21 21:13:27 2006 -0700
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -176,7 +175,7 @@
caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int);
static faultcode_t segvn_faultpage(struct hat *, struct seg *, caddr_t,
u_offset_t, struct vpage *, page_t **, uint_t,
- enum fault_type, enum seg_rw, int);
+ enum fault_type, enum seg_rw, int, int);
static void segvn_vpage(struct seg *);
static void segvn_purge(struct seg *seg);
@@ -185,12 +184,15 @@
static int sameprot(struct seg *, caddr_t, size_t);
-static int segvn_demote_range(struct seg *, caddr_t, size_t, int);
+static int segvn_demote_range(struct seg *, caddr_t, size_t, int, uint_t);
static int segvn_clrszc(struct seg *);
static struct seg *segvn_split_seg(struct seg *, caddr_t);
static int segvn_claim_pages(struct seg *, struct vpage *, u_offset_t,
ulong_t, uint_t);
+static int segvn_pp_lock_anonpages(page_t *, int);
+static void segvn_pp_unlock_anonpages(page_t *, int);
+
static struct kmem_cache *segvn_cache;
#ifdef VM_STATS
@@ -272,6 +274,7 @@
ulong_t segvn_fltvnpages_clrszc_cnt;
ulong_t segvn_fltvnpages_clrszc_err;
ulong_t segvn_setpgsz_align_err;
+ulong_t segvn_setpgsz_anon_align_err;
ulong_t segvn_setpgsz_getattr_err;
ulong_t segvn_setpgsz_eof_err;
ulong_t segvn_faultvnmpss_align_err1;
@@ -388,8 +391,8 @@
a->flags &= ~MAP_NORESERVE;
if (a->szc != 0) {
- if (segvn_lpg_disable != 0 || a->amp != NULL ||
- (a->type == MAP_SHARED && a->vp == NULL) ||
+ if (segvn_lpg_disable != 0 ||
+ (a->amp != NULL && a->type == MAP_PRIVATE) ||
(a->flags & MAP_NORESERVE) || seg->s_as == &kas) {
a->szc = 0;
} else {
@@ -412,6 +415,12 @@
a->offset & PAGEMASK)) {
a->szc = 0;
}
+ } else if (a->amp != NULL) {
+ pgcnt_t anum = btopr(a->offset);
+ pgcnt_t pgcnt = page_get_pagecnt(a->szc);
+ if (!IS_P2ALIGNED(anum, pgcnt)) {
+ a->szc = 0;
+ }
}
}
}
@@ -657,6 +666,9 @@
*/
ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
amp->refcnt++;
+ if (a->szc > amp->a_szc) {
+ amp->a_szc = a->szc;
+ }
ANON_LOCK_EXIT(&amp->a_rwlock);
svd->anon_index = anon_num;
svd->swresv = 0;
@@ -754,10 +766,10 @@
* Concatenate two existing segments, if possible.
* Return 0 on success, -1 if two segments are not compatible
* or -2 on memory allocation failure.
- * If private == 1 then try and concat segments with private pages.
+ * If amp_cat == 1 then try and concat segments with anon maps
*/
static int
-segvn_concat(struct seg *seg1, struct seg *seg2, int private)
+segvn_concat(struct seg *seg1, struct seg *seg2, int amp_cat)
{
struct segvn_data *svd1 = seg1->s_data;
struct segvn_data *svd2 = seg2->s_data;
@@ -793,13 +805,23 @@
/*
* Fail early if we're not supposed to concatenate
- * private pages.
+ * segments with non NULL amp.
*/
- if ((private == 0 || svd1->type != MAP_PRIVATE) &&
- (amp1 != NULL || amp2 != NULL)) {
+ if (amp_cat == 0 && (amp1 != NULL || amp2 != NULL)) {
return (-1);
}
+ if (svd1->vp == NULL && svd1->type == MAP_SHARED) {
+ if (amp1 != amp2) {
+ return (-1);
+ }
+ if (amp1 != NULL && svd1->anon_index + btop(seg1->s_size) !=
+ svd2->anon_index) {
+ return (-1);
+ }
+ ASSERT(amp1 == NULL || amp1->refcnt >= 2);
+ }
+
/*
* If either seg has vpages, create a new merged vpage array.
*/
@@ -840,13 +862,24 @@
/*
* If either segment has private pages, create a new merged anon
- * array.
+ * array. If merging shared anon segments just decrement anon map's
+ * refcnt.
*/
- if (amp1 != NULL || amp2 != NULL) {
+ if (amp1 != NULL && svd1->type == MAP_SHARED) {
+ ASSERT(amp1 == amp2 && svd1->vp == NULL);
+ ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
+ ASSERT(amp1->refcnt >= 2);
+ amp1->refcnt--;
+ ANON_LOCK_EXIT(&amp1->a_rwlock);
+ svd2->amp = NULL;
+ } else if (amp1 != NULL || amp2 != NULL) {
struct anon_hdr *nahp;
struct anon_map *namp = NULL;
- size_t asize = seg1->s_size + seg2->s_size;
-
+ size_t asize;
+
+ ASSERT(svd1->type == MAP_PRIVATE);
+
+ asize = seg1->s_size + seg2->s_size;
if ((nahp = anon_create(btop(asize), ANON_NOSLEEP)) == NULL) {
if (nvpage != NULL) {
kmem_free(nvpage, nvpsize);
@@ -1442,7 +1475,7 @@
if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) {
ASSERT(seg->s_base != addr || seg->s_size != len);
VM_STAT_ADD(segvnvmstats.demoterange[0]);
- err = segvn_demote_range(seg, addr, len, SDR_END);
+ err = segvn_demote_range(seg, addr, len, SDR_END, 0);
if (err == 0) {
return (IE_RETRY);
}
@@ -1490,6 +1523,7 @@
dpages = btop(len);
npages = opages - dpages;
amp = svd->amp;
+ ASSERT(amp == NULL || amp->a_szc >= seg->s_szc);
/*
* Check for beginning of segment
@@ -1514,17 +1548,27 @@
/*
* Free up now unused parts of anon_map array.
*/
- if (seg->s_szc != 0) {
- anon_free_pages(amp->ahp,
- svd->anon_index, len, seg->s_szc);
+ if (amp->a_szc == seg->s_szc) {
+ if (seg->s_szc != 0) {
+ anon_free_pages(amp->ahp,
+ svd->anon_index, len,
+ seg->s_szc);
+ } else {
+ anon_free(amp->ahp,
+ svd->anon_index,
+ len);
+ }
} else {
- anon_free(amp->ahp, svd->anon_index,
- len);
+ ASSERT(svd->type == MAP_SHARED);
+ ASSERT(amp->a_szc > seg->s_szc);
+ anon_shmap_free_pages(amp,
+ svd->anon_index, len);
}
/*
- * Unreserve swap space for the unmapped chunk
- * of this segment in case it's MAP_SHARED
+ * Unreserve swap space for the
+ * unmapped chunk of this segment in
+ * case it's MAP_SHARED
*/
if (svd->type == MAP_SHARED) {
anon_unresv(len);
@@ -1580,20 +1624,29 @@
ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) {
/*
- * Free up now unused parts of anon_map array
+ * Free up now unused parts of anon_map array.
*/
- if (seg->s_szc != 0) {
- ulong_t an_idx = svd->anon_index +
- npages;
- anon_free_pages(amp->ahp, an_idx,
- len, seg->s_szc);
+ ulong_t an_idx = svd->anon_index + npages;
+ if (amp->a_szc == seg->s_szc) {
+ if (seg->s_szc != 0) {
+ anon_free_pages(amp->ahp,
+ an_idx, len,
+ seg->s_szc);
+ } else {
+ anon_free(amp->ahp, an_idx,
+ len);
+ }
} else {
- anon_free(amp->ahp,
- svd->anon_index + npages, len);
+ ASSERT(svd->type == MAP_SHARED);
+ ASSERT(amp->a_szc > seg->s_szc);
+ anon_shmap_free_pages(amp,
+ an_idx, len);
}
+
/*
- * Unreserve swap space for the unmapped chunk
- * of this segment in case it's MAP_SHARED
+ * Unreserve swap space for the
+ * unmapped chunk of this segment in
+ * case it's MAP_SHARED
*/
if (svd->type == MAP_SHARED) {
anon_unresv(len);
@@ -1689,31 +1742,36 @@
ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) {
/*
- * Free up now unused parts of anon_map array
+ * Free up now unused parts of anon_map array.
*/
- if (seg->s_szc != 0) {
- ulong_t an_idx = svd->anon_index + opages;
- anon_free_pages(amp->ahp, an_idx, len,
- seg->s_szc);
+ ulong_t an_idx = svd->anon_index + opages;
+ if (amp->a_szc == seg->s_szc) {
+ if (seg->s_szc != 0) {
+ anon_free_pages(amp->ahp, an_idx, len,
+ seg->s_szc);
+ } else {
+ anon_free(amp->ahp, an_idx,
+ len);
+ }
} else {
- anon_free(amp->ahp, svd->anon_index + opages,
- len);
+ ASSERT(svd->type == MAP_SHARED);
+ ASSERT(amp->a_szc > seg->s_szc);
+ anon_shmap_free_pages(amp, an_idx, len);
}
/*
- * Unreserve swap space for the unmapped chunk
- * of this segment in case it's MAP_SHARED
+ * Unreserve swap space for the
+ * unmapped chunk of this segment in
+ * case it's MAP_SHARED
*/
if (svd->type == MAP_SHARED) {
anon_unresv(len);
amp->swresv -= len;
}
}
-
nsvd->anon_index = svd->anon_index +
btop((uintptr_t)(nseg->s_base - seg->s_base));
if (svd->type == MAP_SHARED) {
- ASSERT(seg->s_szc == 0);
amp->refcnt++;
nsvd->amp = amp;
} else {
@@ -1799,6 +1857,7 @@
* up all the anon slot pointers that we can.
*/
ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
+ ASSERT(amp->a_szc >= seg->s_szc);
if (--amp->refcnt == 0) {
if (svd->type == MAP_PRIVATE) {
/*
@@ -1819,8 +1878,12 @@
* anon_map's worth of stuff and
* release any swap reservation.
*/
- ASSERT(seg->s_szc == 0);
- anon_free(amp->ahp, 0, amp->size);
+ if (amp->a_szc != 0) {
+ anon_shmap_free_pages(amp, 0,
+ amp->size);
+ } else {
+ anon_free(amp->ahp, 0, amp->size);
+ }
if ((len = amp->swresv) != 0) {
anon_unresv(len);
TRACE_3(TR_FAC_VM, TR_ANON_PROC,
@@ -1876,6 +1939,140 @@
kmem_cache_free(segvn_cache, svd);
}
+ulong_t segvn_lpglck_limit = 0;
+/*
+ * Support routines used by segvn_pagelock() and softlock faults for anonymous
+ * pages to implement availrmem accounting in a way that makes sure the
+ * same memory is accounted just once for all softlock/pagelock purposes.
+ * This prevents a bug when availrmem is quickly incorrectly exhausted from
+ * several pagelocks to different parts of the same large page since each
+ * pagelock has to decrement availrmem by the size of the entire large
+ * page. Note those pages are not COW shared until softunlock/pageunlock so
+ * we don't need to use cow style accounting here. We also need to make sure
+ * the entire large page is accounted even if softlock range is less than the
+ * entire large page because large anon pages can't be demoted when any of
+ * constituent pages is locked. The caller calls this routine for every page_t
+ * it locks. The very first page in the range may not be the root page of a
+ * large page. For all other pages it's guaranteed we are going to visit the
+ * root of a particular large page before any other constituent page as we are
+ * locking sequential pages belonging to the same anon map. So we do all the
+ * locking when the root is encountered except for the very first page. Since
+ * softlocking is not supported (except S_READ_NOCOW special case) for vmpss
+ * segments and since vnode pages can be demoted without locking all
+ * constituent pages vnode pages don't come here. Unlocking relies on the
+ * fact that pagesize can't change whenever any of constituent large pages is
+ * locked at least SE_SHARED. This allows unlocking code to find the right
+ * root and decrement availrmem by the same amount it was incremented when the
+ * page was locked.
+ */
+static int
+segvn_pp_lock_anonpages(page_t *pp, int first)
+{
+ pgcnt_t pages;
+ pfn_t pfn;
+ uchar_t szc = pp->p_szc;
+
+ ASSERT(PAGE_LOCKED(pp));
+ ASSERT(pp->p_vnode != NULL);
+ ASSERT(IS_SWAPFSVP(pp->p_vnode));
+
+ /*
+ * pagesize won't change as long as any constituent page is locked.
+ */
+ pages = page_get_pagecnt(pp->p_szc);
+ pfn = page_pptonum(pp);
+
+ if (!first) {
+ if (!IS_P2ALIGNED(pfn, pages)) {
+#ifdef DEBUG
+ pp = &pp[-(spgcnt_t)(pfn & (pages - 1))];
+ pfn = page_pptonum(pp);
+ ASSERT(IS_P2ALIGNED(pfn, pages));
+ ASSERT(pp->p_szc == szc);
+ ASSERT(pp->p_vnode != NULL);
+ ASSERT(IS_SWAPFSVP(pp->p_vnode));
+ ASSERT(pp->p_slckcnt != 0);
+#endif /* DEBUG */
+ return (1);
+ }
+ } else if (!IS_P2ALIGNED(pfn, pages)) {
+ pp = &pp[-(spgcnt_t)(pfn & (pages - 1))];
+#ifdef DEBUG
+ pfn = page_pptonum(pp);
+ ASSERT(IS_P2ALIGNED(pfn, pages));
+ ASSERT(pp->p_szc == szc);
+ ASSERT(pp->p_vnode != NULL);
+ ASSERT(IS_SWAPFSVP(pp->p_vnode));
+#endif /* DEBUG */
+ }
+
+ /*
+ * pp is a root page.
+ * We haven't locked this large page yet.
+ */
+ page_struct_lock(pp);
+ if (pp->p_slckcnt != 0) {
+ if (pp->p_slckcnt < PAGE_SLOCK_MAXIMUM) {
+ pp->p_slckcnt++;
+ page_struct_unlock(pp);
+ return (1);
+ }
+ page_struct_unlock(pp);
+ segvn_lpglck_limit++;
+ return (0);
+ }
+ mutex_enter(&freemem_lock);
+ if (availrmem < tune.t_minarmem + pages) {
+ mutex_exit(&freemem_lock);
+ page_struct_unlock(pp);
+ return (0);
+ }
+ pp->p_slckcnt++;
+ availrmem -= pages;
+ mutex_exit(&freemem_lock);
+ page_struct_unlock(pp);
+ return (1);
+}
+
+static void
+segvn_pp_unlock_anonpages(page_t *pp, int first)
+{
+ pgcnt_t pages;
+ pfn_t pfn;
+
+ ASSERT(PAGE_LOCKED(pp));
+ ASSERT(pp->p_vnode != NULL);
+ ASSERT(IS_SWAPFSVP(pp->p_vnode));
+
+ /*
+ * pagesize won't change as long as any constituent page is locked.
+ */
+ pages = page_get_pagecnt(pp->p_szc);
+ pfn = page_pptonum(pp);
+
+ if (!first) {
+ if (!IS_P2ALIGNED(pfn, pages)) {
+ return;
+ }
+ } else if (!IS_P2ALIGNED(pfn, pages)) {
+ pp = &pp[-(spgcnt_t)(pfn & (pages - 1))];
+#ifdef DEBUG
+ pfn = page_pptonum(pp);
+ ASSERT(IS_P2ALIGNED(pfn, pages));
+#endif /* DEBUG */
+ }
+ ASSERT(pp->p_vnode != NULL);
+ ASSERT(IS_SWAPFSVP(pp->p_vnode));
+ ASSERT(pp->p_slckcnt != 0);
+ page_struct_lock(pp);
+ if (--pp->p_slckcnt == 0) {
+ mutex_enter(&freemem_lock);
+ availrmem += pages;
+ mutex_exit(&freemem_lock);
+ }
+ page_struct_unlock(pp);
+}
+
/*
* Do a F_SOFTUNLOCK call over the range requested. The range must have
* already been F_SOFTLOCK'ed.
@@ -1943,10 +2140,15 @@
}
TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT,
"segvn_fault:pp %p vp %p offset %llx", pp, vp, offset);
+ if (svd->vp == NULL) {
+ segvn_pp_unlock_anonpages(pp, adr == addr);
+ }
page_unlock(pp);
}
mutex_enter(&freemem_lock); /* for availrmem */
- availrmem += btop(len);
+ if (svd->vp != NULL) {
+ availrmem += btop(len);
+ }
segvn_pages_locked -= btop(len);
svd->softlockcnt -= btop(len);
mutex_exit(&freemem_lock);
@@ -2028,7 +2230,8 @@
uint_t vpprot, /* access allowed to object pages */
enum fault_type type, /* type of fault */
enum seg_rw rw, /* type of access at fault */
- int brkcow) /* we may need to break cow */
+ int brkcow, /* we may need to break cow */
+ int first) /* first page for this fault if 1 */
{
struct segvn_data *svd = (struct segvn_data *)seg->s_data;
page_t *pp, **ppp;
@@ -2084,14 +2287,14 @@
prot = svd->prot;
}
- if (type == F_SOFTLOCK) {
+ if (type == F_SOFTLOCK && svd->vp != NULL) {
mutex_enter(&freemem_lock);
if (availrmem <= tune.t_minarmem) {
mutex_exit(&freemem_lock);
return (FC_MAKE_ERR(ENOMEM)); /* out of real memory */
} else {
+ availrmem--;
svd->softlockcnt++;
- availrmem--;
segvn_pages_locked++;
}
mutex_exit(&freemem_lock);
@@ -2134,6 +2337,21 @@
*/
(void) anon_set_ptr(amp->ahp, anon_index, ap,
ANON_SLEEP);
+
+ ASSERT(pp->p_szc == 0);
+ if (type == F_SOFTLOCK) {
+ if (!segvn_pp_lock_anonpages(pp, first)) {
+ page_unlock(pp);
+ err = ENOMEM;
+ goto out;
+ } else {
+ mutex_enter(&freemem_lock);
+ svd->softlockcnt++;
+ segvn_pages_locked++;
+ mutex_exit(&freemem_lock);
+ }
+ }
+
if (enable_mbit_wa) {
if (rw == S_WRITE)
hat_setmod(pp);
@@ -2263,6 +2481,23 @@
* and return.
*/
if (cow == 0) {
+ if (type == F_SOFTLOCK && svd->vp == NULL) {
+
+ ASSERT(opp->p_szc == 0 ||
+ (svd->type == MAP_SHARED &&
+ amp != NULL && amp->a_szc != 0));
+
+ if (!segvn_pp_lock_anonpages(opp, first)) {
+ page_unlock(opp);
+ err = ENOMEM;
+ goto out;
+ } else {
+ mutex_enter(&freemem_lock);
+ svd->softlockcnt++;
+ segvn_pages_locked++;
+ mutex_exit(&freemem_lock);
+ }
+ }
if (IS_VMODSORT(opp->p_vnode) || enable_mbit_wa) {
if (rw == S_WRITE)
hat_setmod(opp);
@@ -2380,6 +2615,20 @@
(void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP);
+ ASSERT(pp->p_szc == 0);
+ if (type == F_SOFTLOCK && svd->vp == NULL) {
+ if (!segvn_pp_lock_anonpages(pp, first)) {
+ page_unlock(pp);
+ err = ENOMEM;
+ goto out;
+ } else {
+ mutex_enter(&freemem_lock);
+ svd->softlockcnt++;
+ segvn_pages_locked++;
+ mutex_exit(&freemem_lock);
+ }
+ }
+
ASSERT(!IS_VMODSORT(pp->p_vnode));
if (enable_mbit_wa) {
if (rw == S_WRITE)
@@ -2406,7 +2655,7 @@
if (anon_lock)
anon_array_exit(&cookie);
- if (type == F_SOFTLOCK) {
+ if (type == F_SOFTLOCK && svd->vp != NULL) {
mutex_enter(&freemem_lock);
availrmem++;
segvn_pages_locked--;
@@ -3660,9 +3909,17 @@
}
SEGVN_UPDATE_MODBITS(ppa, pages, rw,
prot, vpprot);
- for (i = 0; i < pages; i++) {
- hat_memload(hat, a + (i << PAGESHIFT),
- ppa[i], prot & vpprot, hat_flag);
+ if (upgrdfail && segvn_anypgsz_vnode) {
+ /* SOFTLOCK case */
+ hat_memload_array(hat, a, pgsz,
+ ppa, prot & vpprot, hat_flag);
+ } else {
+ for (i = 0; i < pages; i++) {
+ hat_memload(hat,
+ a + (i << PAGESHIFT),
+ ppa[i], prot & vpprot,
+ hat_flag);
+ }
}
if (!(hat_flag & HAT_LOAD_LOCK)) {
for (i = 0; i < pages; i++) {
@@ -3942,16 +4199,18 @@
faultcode_t err;
int ierr;
uint_t protchk, prot, vpprot;
- int i;
+ ulong_t i;
int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD;
anon_sync_obj_t cookie;
+ int first = 1;
+ int adjszc_chk;
+ int purged = 0;
ASSERT(szc != 0);
ASSERT(amp != NULL);
ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */
ASSERT(!(svd->flags & MAP_NORESERVE));
ASSERT(type != F_SOFTUNLOCK);
- ASSERT(segtype == MAP_PRIVATE);
ASSERT(IS_P2ALIGNED(a, maxpgsz));
ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock));
@@ -3988,6 +4247,7 @@
ppa = kmem_alloc(ppasize, KM_SLEEP);
ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
for (;;) {
+ adjszc_chk = 0;
for (; a < lpgeaddr; a += pgsz, aindx += pages) {
if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) {
VM_STAT_ADD(segvnvmstats.fltanpages[3]);
@@ -3999,7 +4259,17 @@
goto error;
}
}
- if (type == F_SOFTLOCK) {
+ if (adjszc_chk && IS_P2ALIGNED(a, maxpgsz) &&
+ pgsz < maxpgsz) {
+ ASSERT(a > lpgaddr);
+ szc = seg->s_szc;
+ pgsz = maxpgsz;
+ pages = btop(pgsz);
+ ASSERT(IS_P2ALIGNED(aindx, pages));
+ lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr,
+ pgsz);
+ }
+ if (type == F_SOFTLOCK && svd->vp != NULL) {
mutex_enter(&freemem_lock);
if (availrmem < tune.t_minarmem + pages) {
mutex_exit(&freemem_lock);
@@ -4020,7 +4290,7 @@
if (ierr != 0) {
anon_array_exit(&cookie);
VM_STAT_ADD(segvnvmstats.fltanpages[4]);
- if (type == F_SOFTLOCK) {
+ if (type == F_SOFTLOCK && svd->vp != NULL) {
VM_STAT_ADD(segvnvmstats.fltanpages[5]);
mutex_enter(&freemem_lock);
availrmem += pages;
@@ -4038,12 +4308,41 @@
ASSERT(!IS_VMODSORT(ppa[0]->p_vnode));
+ ASSERT(segtype == MAP_SHARED ||
+ ppa[0]->p_szc <= szc);
+ ASSERT(segtype == MAP_PRIVATE ||
+ ppa[0]->p_szc >= szc);
+
+ if (type == F_SOFTLOCK && svd->vp == NULL) {
+ /*
+ * All pages in ppa array belong to the same
+ * large page. This means it's ok to call
+ * segvn_pp_lock_anonpages just for ppa[0].
+ */
+ if (!segvn_pp_lock_anonpages(ppa[0], first)) {
+ for (i = 0; i < pages; i++) {
+ page_unlock(ppa[i]);
+ }
+ err = FC_MAKE_ERR(ENOMEM);
+ goto error;
+ }
+ first = 0;
+ mutex_enter(&freemem_lock);
+ svd->softlockcnt += pages;
+ segvn_pages_locked += pages;
+ mutex_exit(&freemem_lock);
+ }
+
/*
* Handle pages that have been marked for migration
*/
if (lgrp_optimizations())
page_migrate(seg, a, ppa, pages);
+ if (segtype == MAP_SHARED) {
+ vpprot |= PROT_WRITE;
+ }
+
hat_memload_array(hat, a, pgsz, ppa,
prot & vpprot, hat_flag);
@@ -4058,6 +4357,7 @@
vpage += pages;
anon_array_exit(&cookie);
+ adjszc_chk = 1;
}
if (a == lpgeaddr)
break;
@@ -4078,6 +4378,18 @@
* have relocated locked pages.
*/
ASSERT(ierr == -1 || ierr == -2);
+ /*
+ * For the very first relocation failure try to purge this
+ * segment's cache so that the relocator can obtain an
+ * exclusive lock on pages we want to relocate.
+ */
+ if (!purged && ierr == -1 && ppa_szc != (uint_t)-1 &&
+ svd->softlockcnt != 0) {
+ purged = 1;
+ segvn_purge(seg);
+ continue;
+ }
+
if (segvn_anypgsz) {
ASSERT(ierr == -2 || szc != 0);
ASSERT(ierr == -1 || szc < seg->s_szc);
@@ -4377,15 +4689,8 @@
if (seg->s_szc != 0) {
pgsz = page_get_pagesize(seg->s_szc);
ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock));
- /*
- * We may need to do relocations so purge seg_pcache to allow
- * pages to be locked exclusively.
- */
- if (svd->softlockcnt != 0)
- segvn_purge(seg);
CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr);
if (svd->vp == NULL) {
- ASSERT(svd->type == MAP_PRIVATE);
err = segvn_fault_anonpages(hat, seg, lpgaddr,
lpgeaddr, type, rw, addr, addr + len, brkcow);
} else {
@@ -4704,13 +5009,14 @@
*/
for (a = addr; a < addr + len; a += PAGESIZE, off += PAGESIZE) {
err = segvn_faultpage(hat, seg, a, off, vpage, plp, vpprot,
- type, rw, brkcow);
+ type, rw, brkcow, a == addr);
if (err) {
if (amp != NULL)
ANON_LOCK_EXIT(&amp->a_rwlock);
- if (type == F_SOFTLOCK && a > addr)
+ if (type == F_SOFTLOCK && a > addr) {
segvn_softunlock(seg, addr, (a - addr),
S_OTHER);
+ }
SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
segvn_pagelist_rele(plp);
if (pl_alloc_sz)
@@ -4938,7 +5244,15 @@
if (AS_READ_HELD(seg->s_as, &seg->s_as->a_lock))
return (IE_RETRY);
VM_STAT_ADD(segvnvmstats.demoterange[1]);
- err = segvn_demote_range(seg, addr, len, SDR_END);
+ if (svd->type == MAP_PRIVATE || svd->vp != NULL) {
+ err = segvn_demote_range(seg, addr, len,
+ SDR_END, 0);
+ } else {
+ uint_t szcvec = map_shm_pgszcvec(seg->s_base,
+ pgsz, (uintptr_t)seg->s_base);
+ err = segvn_demote_range(seg, addr, len,
+ SDR_END, szcvec);
+ }
if (err == 0)
return (IE_RETRY);
if (err == ENOMEM)
@@ -4993,7 +5307,7 @@
return (0); /* all done */
}
svd->prot = (uchar_t)prot;
- } else {
+ } else if (svd->type == MAP_PRIVATE) {
struct anon *ap = NULL;
page_t *pp;
u_offset_t offset, off;
@@ -5026,10 +5340,7 @@
*/
for (svp = &svd->vpage[seg_page(seg, addr)]; svp < evp; svp++) {
- ASSERT(seg->s_szc == 0 ||
- (svd->vp != NULL || svd->type == MAP_PRIVATE));
-
- if (seg->s_szc != 0 && svd->type == MAP_PRIVATE) {
+ if (seg->s_szc != 0) {
if (amp != NULL) {
anon_array_enter(amp, anon_idx,
&cookie);
@@ -5054,8 +5365,7 @@
}
if (VPP_ISPPLOCK(svp) &&
- (VPP_PROT(svp) != prot) &&
- (svd->type == MAP_PRIVATE)) {
+ VPP_PROT(svp) != prot) {
if (amp == NULL || ap == NULL) {
vp = svd->vp;
@@ -5109,9 +5419,17 @@
SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
return (IE_NOMEM);
}
- }
-
- if ((prot & PROT_WRITE) != 0 || (prot & ~PROT_USER) == PROT_NONE) {
+ } else {
+ segvn_vpage(seg);
+ evp = &svd->vpage[seg_page(seg, addr + len)];
+ for (svp = &svd->vpage[seg_page(seg, addr)]; svp < evp; svp++) {
+ VPP_SETPROT(svp, prot);
+ }
+ }
+
+ if (((prot & PROT_WRITE) != 0 &&
+ (svd->vp != NULL || svd->type == MAP_PRIVATE)) ||
+ (prot & ~PROT_USER) == PROT_NONE) {
/*
* Either private or shared data with write access (in
* which case we need to throw out all former translations
@@ -5152,6 +5470,7 @@
struct seg *nseg;
caddr_t eaddr = addr + len, a;
size_t pgsz = page_get_pagesize(szc);
+ pgcnt_t pgcnt = page_get_pagecnt(szc);
int err;
u_offset_t off = svd->offset + (uintptr_t)(addr - seg->s_base);
extern struct vnode kvp;
@@ -5178,8 +5497,16 @@
return (EINVAL);
}
- if ((svd->vp == NULL && svd->type == MAP_SHARED) ||
- (svd->flags & MAP_NORESERVE) || seg->s_as == &kas ||
+ if (amp != NULL && svd->type == MAP_SHARED) {
+ ulong_t an_idx = svd->anon_index + seg_page(seg, addr);
+ if (!IS_P2ALIGNED(an_idx, pgcnt)) {
+
+ segvn_setpgsz_anon_align_err++;
+ return (EINVAL);
+ }
+ }
+
+ if ((svd->flags & MAP_NORESERVE) || seg->s_as == &kas ||
szc > segvn_maxpgszc) {
return (EINVAL);
}
@@ -5237,7 +5564,7 @@
if (addr != seg->s_base || eaddr != (seg->s_base + seg->s_size)) {
if (szc < seg->s_szc) {
VM_STAT_ADD(segvnvmstats.demoterange[2]);
- err = segvn_demote_range(seg, addr, len, SDR_RANGE);
+ err = segvn_demote_range(seg, addr, len, SDR_RANGE, 0);
if (err == 0) {
return (IE_RETRY);
}
@@ -5313,10 +5640,11 @@
* new szc.
*/
if (amp != NULL) {
- pgcnt_t pgcnt = pgsz >> PAGESHIFT;
if (!IS_P2ALIGNED(svd->anon_index, pgcnt)) {
struct anon_hdr *nahp;
+ ASSERT(svd->type == MAP_PRIVATE);
+
ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
ASSERT(amp->refcnt == 1);
nahp = anon_create(btop(amp->size), ANON_NOSLEEP);
@@ -5371,7 +5699,11 @@
if (amp != NULL) {
ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
- amp->a_szc = szc;
+ if (svd->type == MAP_PRIVATE) {
+ amp->a_szc = szc;
+ } else if (szc > amp->a_szc) {
+ amp->a_szc = szc;
+ }
ANON_LOCK_EXIT(&amp->a_rwlock);
}
@@ -5399,8 +5731,6 @@
ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) ||
SEGVN_WRITE_HELD(seg->s_as, &svd->lock));
- ASSERT(svd->type == MAP_PRIVATE ||
- (vp != NULL && svd->amp == NULL));
if (vp == NULL && amp == NULL) {
seg->s_szc = 0;
@@ -5415,7 +5745,7 @@
hat_unload(seg->s_as->a_hat, seg->s_base, seg->s_size,
HAT_UNLOAD_UNMAP);
- if (amp == NULL) {
+ if (amp == NULL || svd->type == MAP_SHARED) {
seg->s_szc = 0;
return (0);
}
@@ -5575,7 +5905,6 @@
struct segvn_data *nsvd;
ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
- ASSERT(svd->type == MAP_PRIVATE || svd->amp == NULL);
ASSERT(addr >= seg->s_base);
ASSERT(addr <= seg->s_base + seg->s_size);
@@ -5628,7 +5957,7 @@
bcopy(ovpage + seg_pages(seg), nsvd->vpage, nbytes);
kmem_free(ovpage, bytes + nbytes);
}
- if (svd->amp != NULL) {
+ if (svd->amp != NULL && svd->type == MAP_PRIVATE) {
struct anon_map *oamp = svd->amp, *namp;
struct anon_hdr *nahp;
@@ -5650,6 +5979,15 @@
nsvd->amp = namp;
nsvd->anon_index = 0;
ANON_LOCK_EXIT(&oamp->a_rwlock);
+ } else if (svd->amp != NULL) {
+ pgcnt_t pgcnt = page_get_pagecnt(seg->s_szc);
+ ASSERT(svd->amp == nsvd->amp);
+ ASSERT(seg->s_szc <= svd->amp->a_szc);
+ nsvd->anon_index = svd->anon_index + seg_pages(seg);
+ ASSERT(IS_P2ALIGNED(nsvd->anon_index, pgcnt));
+ ANON_LOCK_ENTER(&svd->amp->a_rwlock, RW_WRITER);
+ svd->amp->refcnt++;
+ ANON_LOCK_EXIT(&svd->amp->a_rwlock);
}
/*
@@ -5681,7 +6019,6 @@
return (nseg);
}
-
/*
* called on memory operations (unmap, setprot, setpagesize) for a subset
* of a large page segment to either demote the memory range (SDR_RANGE)
@@ -5690,7 +6027,12 @@
* returns 0 on success. returns errno, including ENOMEM, on failure.
*/
static int
-segvn_demote_range(struct seg *seg, caddr_t addr, size_t len, int flag)
+segvn_demote_range(
+ struct seg *seg,
+ caddr_t addr,
+ size_t len,
+ int flag,
+ uint_t szcvec)
{
caddr_t eaddr = addr + len;
caddr_t lpgaddr, lpgeaddr;
@@ -5700,15 +6042,16 @@
size_t pgsz;
struct segvn_data *svd = (struct segvn_data *)seg->s_data;
int err;
+ uint_t szc = seg->s_szc;
+ uint_t tszcvec;
ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
- ASSERT(seg->s_szc != 0);
- pgsz = page_get_pagesize(seg->s_szc);
+ ASSERT(szc != 0);
+ pgsz = page_get_pagesize(szc);
ASSERT(seg->s_base != addr || seg->s_size != len);
ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size);
ASSERT(svd->softlockcnt == 0);
- ASSERT(svd->type == MAP_PRIVATE ||
- (svd->vp != NULL && svd->amp == NULL));
+ ASSERT(szcvec == 0 || (flag == SDR_END && svd->type == MAP_SHARED));
CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr);
ASSERT(flag == SDR_RANGE || eaddr < lpgeaddr || addr > lpgaddr);
@@ -5749,25 +6092,77 @@
}
ASSERT(badseg1 != NULL);
- ASSERT(badseg1->s_szc != 0);
- ASSERT(page_get_pagesize(badseg1->s_szc) == pgsz);
+ ASSERT(badseg1->s_szc == szc);
ASSERT(flag == SDR_RANGE || badseg1->s_size == pgsz ||
badseg1->s_size == 2 * pgsz);
+ ASSERT(sameprot(badseg1, badseg1->s_base, pgsz));
+ ASSERT(badseg1->s_size == pgsz ||
+ sameprot(badseg1, badseg1->s_base + pgsz, pgsz));
if (err = segvn_clrszc(badseg1)) {
return (err);
}
ASSERT(badseg1->s_szc == 0);
+ if (szc > 1 && (tszcvec = P2PHASE(szcvec, 1 << szc)) > 1) {
+ uint_t tszc = highbit(tszcvec) - 1;
+ caddr_t ta = MAX(addr, badseg1->s_base);
+ caddr_t te;
+ size_t tpgsz = page_get_pagesize(tszc);
+
+ ASSERT(svd->type == MAP_SHARED);
+ ASSERT(flag == SDR_END);
+ ASSERT(tszc < szc && tszc > 0);
+
+ if (eaddr > badseg1->s_base + badseg1->s_size) {
+ te = badseg1->s_base + badseg1->s_size;
+ } else {
+ te = eaddr;
+ }
+
+ ASSERT(ta <= te);
+ badseg1->s_szc = tszc;
+ if (!IS_P2ALIGNED(ta, tpgsz) || !IS_P2ALIGNED(te, tpgsz)) {
+ if (badseg2 != NULL) {
+ err = segvn_demote_range(badseg1, ta, te - ta,
+ SDR_END, tszcvec);
+ if (err != 0) {
+ return (err);
+ }
+ } else {
+ return (segvn_demote_range(badseg1, ta,
+ te - ta, SDR_END, tszcvec));
+ }
+ }
+ }
+
if (badseg2 == NULL)
return (0);
- ASSERT(badseg2->s_szc != 0);
- ASSERT(page_get_pagesize(badseg2->s_szc) == pgsz);
+ ASSERT(badseg2->s_szc == szc);
ASSERT(badseg2->s_size == pgsz);
ASSERT(sameprot(badseg2, badseg2->s_base, badseg2->s_size));
if (err = segvn_clrszc(badseg2)) {
return (err);
}
ASSERT(badseg2->s_szc == 0);
+
+ if (szc > 1 && (tszcvec = P2PHASE(szcvec, 1 << szc)) > 1) {
+ uint_t tszc = highbit(tszcvec) - 1;
+ size_t tpgsz = page_get_pagesize(tszc);
+
+ ASSERT(svd->type == MAP_SHARED);
+ ASSERT(flag == SDR_END);
+ ASSERT(tszc < szc && tszc > 0);
+ ASSERT(badseg2->s_base > addr);
+ ASSERT(eaddr > badseg2->s_base);
+ ASSERT(eaddr < badseg2->s_base + badseg2->s_size);
+
+ badseg2->s_szc = tszc;
+ if (!IS_P2ALIGNED(eaddr, tpgsz)) {
+ return (segvn_demote_range(badseg2, badseg2->s_base,
+ eaddr - badseg2->s_base, SDR_END, tszcvec));
+ }
+ }
+
return (0);
}
@@ -7344,6 +7739,7 @@
caddr_t a;
size_t page;
caddr_t lpgaddr, lpgeaddr;
+ pgcnt_t szc0_npages = 0;
TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_START,
"segvn_pagelock: start seg %p addr %p", seg, addr);
@@ -7520,18 +7916,24 @@
}
}
- mutex_enter(&freemem_lock);
- if (availrmem < tune.t_minarmem + npages) {
+ /*
+ * Avoid per page overhead of segvn_pp_lock_anonpages() for small
+ * pages. For large pages segvn_pp_lock_anonpages() only does real
+ * work once per large page. The tradeoff is that we may decrement
+ * availrmem more than once for the same page but this is ok
+ * for small pages.
+ */
+ if (seg->s_szc == 0) {
+ mutex_enter(&freemem_lock);
+ if (availrmem < tune.t_minarmem + npages) {
+ mutex_exit(&freemem_lock);
+ mutex_exit(&svd->segp_slock);
+ error = ENOMEM;
+ goto out;
+ }
+ availrmem -= npages;
mutex_exit(&freemem_lock);
- mutex_exit(&svd->segp_slock);
- error = ENOMEM;
- goto out;
- } else {
- svd->softlockcnt += npages;
- availrmem -= npages;
- segvn_pages_locked += npages;
- }
- mutex_exit(&freemem_lock);
+ }
pplist = kmem_alloc(sizeof (page_t *) * npages, KM_SLEEP);
pl = pplist;
@@ -7574,11 +7976,29 @@
if (pp == NULL) {
break;
}
+ if (seg->s_szc != 0 || pp->p_szc != 0) {
+ if (!segvn_pp_lock_anonpages(pp, a == addr)) {
+ page_unlock(pp);
+ break;
+ }
+ } else {
+ szc0_npages++;
+ }
*pplist++ = pp;
}
ANON_LOCK_EXIT(&amp->a_rwlock);
+ ASSERT(npages >= szc0_npages);
+
if (a >= addr + len) {
+ mutex_enter(&freemem_lock);
+ if (seg->s_szc == 0 && npages != szc0_npages) {
+ ASSERT(svd->type == MAP_SHARED && amp->a_szc > 0);
+ availrmem += (npages - szc0_npages);
+ }
+ svd->softlockcnt += npages;
+ segvn_pages_locked += npages;
+ mutex_exit(&freemem_lock);
(void) seg_pinsert(seg, addr, len, pl, rw, SEGP_ASYNC_FLUSH,
segvn_reclaim);
mutex_exit(&svd->segp_slock);
@@ -7589,31 +8009,24 @@
}
mutex_exit(&svd->segp_slock);
+ if (seg->s_szc == 0) {
+ mutex_enter(&freemem_lock);
+ availrmem += npages;
+ mutex_exit(&freemem_lock);
+ }
error = EFAULT;
pplist = pl;
np = ((uintptr_t)(a - addr)) >> PAGESHIFT;
while (np > (uint_t)0) {
+ ASSERT(PAGE_LOCKED(*pplist));
+ if (seg->s_szc != 0 || (*pplist)->p_szc != 0) {
+ segvn_pp_unlock_anonpages(*pplist, pplist == pl);
+ }
page_unlock(*pplist);
np--;
pplist++;
}
kmem_free(pl, sizeof (page_t *) * npages);
- mutex_enter(&freemem_lock);
- svd->softlockcnt -= npages;
- availrmem += npages;
- segvn_pages_locked -= npages;
- mutex_exit(&freemem_lock);
- if (svd->softlockcnt <= 0) {
- if (AS_ISUNMAPWAIT(seg->s_as)) {
- mutex_enter(&seg->s_as->a_contents);
- if (AS_ISUNMAPWAIT(seg->s_as)) {
- AS_CLRUNMAPWAIT(seg->s_as);
- cv_broadcast(&seg->s_as->a_cv);
- }
- mutex_exit(&seg->s_as->a_contents);
- }
- }
-
out:
SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
*ppp = NULL;
@@ -7638,6 +8051,7 @@
struct segvn_data *svd = (struct segvn_data *)seg->s_data;
pgcnt_t np, npages;
struct page **pl;
+ pgcnt_t szc0_npages = 0;
#ifdef lint
addr = addr;
@@ -7654,12 +8068,19 @@
}
}
+ ASSERT(svd->vp == NULL && svd->amp != NULL);
+
while (np > (uint_t)0) {
if (rw == S_WRITE) {
hat_setrefmod(*pplist);
} else {
hat_setref(*pplist);
}
+ if (seg->s_szc != 0 || (*pplist)->p_szc != 0) {
+ segvn_pp_unlock_anonpages(*pplist, pplist == pl);
+ } else {
+ szc0_npages++;
+ }
page_unlock(*pplist);
np--;
pplist++;
@@ -7667,9 +8088,11 @@
kmem_free(pl, sizeof (page_t *) * npages);
mutex_enter(&freemem_lock);
- availrmem += npages;
segvn_pages_locked -= npages;
svd->softlockcnt -= npages;
+ if (szc0_npages != 0) {
+ availrmem += szc0_npages;
+ }
mutex_exit(&freemem_lock);
if (svd->softlockcnt <= 0) {
if (AS_ISUNMAPWAIT(seg->s_as)) {