components/golang/patches/0037-release-branch.go1.5-runtime-adjust-huge-page-flags-.patch
changeset 5331 9c955076ffe3
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/components/golang/patches/0037-release-branch.go1.5-runtime-adjust-huge-page-flags-.patch	Thu Jan 21 09:20:59 2016 -0800
@@ -0,0 +1,159 @@
+From 244294f821c9256394d29218ee1f17ab4313551f Mon Sep 17 00:00:00 2001
+From: Austin Clements <[email protected]>
+Date: Wed, 30 Sep 2015 11:52:54 -0400
+Subject: [PATCH 37/63] [release-branch.go1.5] runtime: adjust huge page flags
+ only on huge page granularity
+
+This fixes an issue where the runtime panics with "out of memory" or
+"cannot allocate memory" even though ample memory is available, by
+reducing the number of memory mappings created by the memory allocator.
+
+Commit 7e1b61c worked around issue #8832 where Linux's transparent
+huge page support could dramatically increase the RSS of a Go process
+by setting the MADV_NOHUGEPAGE flag on any regions of pages released
+to the OS with MADV_DONTNEED. This had the side effect of also
+increasing the number of VMAs (memory mappings) in a Go address space
+because a separate VMA is needed for every region of the virtual
+address space with different flags. Unfortunately, by default, Linux
+limits the number of VMAs in an address space to 65530, and a large
+heap can quickly reach this limit when the runtime starts scavenging
+memory.
+
+This commit dramatically reduces the number of VMAs. It does this
+primarily by only adjusting the huge page flag at huge page
+granularity. With this change, on amd64, even a pessimal heap that
+alternates between MADV_NOHUGEPAGE and MADV_HUGEPAGE must grow to 128GB
+before it hits the VMA limit. Because of this rounding to huge page
+granularity, this change is also careful to leave large used and
+unused regions huge page-enabled.
+
+This change reduces the maximum number of VMAs during the runtime
+benchmarks with GODEBUG=scavenge=1 from 692 to 49.
+
+Fixes #12233.
+
+Change-Id: Ic397776d042f20d53783a1cacf122e2e2db00584
+Reviewed-on: https://go-review.googlesource.com/15191
+Reviewed-by: Keith Randall <[email protected]>
+Reviewed-on: https://go-review.googlesource.com/16980
+Run-TryBot: Austin Clements <[email protected]>
+Reviewed-by: Ian Lance Taylor <[email protected]>
+Reviewed-by: Russ Cox <[email protected]>
+---
+ src/runtime/mem_linux.go | 94 +++++++++++++++++++++++++++++++++++++++---------
+ 1 file changed, 77 insertions(+), 17 deletions(-)
+
+diff --git a/src/runtime/mem_linux.go b/src/runtime/mem_linux.go
+index f988e75..e8c8999 100644
+--- a/src/runtime/mem_linux.go
++++ b/src/runtime/mem_linux.go
+@@ -69,29 +69,89 @@ func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer {
+ }
+ 
+ func sysUnused(v unsafe.Pointer, n uintptr) {
+-	var s uintptr = hugePageSize // division by constant 0 is a compile-time error :(
+-	if s != 0 && (uintptr(v)%s != 0 || n%s != 0) {
+-		// See issue 8832
+-		// Linux kernel bug: https://bugzilla.kernel.org/show_bug.cgi?id=93111
+-		// Mark the region as NOHUGEPAGE so the kernel's khugepaged
+-		// doesn't undo our DONTNEED request.  khugepaged likes to migrate
+-		// regions which are only partially mapped to huge pages, including
+-		// regions with some DONTNEED marks.  That needlessly allocates physical
+-		// memory for our DONTNEED regions.
+-		madvise(v, n, _MADV_NOHUGEPAGE)
++	// By default, Linux's "transparent huge page" support will
++	// merge pages into a huge page if there's even a single
++	// present regular page, undoing the effects of the DONTNEED
++	// below. On amd64, that means khugepaged can turn a single
++	// 4KB page to 2MB, bloating the process's RSS by as much as
++	// 512X. (See issue #8832 and Linux kernel bug
++	// https://bugzilla.kernel.org/show_bug.cgi?id=93111)
++	//
++	// To work around this, we explicitly disable transparent huge
++	// pages when we release pages of the heap. However, we have
++	// to do this carefully because changing this flag tends to
++	// split the VMA (memory mapping) containing v into three
++	// VMAs in order to track the different values of the
++	// MADV_NOHUGEPAGE flag in the different regions. There's a
++	// default limit of 65530 VMAs per address space (sysctl
++	// vm.max_map_count), so we must be careful not to create too
++	// many VMAs (see issue #12233).
++	//
++	// Since huge pages are huge, there's little use in adjusting
++	// the MADV_NOHUGEPAGE flag on a fine granularity, so we avoid
++	// exploding the number of VMAs by only adjusting the
++	// MADV_NOHUGEPAGE flag on a large granularity. This still
++	// gets most of the benefit of huge pages while keeping the
++	// number of VMAs under control. With hugePageSize = 2MB, even
++	// a pessimal heap can reach 128GB before running out of VMAs.
++	if hugePageSize != 0 {
++		var s uintptr = hugePageSize // division by constant 0 is a compile-time error :(
++
++		// If it's a large allocation, we want to leave huge
++		// pages enabled. Hence, we only adjust the huge page
++		// flag on the huge pages containing v and v+n-1, and
++		// only if those aren't aligned.
++		var head, tail uintptr
++		if uintptr(v)%s != 0 {
++			// Compute huge page containing v.
++			head = uintptr(v) &^ (s - 1)
++		}
++		if (uintptr(v)+n)%s != 0 {
++			// Compute huge page containing v+n-1.
++			tail = (uintptr(v) + n - 1) &^ (s - 1)
++		}
++
++		// Note that madvise will return EINVAL if the flag is
++		// already set, which is quite likely. We ignore
++		// errors.
++		if head != 0 && head+hugePageSize == tail {
++			// head and tail are different but adjacent,
++			// so do this in one call.
++			madvise(unsafe.Pointer(head), 2*hugePageSize, _MADV_NOHUGEPAGE)
++		} else {
++			// Advise the huge pages containing v and v+n-1.
++			if head != 0 {
++				madvise(unsafe.Pointer(head), hugePageSize, _MADV_NOHUGEPAGE)
++			}
++			if tail != 0 && tail != head {
++				madvise(unsafe.Pointer(tail), hugePageSize, _MADV_NOHUGEPAGE)
++			}
++		}
+ 	}
++
+ 	madvise(v, n, _MADV_DONTNEED)
+ }
+ 
+ func sysUsed(v unsafe.Pointer, n uintptr) {
+ 	if hugePageSize != 0 {
+-		// Undo the NOHUGEPAGE marks from sysUnused.  There is no alignment check
+-		// around this call as spans may have been merged in the interim.
+-		// Note that this might enable huge pages for regions which were
+-		// previously disabled.  Unfortunately there is no easy way to detect
+-		// what the previous state was, and in any case we probably want huge
+-		// pages to back our heap if the kernel can arrange that.
+-		madvise(v, n, _MADV_HUGEPAGE)
++		// Partially undo the NOHUGEPAGE marks from sysUnused
++		// for whole huge pages between v and v+n. This may
++		// leave huge pages off at the end points v and v+n
++		// even though allocations may cover these entire huge
++		// pages. We could detect this and undo NOHUGEPAGE on
++		// the end points as well, but it's probably not worth
++		// the cost because when neighboring allocations are
++		// freed sysUnused will just set NOHUGEPAGE again.
++		var s uintptr = hugePageSize
++
++		// Round v up to a huge page boundary.
++		beg := (uintptr(v) + (s - 1)) &^ (s - 1)
++		// Round v+n down to a huge page boundary.
++		end := (uintptr(v) + n) &^ (s - 1)
++
++		if beg < end {
++			madvise(unsafe.Pointer(beg), end-beg, _MADV_HUGEPAGE)
++		}
+ 	}
+ }
+ 
+-- 
+2.6.1
+
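The arithmetic in the hunks above can be hard to follow inside a diff, so here is a minimal, standalone Go sketch (not part of the patch and not runtime code) of the huge-page rounding that sysUnused and sysUsed perform. The 2MB hugePageSize and the example v and n values are assumptions chosen for illustration; the sketch only prints the ranges that would be passed to madvise rather than calling it.

package main

import "fmt"

const hugePageSize uintptr = 2 << 20 // assumed 2MB huge pages, as on amd64

func main() {
	s := hugePageSize

	// Hypothetical region [v, v+n) being released back to the OS.
	v := uintptr(0x1234567000) // 4KB-aligned but not huge-page-aligned
	n := uintptr(5 << 20)      // 5MB

	// sysUnused: mark only the huge pages containing the unaligned
	// end points v and v+n-1 as NOHUGEPAGE, leaving interior huge
	// pages eligible for transparent huge pages.
	var head, tail uintptr
	if v%s != 0 {
		head = v &^ (s - 1) // huge page containing v
	}
	if (v+n)%s != 0 {
		tail = (v + n - 1) &^ (s - 1) // huge page containing v+n-1
	}
	fmt.Printf("NOHUGEPAGE head=%#x tail=%#x\n", head, tail)

	// sysUsed: re-enable huge pages only for whole huge pages that
	// lie entirely inside [v, v+n).
	beg := (v + s - 1) &^ (s - 1) // round v up to a huge page boundary
	end := (v + n) &^ (s - 1)     // round v+n down to a huge page boundary
	if beg < end {
		fmt.Printf("HUGEPAGE [%#x, %#x) = %d MB\n", beg, end, (end-beg)>>20)
	}
}

With these example values the sketch reports head=0x1234400000, tail=0x1234a00000, and a 4MB HUGEPAGE range [0x1234600000, 0x1234a00000), matching the behavior described in the comments above: only the two partially covered huge pages at the ends are marked NOHUGEPAGE, and only whole interior huge pages are re-enabled.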