components/golang/patches/0037-release-branch.go1.5-runtime-adjust-huge-page-flags-.patch
From 244294f821c9256394d29218ee1f17ab4313551f Mon Sep 17 00:00:00 2001
From: Austin Clements <[email protected]>
Date: Wed, 30 Sep 2015 11:52:54 -0400
Subject: [PATCH 37/63] [release-branch.go1.5] runtime: adjust huge page flags
 only on huge page granularity

This fixes an issue where the runtime panics with "out of memory" or
"cannot allocate memory" even though there's ample memory by reducing
the number of memory mappings created by the memory allocator.

Commit 7e1b61c worked around issue #8832 where Linux's transparent
huge page support could dramatically increase the RSS of a Go process
by setting the MADV_NOHUGEPAGE flag on any regions of pages released
to the OS with MADV_DONTNEED. This had the side effect of also
increasing the number of VMAs (memory mappings) in a Go address space
because a separate VMA is needed for every region of the virtual
address space with different flags. Unfortunately, by default, Linux
limits the number of VMAs in an address space to 65530, and a large
heap can quickly reach this limit when the runtime starts scavenging
memory.

This commit dramatically reduces the number of VMAs. It does this
primarily by only adjusting the huge page flag at huge page
granularity. With this change, on amd64, even a pessimal heap that
alternates between MADV_NOHUGEPAGE and MADV_HUGEPAGE must reach 128GB
to reach the VMA limit. Because of this rounding to huge page
granularity, this change is also careful to leave large used and
unused regions huge page-enabled.

This change reduces the maximum number of VMAs during the runtime
benchmarks with GODEBUG=scavenge=1 from 692 to 49.

Fixes #12233.

Change-Id: Ic397776d042f20d53783a1cacf122e2e2db00584
Reviewed-on: https://go-review.googlesource.com/15191
Reviewed-by: Keith Randall <[email protected]>
Reviewed-on: https://go-review.googlesource.com/16980
Run-TryBot: Austin Clements <[email protected]>
Reviewed-by: Ian Lance Taylor <[email protected]>
Reviewed-by: Russ Cox <[email protected]>
---
 src/runtime/mem_linux.go | 94 +++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 77 insertions(+), 17 deletions(-)

diff --git a/src/runtime/mem_linux.go b/src/runtime/mem_linux.go
index f988e75..e8c8999 100644
--- a/src/runtime/mem_linux.go
+++ b/src/runtime/mem_linux.go
@@ -69,29 +69,89 @@ func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer {
 }
 
 func sysUnused(v unsafe.Pointer, n uintptr) {
-	var s uintptr = hugePageSize // division by constant 0 is a compile-time error :(
-	if s != 0 && (uintptr(v)%s != 0 || n%s != 0) {
-		// See issue 8832
-		// Linux kernel bug: https://bugzilla.kernel.org/show_bug.cgi?id=93111
-		// Mark the region as NOHUGEPAGE so the kernel's khugepaged
-		// doesn't undo our DONTNEED request.  khugepaged likes to migrate
-		// regions which are only partially mapped to huge pages, including
-		// regions with some DONTNEED marks.  That needlessly allocates physical
-		// memory for our DONTNEED regions.
-		madvise(v, n, _MADV_NOHUGEPAGE)
+	// By default, Linux's "transparent huge page" support will
+	// merge pages into a huge page if there's even a single
+	// present regular page, undoing the effects of the DONTNEED
+	// below. On amd64, that means khugepaged can turn a single
+	// 4KB page to 2MB, bloating the process's RSS by as much as
+	// 512X. (See issue #8832 and Linux kernel bug
+	// https://bugzilla.kernel.org/show_bug.cgi?id=93111)
+	//
+	// To work around this, we explicitly disable transparent huge
+	// pages when we release pages of the heap. However, we have
+	// to do this carefully because changing this flag tends to
+	// split the VMA (memory mapping) containing v in to three
+	// VMAs in order to track the different values of the
+	// MADV_NOHUGEPAGE flag in the different regions. There's a
+	// default limit of 65530 VMAs per address space (sysctl
+	// vm.max_map_count), so we must be careful not to create too
+	// many VMAs (see issue #12233).
+	//
+	// Since huge pages are huge, there's little use in adjusting
+	// the MADV_NOHUGEPAGE flag on a fine granularity, so we avoid
+	// exploding the number of VMAs by only adjusting the
+	// MADV_NOHUGEPAGE flag on a large granularity. This still
+	// gets most of the benefit of huge pages while keeping the
+	// number of VMAs under control. With hugePageSize = 2MB, even
+	// a pessimal heap can reach 128GB before running out of VMAs.
+	if hugePageSize != 0 {
+		var s uintptr = hugePageSize // division by constant 0 is a compile-time error :(
+
+		// If it's a large allocation, we want to leave huge
+		// pages enabled. Hence, we only adjust the huge page
+		// flag on the huge pages containing v and v+n-1, and
+		// only if those aren't aligned.
+		var head, tail uintptr
+		if uintptr(v)%s != 0 {
+			// Compute huge page containing v.
+			head = uintptr(v) &^ (s - 1)
+		}
+		if (uintptr(v)+n)%s != 0 {
+			// Compute huge page containing v+n-1.
+			tail = (uintptr(v) + n - 1) &^ (s - 1)
+		}
+
+		// Note that madvise will return EINVAL if the flag is
+		// already set, which is quite likely. We ignore
+		// errors.
+		if head != 0 && head+hugePageSize == tail {
+			// head and tail are different but adjacent,
+			// so do this in one call.
+			madvise(unsafe.Pointer(head), 2*hugePageSize, _MADV_NOHUGEPAGE)
+		} else {
+			// Advise the huge pages containing v and v+n-1.
+			if head != 0 {
+				madvise(unsafe.Pointer(head), hugePageSize, _MADV_NOHUGEPAGE)
+			}
+			if tail != 0 && tail != head {
+				madvise(unsafe.Pointer(tail), hugePageSize, _MADV_NOHUGEPAGE)
+			}
+		}
 	}
+
 	madvise(v, n, _MADV_DONTNEED)
 }
 
 func sysUsed(v unsafe.Pointer, n uintptr) {
 	if hugePageSize != 0 {
-		// Undo the NOHUGEPAGE marks from sysUnused.  There is no alignment check
-		// around this call as spans may have been merged in the interim.
-		// Note that this might enable huge pages for regions which were
-		// previously disabled.  Unfortunately there is no easy way to detect
-		// what the previous state was, and in any case we probably want huge
-		// pages to back our heap if the kernel can arrange that.
-		madvise(v, n, _MADV_HUGEPAGE)
+		// Partially undo the NOHUGEPAGE marks from sysUnused
+		// for whole huge pages between v and v+n. This may
+		// leave huge pages off at the end points v and v+n
+		// even though allocations may cover these entire huge
+		// pages. We could detect this and undo NOHUGEPAGE on
+		// the end points as well, but it's probably not worth
+		// the cost because when neighboring allocations are
+		// freed sysUnused will just set NOHUGEPAGE again.
+		var s uintptr = hugePageSize
+
+		// Round v up to a huge page boundary.
+		beg := (uintptr(v) + (s - 1)) &^ (s - 1)
+		// Round v+n down to a huge page boundary.
+		end := (uintptr(v) + n) &^ (s - 1)
+
+		if beg < end {
+			madvise(unsafe.Pointer(beg), end-beg, _MADV_HUGEPAGE)
+		}
 	}
 }
 
-- 
2.6.1
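
The boundary arithmetic the two hunks rely on is plain masking against hugePageSize-1. The standalone Go sketch below is illustrative only: the example addresses, the constant, and the main wrapper are assumptions for the sake of the demo and are not part of the patch or the runtime; only the masking expressions mirror the hunks.

// hugepage_rounding.go: sketch of the rounding used by the patched
// sysUnused/sysUsed, applied to a made-up unaligned region [v, v+n).
package main

import "fmt"

const hugePageSize uintptr = 2 << 20 // 2MB transparent huge pages on amd64

func main() {
	s := hugePageSize

	// Example region that is unaligned at both ends (illustrative values).
	v := uintptr(0x401000) // 4KB past a 2MB boundary
	n := uintptr(5 << 20)  // 5MB long

	// sysUnused: huge pages containing the unaligned end points.
	head := v &^ (s - 1)           // huge page containing v
	tail := (v + n - 1) &^ (s - 1) // huge page containing v+n-1

	// sysUsed: whole huge pages lying entirely inside [v, v+n).
	beg := (v + s - 1) &^ (s - 1) // round v up to a huge page boundary
	end := (v + n) &^ (s - 1)     // round v+n down to a huge page boundary

	fmt.Printf("head=%#x tail=%#x\n", head, tail) // head=0x400000 tail=0x800000
	fmt.Printf("beg=%#x end=%#x\n", beg, end)     // beg=0x600000 end=0x800000
}

For this region, sysUnused would set MADV_NOHUGEPAGE on the two boundary huge pages at 0x400000 and 0x800000 (they are not adjacent, so two madvise calls), while sysUsed would restore MADV_HUGEPAGE only on the interior huge page at 0x600000.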
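The commit message's 128GB figure follows from the default vm.max_map_count: once flags change only at huge page granularity, the worst case costs one VMA per 2MB huge page. A quick back-of-the-envelope check of that arithmetic (a hypothetical snippet, not taken from the patch):

// vma_limit.go: rough check of the "128GB before hitting the VMA limit" claim.
package main

import "fmt"

func main() {
	const maxMapCount = 65530    // default Linux vm.max_map_count
	const hugePageSize = 2 << 20 // 2MB huge pages on amd64

	// Worst case after the patch: one VMA per huge page whose flag differs
	// from its neighbors, so the limit is reached only once the heap spans
	// roughly maxMapCount huge pages of address space.
	limit := uint64(maxMapCount) * hugePageSize
	fmt.Printf("~%d GB\n", limit>>30) // prints ~127 GB, matching the commit message's 128GB figure up to rounding
}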