From 244294f821c9256394d29218ee1f17ab4313551f Mon Sep 17 00:00:00 2001
From: Austin Clements <[email protected]>
Date: Wed, 30 Sep 2015 11:52:54 -0400
Subject: [PATCH 37/63] [release-branch.go1.5] runtime: adjust huge page flags
 only on huge page granularity

This fixes an issue where the runtime panics with "out of memory" or
"cannot allocate memory" even though there's ample memory. It does so
by reducing the number of memory mappings created by the memory
allocator.

Commit 7e1b61c worked around issue #8832, where Linux's transparent
huge page support could dramatically increase the RSS of a Go process,
by setting the MADV_NOHUGEPAGE flag on any regions of pages released
to the OS with MADV_DONTNEED. This had the side effect of also
increasing the number of VMAs (memory mappings) in a Go address space,
because a separate VMA is needed for every region of the virtual
address space with different flags. Unfortunately, by default, Linux
limits the number of VMAs in an address space to 65530, and a large
heap can quickly reach this limit when the runtime starts scavenging
memory.
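
(Each VMA appears as one line in /proc/<pid>/maps, so the mapping
count of a running process can be checked directly; the 65530 cap is
the vm.max_map_count sysctl referenced in the code below.)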

This commit dramatically reduces the number of VMAs. It does this
primarily by adjusting the huge page flag only at huge page
granularity. With this change, on amd64, even a pessimal heap that
alternates between MADV_NOHUGEPAGE and MADV_HUGEPAGE must reach 128GB
before it hits the VMA limit. Because of this rounding to huge page
granularity, this change is also careful to leave large used and
unused regions huge page-enabled.
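
(A rough check of that figure, assuming 2MB huge pages: the pessimal
case is one VMA per huge page, and 65530 VMAs x 2MB/VMA = 131,060MB,
roughly 128GB.)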

This change reduces the maximum number of VMAs during the runtime
benchmarks with GODEBUG=scavenge=1 from 692 to 49.

Fixes #12233.

Change-Id: Ic397776d042f20d53783a1cacf122e2e2db00584
Reviewed-on: https://go-review.googlesource.com/15191
Reviewed-by: Keith Randall <[email protected]>
Reviewed-on: https://go-review.googlesource.com/16980
Run-TryBot: Austin Clements <[email protected]>
Reviewed-by: Ian Lance Taylor <[email protected]>
Reviewed-by: Russ Cox <[email protected]>
---
 src/runtime/mem_linux.go | 94 +++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 77 insertions(+), 17 deletions(-)

diff --git a/src/runtime/mem_linux.go b/src/runtime/mem_linux.go
index f988e75..e8c8999 100644
--- a/src/runtime/mem_linux.go
+++ b/src/runtime/mem_linux.go
@@ -69,29 +69,89 @@ func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer {
 }
 
 func sysUnused(v unsafe.Pointer, n uintptr) {
-	var s uintptr = hugePageSize // division by constant 0 is a compile-time error :(
-	if s != 0 && (uintptr(v)%s != 0 || n%s != 0) {
-		// See issue 8832
-		// Linux kernel bug: https://bugzilla.kernel.org/show_bug.cgi?id=93111
-		// Mark the region as NOHUGEPAGE so the kernel's khugepaged
-		// doesn't undo our DONTNEED request. khugepaged likes to migrate
-		// regions which are only partially mapped to huge pages, including
-		// regions with some DONTNEED marks. That needlessly allocates physical
-		// memory for our DONTNEED regions.
-		madvise(v, n, _MADV_NOHUGEPAGE)
+	// By default, Linux's "transparent huge page" support will
+	// merge pages into a huge page if there's even a single
+	// present regular page, undoing the effects of the DONTNEED
+	// below. On amd64, that means khugepaged can turn a single
+	// 4KB page to 2MB, bloating the process's RSS by as much as
+	// 512X. (See issue #8832 and Linux kernel bug
+	// https://bugzilla.kernel.org/show_bug.cgi?id=93111)
+	//
+	// To work around this, we explicitly disable transparent huge
+	// pages when we release pages of the heap. However, we have
+	// to do this carefully because changing this flag tends to
+	// split the VMA (memory mapping) containing v in to three
+	// VMAs in order to track the different values of the
+	// MADV_NOHUGEPAGE flag in the different regions. There's a
+	// default limit of 65530 VMAs per address space (sysctl
+	// vm.max_map_count), so we must be careful not to create too
+	// many VMAs (see issue #12233).
+	//
+	// Since huge pages are huge, there's little use in adjusting
+	// the MADV_NOHUGEPAGE flag on a fine granularity, so we avoid
+	// exploding the number of VMAs by only adjusting the
+	// MADV_NOHUGEPAGE flag on a large granularity. This still
+	// gets most of the benefit of huge pages while keeping the
+	// number of VMAs under control. With hugePageSize = 2MB, even
+	// a pessimal heap can reach 128GB before running out of VMAs.
+	if hugePageSize != 0 {
+		var s uintptr = hugePageSize // division by constant 0 is a compile-time error :(
+
+		// If it's a large allocation, we want to leave huge
+		// pages enabled. Hence, we only adjust the huge page
+		// flag on the huge pages containing v and v+n-1, and
+		// only if those aren't aligned.
+		var head, tail uintptr
+		if uintptr(v)%s != 0 {
+			// Compute huge page containing v.
+			head = uintptr(v) &^ (s - 1)
+		}
+		if (uintptr(v)+n)%s != 0 {
+			// Compute huge page containing v+n-1.
+			tail = (uintptr(v) + n - 1) &^ (s - 1)
+		}
+
+		// Note that madvise will return EINVAL if the flag is
+		// already set, which is quite likely. We ignore
+		// errors.
+		if head != 0 && head+hugePageSize == tail {
+			// head and tail are different but adjacent,
+			// so do this in one call.
+			madvise(unsafe.Pointer(head), 2*hugePageSize, _MADV_NOHUGEPAGE)
+		} else {
+			// Advise the huge pages containing v and v+n-1.
+			if head != 0 {
+				madvise(unsafe.Pointer(head), hugePageSize, _MADV_NOHUGEPAGE)
+			}
+			if tail != 0 && tail != head {
+				madvise(unsafe.Pointer(tail), hugePageSize, _MADV_NOHUGEPAGE)
+			}
+		}
 	}
+
 	madvise(v, n, _MADV_DONTNEED)
 }
 
 func sysUsed(v unsafe.Pointer, n uintptr) {
 	if hugePageSize != 0 {
-		// Undo the NOHUGEPAGE marks from sysUnused. There is no alignment check
-		// around this call as spans may have been merged in the interim.
-		// Note that this might enable huge pages for regions which were
-		// previously disabled. Unfortunately there is no easy way to detect
-		// what the previous state was, and in any case we probably want huge
-		// pages to back our heap if the kernel can arrange that.
-		madvise(v, n, _MADV_HUGEPAGE)
+		// Partially undo the NOHUGEPAGE marks from sysUnused
+		// for whole huge pages between v and v+n. This may
+		// leave huge pages off at the end points v and v+n
+		// even though allocations may cover these entire huge
+		// pages. We could detect this and undo NOHUGEPAGE on
+		// the end points as well, but it's probably not worth
+		// the cost because when neighboring allocations are
+		// freed sysUnused will just set NOHUGEPAGE again.
+		var s uintptr = hugePageSize
+
+		// Round v up to a huge page boundary.
+		beg := (uintptr(v) + (s - 1)) &^ (s - 1)
+		// Round v+n down to a huge page boundary.
+		end := (uintptr(v) + n) &^ (s - 1)
+
+		if beg < end {
+			madvise(unsafe.Pointer(beg), end-beg, _MADV_HUGEPAGE)
+		}
 	}
 }
 
-- 
2.6.1

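For readers who want to poke at the alignment arithmetic in isolation,
here is a minimal standalone Go sketch (not part of the patch). It
mirrors the head/tail computation from sysUnused and the beg/end
rounding from sysUsed; the 2MB hugePageSize and the example region
[v, v+n) are assumptions chosen for illustration, and no madvise calls
are made.

package main

import "fmt"

// Assumed huge page size: 2MB, the common value on amd64. The real
// runtime learns this from the OS rather than hard-coding it.
const hugePageSize uintptr = 2 << 20

func main() {
	s := hugePageSize

	// A made-up released region [v, v+n) that is unaligned on
	// both ends.
	v := uintptr(3<<20 + 4096) // 3MB + 4KB
	n := uintptr(5 << 20)      // 5MB

	// sysUnused-style: find the huge pages containing v and
	// v+n-1, but only when those endpoints are unaligned.
	var head, tail uintptr
	if v%s != 0 {
		head = v &^ (s - 1) // round v down to a huge page boundary
	}
	if (v+n)%s != 0 {
		tail = (v + n - 1) &^ (s - 1) // huge page containing v+n-1
	}
	fmt.Printf("disable THP on head %#x and tail %#x\n", head, tail)

	// sysUsed-style: the whole huge pages covered by [v, v+n).
	beg := (v + (s - 1)) &^ (s - 1) // round v up
	end := (v + n) &^ (s - 1)       // round v+n down
	if beg < end {
		fmt.Printf("re-enable THP on [%#x, %#x)\n", beg, end)
	}
}

With these inputs the sketch prints head 0x200000, tail 0x800000, and
the re-enable range [0x400000, 0x800000): only the end huge pages get
MADV_NOHUGEPAGE, while the whole huge pages in the middle stay huge
page-enabled, matching the behavior described in the commit message.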