|
1 From 9f59bc85a2c9932caebf0b6c6a282a0e839d62af Mon Sep 17 00:00:00 2001 |
|
2 From: Michael Hudson-Doyle <[email protected]> |
|
3 Date: Tue, 22 Sep 2015 22:35:52 +1200 |
|
4 Subject: [PATCH 28/63] [release-branch.go1.5] runtime: adjust the ppc64x |
|
5 memmove and memclr to copy by word as much as it can |
|
6 MIME-Version: 1.0 |
|
7 Content-Type: text/plain; charset=UTF-8 |
|
8 Content-Transfer-Encoding: 8bit |
|
9 |
|
10 Issue #12552 can happen on ppc64 too, although much less frequently in my |
|
11 testing. I'm fairly sure this fixes it (2 out of 200 runs of oracle.test failed |
|
12 without this change and 0 of 200 failed with it). It's also a lot faster for |
|
13 large moves/clears: |
|
14 |
|
15 name old speed new speed delta |
|
16 Memmove1-6 157MB/s ± 9% 144MB/s ± 0% -8.20% (p=0.004 n=10+9) |
|
17 Memmove2-6 281MB/s ± 1% 249MB/s ± 1% -11.53% (p=0.000 n=10+10) |
|
18 Memmove3-6 376MB/s ± 1% 328MB/s ± 1% -12.64% (p=0.000 n=10+10) |
|
19 Memmove4-6 475MB/s ± 4% 345MB/s ± 1% -27.28% (p=0.000 n=10+8) |
|
20 Memmove5-6 540MB/s ± 1% 393MB/s ± 0% -27.21% (p=0.000 n=10+10) |
|
21 Memmove6-6 609MB/s ± 0% 423MB/s ± 0% -30.56% (p=0.000 n=9+10) |
|
22 Memmove7-6 659MB/s ± 0% 468MB/s ± 0% -28.99% (p=0.000 n=8+10) |
|
23 Memmove8-6 705MB/s ± 0% 1295MB/s ± 1% +83.73% (p=0.000 n=9+9) |
|
24 Memmove9-6 740MB/s ± 1% 1241MB/s ± 1% +67.61% (p=0.000 n=10+8) |
|
25 Memmove10-6 780MB/s ± 0% 1162MB/s ± 1% +48.95% (p=0.000 n=10+9) |
|
26 Memmove11-6 811MB/s ± 0% 1180MB/s ± 0% +45.58% (p=0.000 n=8+9) |
|
27 Memmove12-6 820MB/s ± 1% 1073MB/s ± 1% +30.83% (p=0.000 n=10+9) |
|
28 Memmove13-6 849MB/s ± 0% 1068MB/s ± 1% +25.87% (p=0.000 n=10+10) |
|
29 Memmove14-6 877MB/s ± 0% 911MB/s ± 0% +3.83% (p=0.000 n=10+10) |
|
30 Memmove15-6 893MB/s ± 0% 922MB/s ± 0% +3.25% (p=0.000 n=10+9) |
|
31 Memmove16-6 897MB/s ± 1% 2418MB/s ± 1% +169.67% (p=0.000 n=10+9) |
|
32 Memmove32-6 908MB/s ± 0% 3927MB/s ± 2% +332.64% (p=0.000 n=10+8) |
|
33 Memmove64-6 1.11GB/s ± 0% 5.59GB/s ± 0% +404.64% (p=0.000 n=9+9) |
|
34 Memmove128-6 1.25GB/s ± 0% 6.71GB/s ± 2% +437.49% (p=0.000 n=9+10) |
|
35 Memmove256-6 1.33GB/s ± 0% 7.25GB/s ± 1% +445.06% (p=0.000 n=10+10) |
|
36 Memmove512-6 1.38GB/s ± 0% 8.87GB/s ± 0% +544.43% (p=0.000 n=10+10) |
|
37 Memmove1024-6 1.40GB/s ± 0% 10.00GB/s ± 0% +613.80% (p=0.000 n=10+10) |
|
38 Memmove2048-6 1.41GB/s ± 0% 10.65GB/s ± 0% +652.95% (p=0.000 n=9+10) |
|
39 Memmove4096-6 1.42GB/s ± 0% 11.01GB/s ± 0% +675.37% (p=0.000 n=8+10) |
|
40 Memclr5-6 269MB/s ± 1% 264MB/s ± 0% -1.80% (p=0.000 n=10+10) |
|
41 Memclr16-6 600MB/s ± 0% 887MB/s ± 1% +47.83% (p=0.000 n=10+10) |
|
42 Memclr64-6 1.06GB/s ± 0% 2.91GB/s ± 1% +174.58% (p=0.000 n=8+10) |
|
43 Memclr256-6 1.32GB/s ± 0% 6.58GB/s ± 0% +399.86% (p=0.000 n=9+10) |
|
44 Memclr4096-6 1.42GB/s ± 0% 10.90GB/s ± 0% +668.03% (p=0.000 n=8+10) |
|
45 Memclr65536-6 1.43GB/s ± 0% 11.37GB/s ± 0% +697.83% (p=0.000 n=9+8) |
|
46 GoMemclr5-6 359MB/s ± 0% 360MB/s ± 0% +0.46% (p=0.000 n=10+10) |
|
47 GoMemclr16-6 750MB/s ± 0% 1264MB/s ± 1% +68.45% (p=0.000 n=10+10) |
|
48 GoMemclr64-6 1.17GB/s ± 0% 3.78GB/s ± 1% +223.58% (p=0.000 n=10+9) |
|
49 GoMemclr256-6 1.35GB/s ± 0% 7.47GB/s ± 0% +452.44% (p=0.000 n=10+10) |
|
50 |
|
51 Update #12552 |
|
52 |
|
53 Change-Id: I7192e9deb9684a843aed37f58a16a4e29970e893 |
|
54 Reviewed-on: https://go-review.googlesource.com/14840 |
|
55 Reviewed-by: Minux Ma <[email protected]> |
|
56 Reviewed-on: https://go-review.googlesource.com/16907 |
|
57 Reviewed-by: Russ Cox <[email protected]> |
|
58 --- |
|
59 src/runtime/memclr_ppc64x.s | 17 ++++++++-- |
|
60 src/runtime/memmove_ppc64x.s | 78 +++++++++++++++++++++++++++++++++++--------- |
|
61 2 files changed, 77 insertions(+), 18 deletions(-) |
|
62 |
|
63 diff --git a/src/runtime/memclr_ppc64x.s b/src/runtime/memclr_ppc64x.s |
|
64 index cea42cb..90e2748 100644 |
|
65 --- a/src/runtime/memclr_ppc64x.s |
|
66 +++ b/src/runtime/memclr_ppc64x.s |
|
67 @@ -10,11 +10,22 @@ |
|
68 TEXT runtime·memclr(SB),NOSPLIT,$0-16 |
|
69 MOVD ptr+0(FP), R3 |
|
70 MOVD n+8(FP), R4 |
|
71 - CMP R4, $0 |
|
72 + SRADCC $3, R4, R6 // R6 is the number of words to zero |
|
73 + BEQ bytes |
|
74 + |
|
75 + SUB $8, R3 |
|
76 + MOVD R6, CTR |
|
77 + MOVDU R0, 8(R3) |
|
78 + BC 25, 0, -1(PC) // bdnz+ $-4 |
|
79 + ADD $8, R3 |
|
80 + |
|
81 +bytes: |
|
82 + ANDCC $7, R4, R7 // R7 is the number of bytes to zero |
|
83 BEQ done |
|
84 SUB $1, R3 |
|
85 - MOVD R4, CTR |
|
86 + MOVD R7, CTR |
|
87 MOVBU R0, 1(R3) |
|
88 - BC 25, 0, -1(PC) // bdnz+ $-4 |
|
89 + BC 25, 0, -1(PC) // bdnz+ $-4 |
|
90 + |
|
91 done: |
|
92 RET |
|
93 diff --git a/src/runtime/memmove_ppc64x.s b/src/runtime/memmove_ppc64x.s |
|
94 index 3ada63e..72c90de 100644 |
|
95 --- a/src/runtime/memmove_ppc64x.s |
|
96 +++ b/src/runtime/memmove_ppc64x.s |
|
97 @@ -16,25 +16,73 @@ TEXT runtime·memmove(SB), NOSPLIT, $-8-24 |
|
98 RET |
|
99 |
|
100 check: |
|
101 - CMP R3, R4 |
|
102 - BGT backward |
|
103 + ANDCC $7, R5, R7 // R7 is the number of bytes to copy and CR0[EQ] is set if there are none. |
|
104 + SRAD $3, R5, R6 // R6 is the number of words to copy |
|
105 + CMP R6, $0, CR1 // CR1[EQ] is set if there are no words to copy. |
|
106 |
|
107 + CMP R3, R4, CR2 |
|
108 + BC 12, 9, backward // I think you should be able to write this as "BGT CR2, backward" |
|
109 + |
|
110 + // Copying forward proceeds by copying R6 words then copying R7 bytes. |
|
111 + // R3 and R4 are advanced as we copy. Because PPC64 lacks post-increment |
|
112 + // load/store, R3 and R4 point before the bytes that are to be copied. |
|
113 + |
|
114 + BC 12, 6, noforwardlarge // "BEQ CR1, noforwardlarge" |
|
115 + |
|
116 + MOVD R6, CTR |
|
117 + |
|
118 + SUB $8, R3 |
|
119 + SUB $8, R4 |
|
120 + |
|
121 +forwardlargeloop: |
|
122 + MOVDU 8(R4), R8 |
|
123 + MOVDU R8, 8(R3) |
|
124 + BC 16, 0, forwardlargeloop // "BDNZ" |
|
125 + |
|
126 + ADD $8, R3 |
|
127 + ADD $8, R4 |
|
128 + |
|
129 +noforwardlarge: |
|
130 + BNE forwardtail // Tests the bit set by ANDCC above |
|
131 + RET |
|
132 + |
|
133 +forwardtail: |
|
134 SUB $1, R3 |
|
135 - ADD R3, R5 |
|
136 SUB $1, R4 |
|
137 -loop: |
|
138 - MOVBU 1(R4), R6 |
|
139 - MOVBU R6, 1(R3) |
|
140 - CMP R3, R5 |
|
141 - BNE loop |
|
142 + MOVD R7, CTR |
|
143 + |
|
144 +forwardtailloop: |
|
145 + MOVBZU 1(R4), R8 |
|
146 + MOVBZU R8, 1(R3) |
|
147 + BC 16, 0, forwardtailloop |
|
148 RET |
|
149 |
|
150 backward: |
|
151 - ADD R5, R4 |
|
152 - ADD R3, R5 |
|
153 -loop1: |
|
154 - MOVBU -1(R4), R6 |
|
155 - MOVBU R6, -1(R5) |
|
156 - CMP R3, R5 |
|
157 - BNE loop1 |
|
158 + // Copying backwards proceeds by copying R7 bytes then copying R6 words. |
|
159 + // R3 and R4 are advanced to the end of the destination/source buffers |
|
160 + // respectively and moved back as we copy. |
|
161 + |
|
162 + ADD R5, R4, R4 |
|
163 + ADD R3, R5, R3 |
|
164 + |
|
165 + BEQ nobackwardtail |
|
166 + |
|
167 + MOVD R7, CTR |
|
168 + |
|
169 +backwardtailloop: |
|
170 + MOVBZU -1(R4), R8 |
|
171 + MOVBZU R8, -1(R3) |
|
172 + BC 16, 0, backwardtailloop |
|
173 + |
|
174 +nobackwardtail: |
|
175 + BC 4, 6, backwardlarge // "BNE CR1" |
|
176 + RET |
|
177 + |
|
178 +backwardlarge: |
|
179 + MOVD R6, CTR |
|
180 + |
|
181 +backwardlargeloop: |
|
182 + MOVDU -8(R4), R8 |
|
183 + MOVDU R8, -8(R3) |
|
184 + BC 16, 0, backwardlargeloop // "BDNZ" |
|
185 RET |
|
186 -- |
|
187 2.6.1 |
|
188 |