components/golang/patches/0028-release-branch.go1.5-runtime-adjust-the-ppc64x-memmo.patch
changeset 5331 9c955076ffe3
equal deleted inserted replaced
5330:c36e3195e3e9 5331:9c955076ffe3
       
     1 From 9f59bc85a2c9932caebf0b6c6a282a0e839d62af Mon Sep 17 00:00:00 2001
       
     2 From: Michael Hudson-Doyle <[email protected]>
       
     3 Date: Tue, 22 Sep 2015 22:35:52 +1200
       
     4 Subject: [PATCH 28/63] [release-branch.go1.5] runtime: adjust the ppc64x
       
     5  memmove and memclr to copy by word as much as it can
       
     6 MIME-Version: 1.0
       
     7 Content-Type: text/plain; charset=UTF-8
       
     8 Content-Transfer-Encoding: 8bit
       
     9 
       
    10 Issue #12552 can happen on ppc64 too, although much less frequently in my
       
    11 testing. I'm fairly sure this fixes it (2 out of 200 runs of oracle.test failed
       
    12 without this change and 0 of 200 failed with it). It's also a lot faster for
       
    13 large moves/clears:
       
    14 
       
    15 name           old speed      new speed       delta
       
    16 Memmove1-6      157MB/s ± 9%    144MB/s ± 0%    -8.20%         (p=0.004 n=10+9)
       
    17 Memmove2-6      281MB/s ± 1%    249MB/s ± 1%   -11.53%        (p=0.000 n=10+10)
       
    18 Memmove3-6      376MB/s ± 1%    328MB/s ± 1%   -12.64%        (p=0.000 n=10+10)
       
    19 Memmove4-6      475MB/s ± 4%    345MB/s ± 1%   -27.28%         (p=0.000 n=10+8)
       
    20 Memmove5-6      540MB/s ± 1%    393MB/s ± 0%   -27.21%        (p=0.000 n=10+10)
       
    21 Memmove6-6      609MB/s ± 0%    423MB/s ± 0%   -30.56%         (p=0.000 n=9+10)
       
    22 Memmove7-6      659MB/s ± 0%    468MB/s ± 0%   -28.99%         (p=0.000 n=8+10)
       
    23 Memmove8-6      705MB/s ± 0%   1295MB/s ± 1%   +83.73%          (p=0.000 n=9+9)
       
    24 Memmove9-6      740MB/s ± 1%   1241MB/s ± 1%   +67.61%         (p=0.000 n=10+8)
       
    25 Memmove10-6     780MB/s ± 0%   1162MB/s ± 1%   +48.95%         (p=0.000 n=10+9)
       
    26 Memmove11-6     811MB/s ± 0%   1180MB/s ± 0%   +45.58%          (p=0.000 n=8+9)
       
    27 Memmove12-6     820MB/s ± 1%   1073MB/s ± 1%   +30.83%         (p=0.000 n=10+9)
       
    28 Memmove13-6     849MB/s ± 0%   1068MB/s ± 1%   +25.87%        (p=0.000 n=10+10)
       
    29 Memmove14-6     877MB/s ± 0%    911MB/s ± 0%    +3.83%        (p=0.000 n=10+10)
       
    30 Memmove15-6     893MB/s ± 0%    922MB/s ± 0%    +3.25%         (p=0.000 n=10+9)
       
    31 Memmove16-6     897MB/s ± 1%   2418MB/s ± 1%  +169.67%         (p=0.000 n=10+9)
       
    32 Memmove32-6     908MB/s ± 0%   3927MB/s ± 2%  +332.64%         (p=0.000 n=10+8)
       
    33 Memmove64-6    1.11GB/s ± 0%   5.59GB/s ± 0%  +404.64%          (p=0.000 n=9+9)
       
    34 Memmove128-6   1.25GB/s ± 0%   6.71GB/s ± 2%  +437.49%         (p=0.000 n=9+10)
       
    35 Memmove256-6   1.33GB/s ± 0%   7.25GB/s ± 1%  +445.06%        (p=0.000 n=10+10)
       
    36 Memmove512-6   1.38GB/s ± 0%   8.87GB/s ± 0%  +544.43%        (p=0.000 n=10+10)
       
    37 Memmove1024-6  1.40GB/s ± 0%  10.00GB/s ± 0%  +613.80%        (p=0.000 n=10+10)
       
    38 Memmove2048-6  1.41GB/s ± 0%  10.65GB/s ± 0%  +652.95%         (p=0.000 n=9+10)
       
    39 Memmove4096-6  1.42GB/s ± 0%  11.01GB/s ± 0%  +675.37%         (p=0.000 n=8+10)
       
    40 Memclr5-6       269MB/s ± 1%    264MB/s ± 0%    -1.80%        (p=0.000 n=10+10)
       
    41 Memclr16-6      600MB/s ± 0%    887MB/s ± 1%   +47.83%        (p=0.000 n=10+10)
       
    42 Memclr64-6     1.06GB/s ± 0%   2.91GB/s ± 1%  +174.58%         (p=0.000 n=8+10)
       
    43 Memclr256-6    1.32GB/s ± 0%   6.58GB/s ± 0%  +399.86%         (p=0.000 n=9+10)
       
    44 Memclr4096-6   1.42GB/s ± 0%  10.90GB/s ± 0%  +668.03%         (p=0.000 n=8+10)
       
    45 Memclr65536-6  1.43GB/s ± 0%  11.37GB/s ± 0%  +697.83%          (p=0.000 n=9+8)
       
    46 GoMemclr5-6     359MB/s ± 0%    360MB/s ± 0%    +0.46%        (p=0.000 n=10+10)
       
    47 GoMemclr16-6    750MB/s ± 0%   1264MB/s ± 1%   +68.45%        (p=0.000 n=10+10)
       
    48 GoMemclr64-6   1.17GB/s ± 0%   3.78GB/s ± 1%  +223.58%         (p=0.000 n=10+9)
       
    49 GoMemclr256-6  1.35GB/s ± 0%   7.47GB/s ± 0%  +452.44%        (p=0.000 n=10+10)
       
    50 
       
    51 Update #12552
       
    52 
       
    53 Change-Id: I7192e9deb9684a843aed37f58a16a4e29970e893
       
    54 Reviewed-on: https://go-review.googlesource.com/14840
       
    55 Reviewed-by: Minux Ma <[email protected]>
       
    56 Reviewed-on: https://go-review.googlesource.com/16907
       
    57 Reviewed-by: Russ Cox <[email protected]>
       
    58 ---
       
    59  src/runtime/memclr_ppc64x.s  | 17 ++++++++--
       
    60  src/runtime/memmove_ppc64x.s | 78 +++++++++++++++++++++++++++++++++++---------
       
    61  2 files changed, 77 insertions(+), 18 deletions(-)
       
    62 
       
    63 diff --git a/src/runtime/memclr_ppc64x.s b/src/runtime/memclr_ppc64x.s
       
    64 index cea42cb..90e2748 100644
       
    65 --- a/src/runtime/memclr_ppc64x.s
       
    66 +++ b/src/runtime/memclr_ppc64x.s
       
    67 @@ -10,11 +10,22 @@
       
    68  TEXT runtime·memclr(SB),NOSPLIT,$0-16
       
    69  	MOVD	ptr+0(FP), R3
       
    70  	MOVD	n+8(FP), R4
       
    71 -	CMP	R4, $0
       
    72 +	SRADCC	$3, R4, R6	// R6 is the number of words to zero
       
    73 +	BEQ	bytes
       
    74 +
       
    75 +	SUB	$8, R3
       
    76 +	MOVD	R6, CTR
       
    77 +	MOVDU	R0, 8(R3)
       
    78 +	BC	25, 0, -1(PC)	// bdnz+ $-4
       
    79 +	ADD	$8, R3
       
    80 +
       
    81 +bytes:
       
    82 +	ANDCC	$7, R4, R7	// R7 is the number of bytes to zero
       
    83  	BEQ	done
       
    84  	SUB	$1, R3
       
    85 -	MOVD	R4, CTR
       
    86 +	MOVD	R7, CTR
       
    87  	MOVBU	R0, 1(R3)
       
    88 -	BC	25, 0, -1(PC) // bdnz+ $-4
       
    89 +	BC	25, 0, -1(PC)	// bdnz+ $-4
       
    90 +
       
    91  done:
       
    92  	RET
       
    93 diff --git a/src/runtime/memmove_ppc64x.s b/src/runtime/memmove_ppc64x.s
       
    94 index 3ada63e..72c90de 100644
       
    95 --- a/src/runtime/memmove_ppc64x.s
       
    96 +++ b/src/runtime/memmove_ppc64x.s
       
    97 @@ -16,25 +16,73 @@ TEXT runtime·memmove(SB), NOSPLIT, $-8-24
       
    98  	RET
       
    99  
       
   100  check:
       
   101 -	CMP	R3, R4
       
   102 -	BGT	backward
       
   103 +	ANDCC	$7, R5, R7	// R7 is the number of bytes to copy and CR0[EQ] is set if there are none.
       
   104 +	SRAD	$3, R5, R6	// R6 is the number of words to copy
       
   105 +	CMP	R6, $0, CR1	// CR1[EQ] is set if there are no words to copy.
       
   106  
       
   107 +	CMP	R3, R4, CR2
       
   108 +	BC	12, 9, backward	// I think you should be able to write this as "BGT CR2, backward"
       
   109 +
       
   110 +	// Copying forward proceeds by copying R6 words then copying R7 bytes.
       
    111 +	// R3 and R4 are advanced as we copy. Because PPC64 lacks post-increment
       
   112 +	// load/store, R3 and R4 point before the bytes that are to be copied.
       
   113 +
       
   114 +	BC	12, 6, noforwardlarge	// "BEQ CR1, noforwardlarge"
       
   115 +
       
   116 +	MOVD	R6, CTR
       
   117 +
       
   118 +	SUB	$8, R3
       
   119 +	SUB	$8, R4
       
   120 +
       
   121 +forwardlargeloop:
       
   122 +	MOVDU	8(R4), R8
       
   123 +	MOVDU	R8, 8(R3)
       
   124 +	BC	16, 0, forwardlargeloop // "BDNZ"
       
   125 +
       
   126 +	ADD	$8, R3
       
   127 +	ADD	$8, R4
       
   128 +
       
   129 +noforwardlarge:
       
   130 +	BNE	forwardtail	// Tests the bit set by ANDCC above
       
   131 +	RET
       
   132 +
       
   133 +forwardtail:
       
   134  	SUB	$1, R3
       
   135 -	ADD	R3, R5
       
   136  	SUB	$1, R4
       
   137 -loop:
       
   138 -	MOVBU	1(R4), R6
       
   139 -	MOVBU	R6, 1(R3)
       
   140 -	CMP	R3, R5
       
   141 -	BNE	loop
       
   142 +	MOVD	R7, CTR
       
   143 +
       
   144 +forwardtailloop:
       
   145 +	MOVBZU	1(R4), R8
       
   146 +	MOVBZU	R8, 1(R3)
       
   147 +	BC	16, 0, forwardtailloop
       
   148  	RET
       
   149  
       
   150  backward:
       
   151 -	ADD	R5, R4
       
   152 -	ADD	R3, R5
       
   153 -loop1:
       
   154 -	MOVBU	-1(R4), R6
       
   155 -	MOVBU	R6, -1(R5)
       
   156 -	CMP	R3, R5
       
   157 -	BNE	loop1
       
   158 +	// Copying backwards proceeds by copying R7 bytes then copying R6 words.
       
   159 +	// R3 and R4 are advanced to the end of the destination/source buffers
       
   160 +	// respectively and moved back as we copy.
       
   161 +
       
   162 +	ADD	R5, R4, R4
       
   163 +	ADD	R3, R5, R3
       
   164 +
       
   165 +	BEQ	nobackwardtail
       
   166 +
       
   167 +	MOVD	R7, CTR
       
   168 +
       
   169 +backwardtailloop:
       
   170 +	MOVBZU	-1(R4), R8
       
   171 +	MOVBZU	R8, -1(R3)
       
   172 +	BC	16, 0, backwardtailloop
       
   173 +
       
   174 +nobackwardtail:
       
   175 +	BC	4, 6, backwardlarge		// "BNE CR1"
       
   176 +	RET
       
   177 +
       
   178 +backwardlarge:
       
   179 +	MOVD	R6, CTR
       
   180 +
       
   181 +backwardlargeloop:
       
   182 +	MOVDU	-8(R4), R8
       
   183 +	MOVDU	R8, -8(R3)
       
   184 +	BC	16, 0, backwardlargeloop	// "BDNZ"
       
   185  	RET
       
   186 -- 
       
   187 2.6.1
       
   188