usr/src/lib/libc/i386_hwcap1/gen/memset.s
author Mark J. Nelson <Mark.J.Nelson@Sun.COM>
Wed, 06 Aug 2008 16:29:39 -0600
changeset 7298 b69e27387f74
parent 6812 febeba71273d
permissions -rw-r--r--
6733918 Teamware has retired, please welcome your new manager, Mercurial 4758439 some files use "current date" sccs keywords 6560843 asm sources should not rely on .file "%M%" for naming STT_FILE symbols 6560958 Solaris:: perl modules should not use SCCS keywords in version information 6729074 webrev doesn't deal well with remote ssh hg parents

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

	.file	"memset.s"

#include <sys/asm_linkage.h>

	ANSI_PRAGMA_WEAK(memset,function)

/*
 * memset - fill a region of memory with a byte value.
 *
 * C equivalent:  void *memset(void *s, int c, size_t n)
 *
 * Calling convention (as used by the code below): the three arguments
 * are read from the stack, and the original destination address is
 * returned in %eax.
 *
 * Strategy, selected by the byte count n:
 *
 *	n <= 20		byte stores only (rep sstob).
 *	21 .. 256	4-byte stores (rep sstol) plus up to 3 trailing
 *			byte stores; no alignment work is done.
 *	257 .. 511	if the destination is not 8-byte aligned, byte
 *			stores bring it to an 8-byte boundary; then as
 *			for 21 .. 256.
 *	n >= 512	string stores bring the destination to a 64-byte
 *			boundary, then 64-byte blocks are written from
 *			%xmm0.  If the count left after alignment is
 *			<= 262143 the blocks use movaps; otherwise they
 *			use movntps (non-temporal), followed by a fence.
 *			The remaining 0 .. 63 bytes are finished by the
 *			word/byte store paths.
 *
 * Registers:
 *	%edi, %ebx, %esi are saved on the stack and restored before return.
 *	%eax, %ecx, %edx, %xmm0 and the flags are modified.
 *	The direction flag is cleared (cld) and left clear.
 *
 * Build requirement: either _SSE2_INSN or _SSE_INSN must be defined;
 * otherwise preprocessing stops with #error.
 */
	ENTRY(memset)
	pushl	%edi		/ save register variable
	movl	8(%esp),%edi	/ %edi = destination address (1st argument)
	movl	12(%esp),%eax	/ %al = byte to duplicate (2nd argument)
	movl	16(%esp),%ecx	/ %ecx = number of bytes to set (3rd argument)

	/ For all basic blocks in this routine, maintain the following
	/ entry conditions:	%eax has every byte set to the desired byte.
	/			NOTE: .byteset only needs %al to be valid
	/			%ecx contains the number of bytes to set
	/			%edi contains the address to set

	cld			/ string stores must run toward higher addresses
	cmpl	$20,%ecx	/ 20 bytes or fewer: plain byte stores
	jbe	.byteset

	andl	$0xff, %eax	/ trim anything above low byte
	imul	$0x01010101, %eax	/ replicate low byte into all four bytes

	cmpl	$256, %ecx	/ 21..256 bytes: word stores, no alignment work
	jbe	.wordset

	cmpl	$511, %ecx	/ 257..511 bytes: 8-byte align, then word stores
	jbe	.check_wordset

	/
	/ prep work for sse temporal and non-temporal (512 bytes or more)
	/

	pushl	%ebx		/ more registers are needed
	pushl	%esi		/ for alignment work

	/
	/ align address to a 64 byte boundary:
	/ %esi = (64 - (%edi & 63)) & 63 = bytes needed to reach it
	/

	movl	%ecx, %ebx	/ %ebx = byte count (kept across the block loop)
	movl	%edi, %esi	/ %esi is scratch register
	andl	$63, %esi	/ offset of address within its 64 byte line
	neg	%esi		/ compute count of bytes
	addl	$64, %esi	/ needed to align
	andl	$63, %esi	/ (0 when already aligned)
	jz	.sse_aligned	/ skip alignment if not needed
	subl	%esi, %ebx	/ %ebx = bytes left after the alignment stores
	movl	%esi, %ecx	/ alignment bytes
	shrl	$2,%ecx		/ %ecx = number of words to set
	rep; sstol
	movl	%esi,%ecx
	andl	$3,%ecx		/ %ecx = number of bytes left
	rep; sstob
	movl	%ebx, %ecx	/ remainder to be set

.sse_aligned:

	shr	$6, %ecx	/ %ecx = number of 64 byte blocks to set

	/
	/ load xmm0 with bytes to be set
	/
	subl	$16,%esp	/ give ourselves some working room on the stack
	movl	%eax,(%esp)	/ store the fill pattern four times (16 bytes);
	movl	%eax,4(%esp)	/ avoid pushl since it causes more interlocking
	movl	%eax,8(%esp)	/
	movl	%eax,12(%esp)	/
	movups	(%esp), %xmm0	/ unaligned load from stack into xmm0
	addl	$16,%esp	/ restore stack position

	cmpl	$262143, %ebx	/ at most 262143 bytes left: use cached stores
	jbe	.sse_loop
	jmp	.sse_nt_loop	/ branch across alignment nops

	.align 16

.sse_nt_loop:
	movntps %xmm0, (%edi)	/ block non-temporal store
	movntps %xmm0, 16(%edi)	/ use sse rather than sse2
	movntps %xmm0, 32(%edi)	/ so we work more places
	movntps %xmm0, 48(%edi)	/

	addl	$64, %edi	/ increment dest address
	dec	%ecx		/ dec count of blocks
	jnz	.sse_nt_loop	/ jump if not done

	andl	$63, %ebx	/ bytes left over after the 64 byte blocks
	movl	%ebx, %ecx	/ %ecx contains remainder of bytes to set
	popl	%esi		/ restore stack config
	popl	%ebx		/
	/ fence so the non-temporal stores above are ordered ahead of
	/ the stores that follow
#if defined(_SSE2_INSN)
	mfence
#elif defined(_SSE_INSN)
	sfence
#else
#error "Must have either SSE or SSE2"
#endif
	cmpl	$20, %ecx	/ finish the tail: 20 or fewer -> bytes,
	jbe	.byteset	/ otherwise words
	jmp	.wordset

	.align 16
.sse_loop:
	movaps %xmm0, (%edi)	/ block store w/ SSE (address is 64 byte aligned)
	movaps %xmm0, 16(%edi)
	movaps %xmm0, 32(%edi)
	movaps %xmm0, 48(%edi)

	addl	$64, %edi	/ increment addr
	dec	%ecx		/ dec count of blocks
	jnz	.sse_loop	/ jump if not done

	andl	$63, %ebx	/ bytes left over after the 64 byte blocks
	movl	%ebx, %ecx	/ in %ecx as normal
	popl	%esi		/ restore stack config
	popl	%ebx		/
	cmpl	$20, %ecx	/ finish the tail: 20 or fewer -> bytes,
	jbe	.byteset	/ otherwise words
	jmp	.wordset

.check_wordset:
	testl	$7, %edi	/ is the address already 8 byte aligned?
	jz	.wordset	/ all ok

	/ not aligned: fall through and set the leading bytes first

.align_wordset:
	pushl	%ebx		/ more registers are needed
	pushl	%esi

	movl	%ecx, %ebx	/ %ebx = byte count
	movl	%edi, %esi
	andl	$7, %esi	/ offset of address within its 8 byte unit
	neg	%esi
	addl	$8, %esi
	andl	$7, %esi	/ %esi = bytes needed to reach 8 byte alignment
	subl	%esi, %ebx	/ %ebx contains remainder of bytes to set
	movl	%esi, %ecx
	rep; sstob		/ set the leading unaligned bytes
	movl	%ebx, %ecx	/ %ecx = bytes still to set
	popl	%esi		/ restore stack config
	popl	%ebx		/

.wordset:
	movl	%ecx, %edx	/ save count
	shrl	$2,%ecx		/ %ecx = number of words to set
	rep; sstol
	movl	%edx,%ecx
	andl	$3,%ecx		/ %ecx = number of bytes left

.byteset:
	rep; sstob
	movl	8(%esp),%eax	/ return the original destination address
	popl	%edi		/ restore register variable
	ret
	SET_SIZE(memset)