/*
 * Copyright (c) 2006, Advanced Micro Devices, Inc.
 * All rights reserved.
 * 
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions are 
 * met:
 * 
 *    * Redistributions of source code must retain the above copyright 
 *      notice, this list of conditions and the following disclaimer.
 *    * Neither the name of Advanced Micro Devices, Inc. nor the names
 *      of its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written
 *      permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sysdep.h>
#include "asm-syntax.h"
#include "bp-sym.h"
#include "bp-asm.h"

# Author: John M. Zulauf, Advanced Micro Devices, Inc.
# This is an optimization of lx_memcpy_movq with the following improvements:
#	the preamble was optimized by permuting all possible combinations
#	a "short buffer" escape (copy_bytes) was added

.section .text
	.p2align 2,,3
#if defined PIC && !defined NOT_IN_libc
/*
 * __memcpy_chk: checked entry point placed directly in front of memcpy.
 *
 * Stack arguments at entry (return address at 0(%esp)):
 *   12(%esp)  third argument  - the byte count (memcpy reads this same
 *             slot as its length)
 *   16(%esp)  fourth argument - presumably the size of the destination
 *             object, per the __memcpy_chk convention; confirm with caller
 *
 * If the fourth argument is below the third (unsigned compare), control
 * is transferred to __chk_fail.  Otherwise there is no ret here: execution
 * continues into the memcpy entry that follows, with the stack untouched.
 * Only %eax and the flags are modified.
 */
ENTRY (__memcpy_chk)
	/* %eax = requested byte count.  */
	movl	12(%esp), %eax
	/* Flags from (fourth argument - count).  */
	cmpl	%eax, 16(%esp)
	/* Unsigned "below": the fourth argument is smaller than the count.  */
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__memcpy_chk)
#endif
/*
 * void *memcpy (void *dst, const void *src, size_t len)
 *
 * Stack arguments at entry (return address at 0(%esp)):
 *   4(%esp) = dst,  8(%esp) = src,  12(%esp) = len
 * Returns dst in %eax.
 *
 * Register roles:
 *   %edi        destination cursor (caller's value is parked in %edx and
 *               restored before returning)
 *   %esi        source cursor (caller's value is saved on the stack)
 *   %ecx        byte count, then the rep counter
 *   %eax        number of 64-byte blocks, then scratch for the low count bits
 *   %mm0-%mm7   scratch in the block loop; emms is executed after the loop
 *
 * Strategy:
 *   len <= 15   a single rep movsb
 *   len <  64   rep movsl for the dwords, rep movsb for the last 0-3 bytes
 *   otherwise   64 bytes per iteration through the MMX registers, then the
 *               remaining (len & 63) bytes as in the previous case
 *
 * All internal branch targets use the .L prefix so that they stay
 * assembler-local and do not show up as extra symbols inside memcpy.
 *
 * NOTE(review): %esi is pushed and %edi is parked in %edx without any
 * unwind annotations in this block.  If ENTRY opens a CFI region, backtraces
 * taken from inside the copy will be wrong; confirm against sysdep.h.
 */
ENTRY (BP_SYM (memcpy))
	/* Function header (instruction order was chosen by a permutation
	   search).  */
	movl	%edi, %edx		/* park caller's %edi in %edx */
	movl	0x04(%esp), %edi	/* %edi = dst */
	cld				/* make the rep movs below ascend */
	pushl	%esi			/* save caller's %esi; stack args move up by 4 */
	movl	0x10(%esp), %ecx	/* %ecx = len */
	movl	0x0c(%esp), %esi	/* %esi = src */

	/* %edi, %esi and %ecx are now set up for a rep movs.
	   Short buffers (15 bytes or fewer) are copied bytewise.  */
	cmp	$15, %ecx
	jbe	.Lcopy_bytes

	/* Original author's note ("SI says"): do NOT prefetch (%esi) itself;
	   it would never arrive in time and would only occupy a slot.  */
	prefetch 32(%esi)

	/* %eax = number of whole 64-byte blocks.  With fewer than 64 bytes
	   only the simple copy runs.  */
	movl	%ecx, %eax
	shrl	$6, %eax
	jz	.Lsimple_copy		/* no whole block to copy */

	prefetch	64(%esi)
	.p2align 2,,3			/* dword-align the loop */
.Ltwo_cacheline_loop:
	/* Original author's note ("SI says"): only three prefetch slots are
	   available, and one is already taken above.  */

	/* First 32 bytes: four quadword loads, then four quadword stores.  */
	movq	0x00(%esi), %mm0
	prefetch 0x60(%esi)
	movq	0x08(%esi), %mm1
	movq	0x10(%esi), %mm2
	movq	0x18(%esi), %mm3

	movq	%mm0, 0x00(%edi)
	movq	%mm1, 0x08(%edi)
	movq	%mm2, 0x10(%edi)
	movq	%mm3, 0x18(%edi)

	/* Second 32 bytes.  */
	movq	0x20(%esi), %mm4
	prefetch 0x80(%esi)
	movq	0x28(%esi), %mm5
	movq	0x30(%esi), %mm6
	movq	0x38(%esi), %mm7

	movq	%mm4, 0x20(%edi)
	movq	%mm5, 0x28(%edi)
	movq	%mm6, 0x30(%edi)
	movq	%mm7, 0x38(%edi)

	/* Advance both cursors past the 64 bytes just copied.  */
	addl	$0x40, %esi
	addl	$0x40, %edi

	decl	%eax
	jne	.Ltwo_cacheline_loop
	emms				/* leave MMX state before returning */

	/* %ecx must hold the residual length (len & 63) for the simple copy.  */
	andl	$0x3F, %ecx
	jz	.Lcommon_return		/* nothing left to copy */

.Lsimple_copy:
	/* Move the dwords, then the trailing bytes.
	   Assumes %ecx, %edi and %esi are ready for a rep movs.  */
	movb	%cl, %al		/* stash the low count bits; bits 0-1 are needed below */
	shrl	$2, %ecx		/* %ecx = number of whole dwords */
	rep
	movsl
	movb	%al, %cl		/* %ecx is 0 after rep movsl, so only %cl needs reloading */
	andb	$3, %cl			/* %ecx = 0..3 trailing bytes */
.Lcopy_bytes:
	rep
	movsb

.Lcommon_return:
	popl	%esi			/* restore caller's %esi */
	movl	%edx, %edi		/* restore caller's %edi */
	movl	0x4(%esp), %eax		/* return value = dst */
	ret
END (BP_SYM (memcpy))
libc_hidden_builtin_def (memcpy)
