Name: Anonymous 2012-06-30 22:25
It does perform slower than the Windows and Mac OS X versions.
bits 64
global intel_memcpy_aligned64
;-----------------------------------------------------------------------
; void intel_memcpy_aligned64(void *dst, const void *src, unsigned long size)
;
; Copies `size` blocks of 64 bytes from src to dst, front to back, using
; non-temporal (streaming) stores.
;
; ABI:   System V AMD64
; In:    rdi = dst   MUST be 16-byte aligned: movntdq faults (#GP) on an
;                    unaligned destination.
;        rsi = src   Any alignment is accepted (loads use movdqu);
;                    16-byte alignment gives the best performance.
;        rdx = size  Count of 64-byte blocks, NOT bytes. 0 is a no-op.
; Out:   nothing
; Clobb: rdx, rsi, rdi, xmm0-xmm3, flags
;
; Notes:
; - The copy runs forward one block at a time; overlapping regions with
;   dst above src are not handled.
; - movdqu is used for the loads even on aligned data: per the original
;   author it is actually faster than movdqa there, at least on
;   Sandy Bridge.
; - An sfence is issued after the last block so the weakly-ordered
;   streaming stores are ordered before anything the caller does next
;   (e.g. publishing the buffer to another thread).
;-----------------------------------------------------------------------
MEMCPY64_BLOCK_BYTES    equ 40h         ; bytes copied per loop iteration
MEMCPY64_PREFETCH_AHEAD equ 1C0h        ; prefetch distance from current block

intel_memcpy_aligned64:
        test    rdx, rdx                ; nothing to do for a zero block count
        jz      .done

.copy_block:                            ; invariant: rdx = blocks remaining, > 0
        prefetchnta [rsi + MEMCPY64_PREFETCH_AHEAD] ; hint only; never faults
        movdqu  xmm0, [rsi]
        movdqu  xmm1, [rsi + 10h]
        movntdq [rdi], xmm0
        movntdq [rdi + 10h], xmm1
        movdqu  xmm2, [rsi + 20h]
        movdqu  xmm3, [rsi + 30h]
        movntdq [rdi + 20h], xmm2
        movntdq [rdi + 30h], xmm3
        add     rsi, MEMCPY64_BLOCK_BYTES
        add     rdi, MEMCPY64_BLOCK_BYTES
        dec     rdx                     ; unsigned countdown; no signed compare
        jnz     .copy_block

        sfence                          ; make the streaming stores globally ordered
.done:
        ret