global intel_memcpy_aligned64
;-----------------------------------------------------------------------
; void intel_memcpy_aligned64(void *dst, const void *src, unsigned long size)
; ABI:   SysV AMD64  (rdi = dst, rsi = src, rdx = size)
; size is given in 64-byte units; size == 0 copies nothing.
; dst and src must be aligned on 16-byte boundaries for maximum performance.
; movdqu is actually faster on aligned data than its aligned counterpart
; movdqa, at least on Sandy Bridge.
; Uses non-temporal stores (movntdq), so it is best for big transfers to
; memory that will not be read back soon (e.g. GPU upload buffers).
; Clobbers: rsi, rdi, rdx, xmm0-xmm3, flags (all caller-saved under SysV).
; Fixes vs. original: zero-size guard (old code always copied the first
; 64-byte chunk), and an sfence before returning — NT stores are weakly
; ordered and must be fenced before the caller may rely on the data.
;-----------------------------------------------------------------------
intel_memcpy_aligned64:
        test    rdx, rdx                ; size == 0 ?
        jz      .done                   ; yes: nothing to copy
.loop:
        lea     rsi, [rsi+40h]          ; src += 64 (lea preserves flags)
        lea     rdi, [rdi+40h]          ; dst += 64
        dec     rdx                     ; one 64-byte chunk consumed
        prefetchnta [rsi+180h]          ; prefetch ~6 chunks ahead, bypass cache
        movdqu  xmm0, [rsi-40h]
        movdqu  xmm1, [rsi-30h]
        cmp     rdx, 1                  ; set flags early; the stores below
                                        ; do not touch flags
        movntdq [rdi-40h], xmm0         ; non-temporal: don't pollute cache
        movntdq [rdi-30h], xmm1
        movdqu  xmm2, [rsi-20h]
        movdqu  xmm3, [rsi-10h]
        movntdq [rdi-20h], xmm2
        movntdq [rdi-10h], xmm3
        jge     .loop                   ; loop while rdx >= 1 (signed cmp;
                                        ; fine for any realistic size)
.done:
        sfence                          ; drain the NT write-combining buffers
        ret
>>11
It's the fastest approach for big transfers to memory that is unlikely to be cached. Perfect for submitting vertex data to the GPU, for instance.
Name:
Anonymous2012-07-01 13:56
Are add instructions multithreaded? Please help me, /prog/.
Name:
Anonymous2012-07-01 14:39
>>13
Yes. The CPU schedules them in parallel where possible.
Name:
Anonymous2012-07-01 15:19
>>6
Your problem is that you were probably using pthread condition variables improperly — they don't work like Windows semaphores/events. memcpy isn't single-threaded on Linux.
Name:
Anonymous2012-07-01 15:20
>>6
Post your Linux code now, and we'll tell you where you fucked up.