Return Styles: Pseud0ch, Terminal, Valhalla, NES, Geocities, Blue Moon. Entire thread

Is linux memcpy multithreaded?

Name: Anonymous 2012-06-30 22:25

it does perform slower than windows and macosx versions..

Name: Anonymous 2012-06-30 23:07

>>1
memcpy is never multithreaded. You don't understand multi-threading.

Are you on a Sandy Bridge or Ivy Bridge CPU?

Name: Anonymous 2012-07-01 0:44

>>2
Performance (10000x 4MB block memcpy):

 1 thread :  1826 MB/sec
 2 threads:  3118 MB/sec
 3 threads:  4121 MB/sec
 4 threads: 10020 MB/sec
 5 threads: 12848 MB/sec
 6 threads: 14340 MB/sec
 8 threads: 17892 MB/sec
10 threads: 21781 MB/sec
12 threads: 25721 MB/sec
14 threads: 25318 MB/sec
16 threads: 19965 MB/sec
24 threads: 13158 MB/sec
32 threads: 12497 MB/sec

Name: Anonymous 2012-07-01 1:11

>>3
Show your code and we'll tell you why you're full of shit.

Name: Anonymous 2012-07-01 1:46

jesus fucking christ we have amazing retards on this board

Name: Anonymous 2012-07-01 2:05

>>4

/* Number of worker threads; every per-worker array below has this many slots. */
#define NUM_CPY_THREADS 4

/*
 * One copy job per worker.
 * mt_memcpy() fills in src/dest/size before signalling a worker;
 * thread_copy_proc() reads them after being signalled.
 */
typedef struct
{
    int ct;         /* worker slot; indexes the semaphore arrays */
    void *src;      /* first byte of this worker's source slice */
    void *dest;     /* first byte of this worker's destination slice */
    size_t size;    /* number of bytes in the slice */
} mt_cpy_t;

/* Worker thread handles, one per slot. */
HANDLE hCopyThreads[NUM_CPY_THREADS] = { 0 };

/* Released by mt_memcpy() to tell worker i that its job is ready. */
HANDLE hCopyStartSemaphores[NUM_CPY_THREADS] = { 0 };

/* Released by worker i once its slice has been copied. */
HANDLE hCopyStopSemaphores[NUM_CPY_THREADS] = { 0 };

/* Job descriptors; slot i belongs to worker i. */
mt_cpy_t mtParamters[NUM_CPY_THREADS] = { 0 };

/*
 * Worker thread entry point.
 *
 * param points at this worker's mt_cpy_t slot. The worker blocks on its
 * start semaphore, copies the slice described by the slot, then releases
 * its stop semaphore so the coordinating thread knows the slice is done.
 *
 * Returns 1 if waiting on the start semaphore fails; otherwise the loop
 * never exits on its own.
 */
DWORD WINAPI thread_copy_proc(LPVOID param)
{
    mt_cpy_t * job = (mt_cpy_t *) param;

    for(;;)
    {
        /* A failed or abandoned wait means no job was handed over. Looping
           again would spin and re-run memcpy on stale parameters, so stop. */
        if(WaitForSingleObject(hCopyStartSemaphores[job->ct], INFINITE) != WAIT_OBJECT_0)
        {
            break;
        }

        memcpy(job->dest, job->src, job->size);

        /* Report completion of this slice to the coordinator. */
        ReleaseSemaphore(hCopyStopSemaphores[job->ct], 1, NULL);
    }

    return 1;
}

/*
 * Create the start/stop semaphores and the worker thread for every slot.
 *
 * Returns 0 when the whole pool was created. If any semaphore or thread
 * cannot be created, everything created so far is torn down again, the
 * affected handles are reset to NULL, and -1 is returned.
 */
int startCopyThreads()
{
    for(int ctr = 0; ctr < NUM_CPY_THREADS; ctr++)
    {
        mtParamters[ctr].ct = ctr;
        hCopyThreads[ctr] = NULL;

        /* Initial count 0: workers block until a job is posted, and the
           coordinator blocks until a worker reports completion. */
        hCopyStartSemaphores[ctr] = CreateSemaphore(NULL, 0, 1, NULL);
        hCopyStopSemaphores[ctr] = CreateSemaphore(NULL, 0, 1, NULL);

        /* Only start the worker once both of its semaphores exist. */
        if(hCopyStartSemaphores[ctr] != NULL && hCopyStopSemaphores[ctr] != NULL)
        {
            hCopyThreads[ctr] = CreateThread(NULL, 0, thread_copy_proc, &mtParamters[ctr], 0, NULL);
        }

        if(hCopyThreads[ctr] == NULL)
        {
            /* Roll back slots 0..ctr so a failed start leaves nothing behind.
               Workers created so far are still parked on their start
               semaphore, so no copy is interrupted by terminating them. */
            for(int undo = ctr; undo >= 0; undo--)
            {
                if(hCopyThreads[undo] != NULL)
                {
                    TerminateThread(hCopyThreads[undo], 0);
                    CloseHandle(hCopyThreads[undo]);
                    hCopyThreads[undo] = NULL;
                }
                if(hCopyStartSemaphores[undo] != NULL)
                {
                    CloseHandle(hCopyStartSemaphores[undo]);
                    hCopyStartSemaphores[undo] = NULL;
                }
                if(hCopyStopSemaphores[undo] != NULL)
                {
                    CloseHandle(hCopyStopSemaphores[undo]);
                    hCopyStopSemaphores[undo] = NULL;
                }
            }
            return -1;
        }
    }

    return 0;
}

/*
 * Copy `bytes` bytes from src to dest, split into NUM_CPY_THREADS
 * contiguous slices that are handed to the worker threads.
 *
 * The regions must not overlap (each slice is copied with memcpy).
 * The job table is a single shared global, so only one call may be in
 * progress at a time.
 *
 * If a worker cannot be signalled, its slice is copied on the calling
 * thread instead, so the copy is not silently skipped.
 *
 * Returns dest.
 */
void * mt_memcpy(void * dest, void * src, size_t bytes)
{
    /* Slice boundary i is floor(i * bytes / N). With bytes = quot * N + rem
       this equals i * quot + (i * rem) / N, which never overflows size_t,
       whereas the direct product i * bytes can for very large copies. */
    const size_t quot = bytes / NUM_CPY_THREADS;
    const size_t rem = bytes % NUM_CPY_THREADS;
    int dispatched[NUM_CPY_THREADS];

    //set up parameters
    for(int ctr = 0; ctr < NUM_CPY_THREADS; ctr++)
    {
        const size_t idx = (size_t) ctr;
        const size_t begin = idx * quot + idx * rem / NUM_CPY_THREADS;
        const size_t end = (idx + 1) * quot + (idx + 1) * rem / NUM_CPY_THREADS;

        mtParamters[ctr].dest = (char *) dest + begin;
        mtParamters[ctr].src = (char *) src + begin;
        mtParamters[ctr].size = end - begin;
    }

    //release semaphores to start computation
    for(int ctr = 0; ctr < NUM_CPY_THREADS; ctr++)
    {
        dispatched[ctr] = ReleaseSemaphore(hCopyStartSemaphores[ctr], 1, NULL) != 0;
    }

    //wait for all threads to finish; copy any slice that could not be handed off
    for(int ctr = 0; ctr < NUM_CPY_THREADS; ctr++)
    {
        if(dispatched[ctr])
        {
            /* No error channel exists in a memcpy-style interface, so a
               failed wait cannot be reported to the caller. */
            (void) WaitForSingleObject(hCopyStopSemaphores[ctr], INFINITE);
        }
        else
        {
            memcpy(mtParamters[ctr].dest, mtParamters[ctr].src, mtParamters[ctr].size);
        }
    }

    return dest;
}

int stopCopyThreads()
{
    for(int ctr = 0; ctr < NUM_CPY_THREADS; ctr++)
    {
        TerminateThread(hCopyThreads[ctr], 0);
        CloseHandle(hCopyStartSemaphores[ctr]);
        CloseHandle(hCopyStopSemaphores[ctr]);
    }
    return 0;
}

Name: Anonymous 2012-07-01 3:20

>>6
HANDLE
stopped reading right there.

Name: Anonymous 2012-07-01 3:54

while(1)

What the fuck.

Name: Anonymous 2012-07-01 4:49

Name: Anonymous 2012-07-01 8:50

Strongest memcpy:
bits 64

global intel_memcpy_aligned64
; void intel_memcpy_aligned64
; (void* dst, const void* src, unsigned long size)
;
; Copies size * 64 bytes from src to dst using non-temporal stores.
;
; Registers used: rdi = dst, rsi = src, rdx = size (this matches the
; System V AMD64 argument order for the prototype above).
;
; size is given in 64-byte units. size == 0 copies nothing.
; dst MUST be aligned on a 16-byte boundary: movntdq faults on an
; unaligned memory operand. src may be unaligned (it is read with movdqu),
; but should be 16-byte aligned for maximum performance.
; Author's note: movdqu is actually faster on aligned data than its aligned
; counterpart movdqa, at least on Sandy Bridge.
;
; Clobbers rsi, rdi, rdx, xmm0-xmm3 and the flags.
intel_memcpy_aligned64:
        test rdx,rdx                    ; nothing to copy: without this guard
        jz .done                        ; a zero size would still move one block
.copy_block:
        lea rsi,[rsi+40h]               ; advance both pointers first; the
        lea rdi,[rdi+40h]               ; block is addressed at negative offsets
        dec rdx                         ; blocks remaining after this one
        prefetchnta [rsi+180h]          ; hint for data 1C0h bytes past this block
        movdqu xmm0,[rsi-40h]
        movdqu xmm1,[rsi-30h]
        cmp rdx,1                       ; flags survive: SSE moves leave them alone
        movntdq [rdi-40h],xmm0
        movntdq [rdi-30h],xmm1
        movdqu xmm2,[rsi-20h]
        movdqu xmm3,[rsi-10h]
        movntdq [rdi-20h],xmm2
        movntdq [rdi-10h],xmm3
        jge .copy_block                 ; loop while at least one block remains
        sfence                          ; order the non-temporal stores before
                                        ; any later stores by the caller
.done:
        ret

Name: Anonymous 2012-07-01 9:13

>>10
PIG DISGUSTING!

Name: Anonymous 2012-07-01 9:19

>>11
It's the fastest for big transfers to memory that is unlikely to be cached. Perfect for submitting vertex data to the GPU, for instance.

Name: Anonymous 2012-07-01 13:56

are add instructions multithreaded please help me /prog/

Name: Anonymous 2012-07-01 14:39

>>13
Yes. CPU scheds them in parallel where possible.

Name: Anonymous 2012-07-01 15:19

>>6
Your problem is that you were probably using pthread conditions improperly... they don't work like Windows semaphores/events. memcpy isn't single threaded on Linux.

Name: Anonymous 2012-07-01 15:20

>>6
Post your Linux code now, and we'll tell you where you fucked up.

Name: Anonymous 2012-07-01 15:39

>>16
Linux is shit.

Name: Anonymous 2012-07-01 16:12

GOD DAMMIT
what are these retards
these retards
what are
RETARDS
DOING IN MY /PROG/

Name: Anonymous 2012-07-01 17:11

Name: Anonymous 2012-07-02 6:14

Name: bampu pantsu 2012-07-06 4:51

bampu pantsu

Newer Posts
Don't change these.
Name: Email:
Entire Thread Thread List