; memcpyNTprefetch.S
; Origin: WiscSort / EMS / header / memcpyNTprefetch.S
section .text
    global memcpyNTprefetch

;-----------------------------------------------------------------------
; memcpyNTprefetch
;
; Copies nbytes from src to dest in 2 KiB blocks using MMX registers:
;   phase 1: prefetchnta + movq loads move one source block into the
;            staging buffer tbuf;
;   phase 2: the block is read back from tbuf and written to dest with
;            non-temporal stores (movntq).
; Bytes left over after the last full 2 KiB block are copied with
; rep movsb, so any nbytes value (including 0 and < 2048) is handled.
;
; Operands src, dest, nbytes and tbuf are used exactly as the original
; code wrote them; they are not defined inside this routine.
; NOTE(review): confirm how these names are bound (e.g. %define to
; [ebp+N] stack slots or external symbols) -- the routine only sets up
; an ebp frame and does not bind them itself.
; NOTE(review): tbuf must be able to hold NTCOPY_BLOCK_BYTES bytes;
; its definition is not visible here -- verify.
;
; ABI:    assumes a 32-bit C calling convention in which ebx, esi, edi
;         and ebp are callee-saved and DF = 0 on entry -- TODO confirm.
; Regs:   esi = source cursor, edi = destination cursor,
;         ebx = full blocks remaining, edx = tail byte count,
;         ecx = inner loop counter.
; Clobb:  eax untouched; ecx, edx, mm0-mm7, flags.
;-----------------------------------------------------------------------
NTCOPY_BLOCK_BYTES  equ 2048            ; bytes staged through tbuf per pass
NTCOPY_BLOCK_SHIFT  equ 11              ; log2(NTCOPY_BLOCK_BYTES)
NTCOPY_CHUNK_BYTES  equ 64              ; bytes moved per inner iteration

memcpyNTprefetch:
    ; preamble
    push    ebp
    mov     ebp, esp
    push    ebx                         ; callee-saved registers used below
    push    esi
    push    edi

    mov     esi, src
    mov     ecx, nbytes
    mov     edi, dest

    mov     ebx, ecx
    shr     ebx, NTCOPY_BLOCK_SHIFT     ; ebx = number of full blocks
    mov     edx, ecx
    and     edx, NTCOPY_BLOCK_BYTES - 1 ; edx = bytes after the last full block

    test    ebx, ebx
    jz      .copy_tail                  ; no full block: avoid dec/jnz wrap-around

.next_block:
    ; ---- phase 1: source -> tbuf -------------------------------------
    push    edi                         ; park destination cursor
    mov     edi, tbuf
    mov     ecx, NTCOPY_BLOCK_BYTES / NTCOPY_CHUNK_BYTES

.fill_staging:
    prefetchnta [esi + 64]              ; non-temporal prefetch for the next iteration
    prefetchnta [esi + 96]

    movq    mm1, [esi +  0]             ; read 64 bytes of source
    movq    mm2, [esi +  8]
    movq    mm3, [esi + 16]
    movq    mm4, [esi + 24]
    movq    mm5, [esi + 32]
    movq    mm6, [esi + 40]
    movq    mm7, [esi + 48]
    movq    mm0, [esi + 56]

    movq    [edi +  0], mm1             ; write them to the staging buffer
    movq    [edi +  8], mm2
    movq    [edi + 16], mm3
    movq    [edi + 24], mm4
    movq    [edi + 32], mm5
    movq    [edi + 40], mm6
    movq    [edi + 48], mm7
    movq    [edi + 56], mm0

    add     esi, NTCOPY_CHUNK_BYTES
    add     edi, NTCOPY_CHUNK_BYTES
    dec     ecx
    jnz     .fill_staging

    ; ---- phase 2: tbuf -> destination --------------------------------
    pop     edi                         ; edi = destination cursor again
    push    esi                         ; park (already advanced) source cursor
    mov     esi, tbuf
    mov     ecx, NTCOPY_BLOCK_BYTES / NTCOPY_CHUNK_BYTES

.drain_staging:
    movq    mm1, [esi +  0]             ; read 64 bytes back from staging
    movq    mm2, [esi +  8]
    movq    mm3, [esi + 16]
    movq    mm4, [esi + 24]
    movq    mm5, [esi + 32]
    movq    mm6, [esi + 40]
    movq    mm7, [esi + 48]
    movq    mm0, [esi + 56]

    movntq  [edi +  0], mm1             ; non-temporal stores to destination
    movntq  [edi +  8], mm2
    movntq  [edi + 16], mm3
    movntq  [edi + 24], mm4
    movntq  [edi + 32], mm5
    movntq  [edi + 40], mm6
    movntq  [edi + 48], mm7
    movntq  [edi + 56], mm0

    add     esi, NTCOPY_CHUNK_BYTES
    add     edi, NTCOPY_CHUNK_BYTES
    dec     ecx
    jnz     .drain_staging

    pop     esi                         ; esi = source cursor for the next block
    dec     ebx
    jnz     .next_block

.copy_tail:
    ; esi/edi sit just past the last full block (or at the start if there
    ; was none); copy the remaining edx bytes. rep movsb is a no-op for 0.
    mov     ecx, edx
    rep movsb

    sfence                              ; order the movntq stores before returning
    emms                                ; leave MMX state so x87 code works again

    ; postamble
    pop     edi
    pop     esi
    pop     ebx
    mov     esp, ebp
    pop     ebp
    ret