section .text
global memcpyNTprefetch

; Block-copy geometry.
MEMCPY_NT_BLOCK_BYTES   equ 2048                ; bytes staged through tbuf per outer pass
MEMCPY_NT_BLOCK_SHIFT   equ 11                  ; log2(MEMCPY_NT_BLOCK_BYTES)
MEMCPY_NT_CHUNK_BYTES   equ 64                  ; bytes moved per inner iteration (8 MMX qwords)
MEMCPY_NT_CHUNKS        equ MEMCPY_NT_BLOCK_BYTES / MEMCPY_NT_CHUNK_BYTES

;-----------------------------------------------------------------------
; memcpyNTprefetch
;
; Copies nbytes bytes from src to dest. Whole 2048-byte blocks are first
; read (with non-temporal prefetch hints) into the scratch buffer tbuf,
; then written from tbuf to the destination with non-temporal MMX stores.
; Any remaining (nbytes mod 2048) bytes are copied with rep movsb.
;
; Syntax: NASM, 32-bit x86 (uses MMX plus prefetchnta/movntq/sfence).
;
; In:    src, dest, nbytes, tbuf are operands defined outside this block.
;        NOTE(review): presumably src/dest/nbytes expand to ebp-relative
;        argument slots and tbuf names a scratch buffer of at least
;        2048 bytes - confirm against their definitions.
; Out:   nothing is returned deliberately (eax is left holding src).
; Saved: ebx, esi, edi, ebp are preserved.
; Clobb: eax, ecx, edx, mm0-mm7, flags (direction flag is cleared).
;-----------------------------------------------------------------------
memcpyNTprefetch:
        push    ebp
        mov     ebp, esp

        ; Fetch the operands into scratch registers before pushing anything
        ; else, so they are evaluated at the same stack depth as before.
        mov     eax, src
        mov     ecx, nbytes
        mov     edx, dest

        push    ebx                             ; callee-saved registers used below
        push    esi
        push    edi
        cld                                     ; rep movsb in the tail must count upward

        mov     esi, eax                        ; esi = read cursor in source
        mov     edi, edx                        ; edi = write cursor in destination
        mov     edx, ecx                        ; edx = nbytes, kept for the tail copy
        mov     ebx, ecx
        shr     ebx, MEMCPY_NT_BLOCK_SHIFT      ; ebx = number of whole blocks
        jz      .tail                           ; no whole block: do not enter the dec/jnz loop

.block:
        ; Pass 1: source -> tbuf.
        push    edi                             ; park the destination cursor
        mov     edi, tbuf
        mov     ecx, MEMCPY_NT_CHUNKS
.fill:
        prefetchnta [esi + 64]                  ; hint: data for the next iteration
        prefetchnta [esi + 96]
        movq    mm1, [esi]
        movq    mm2, [esi + 8]
        movq    mm3, [esi + 16]
        movq    mm4, [esi + 24]
        movq    mm5, [esi + 32]
        movq    mm6, [esi + 40]
        movq    mm7, [esi + 48]
        movq    mm0, [esi + 56]
        movq    [edi], mm1
        movq    [edi + 8], mm2
        movq    [edi + 16], mm3
        movq    [edi + 24], mm4
        movq    [edi + 32], mm5
        movq    [edi + 40], mm6
        movq    [edi + 48], mm7
        movq    [edi + 56], mm0
        add     esi, MEMCPY_NT_CHUNK_BYTES
        add     edi, MEMCPY_NT_CHUNK_BYTES
        dec     ecx
        jnz     .fill
        pop     edi                             ; back to the destination cursor

        ; Pass 2: tbuf -> destination with non-temporal stores.
        push    esi                             ; park the source cursor
        mov     esi, tbuf
        mov     ecx, MEMCPY_NT_CHUNKS
.drain:
        movq    mm1, [esi]
        movq    mm2, [esi + 8]
        movq    mm3, [esi + 16]
        movq    mm4, [esi + 24]
        movq    mm5, [esi + 32]
        movq    mm6, [esi + 40]
        movq    mm7, [esi + 48]
        movq    mm0, [esi + 56]
        movntq  [edi], mm1
        movntq  [edi + 8], mm2
        movntq  [edi + 16], mm3
        movntq  [edi + 24], mm4
        movntq  [edi + 32], mm5
        movntq  [edi + 40], mm6
        movntq  [edi + 48], mm7
        movntq  [edi + 56], mm0
        add     esi, MEMCPY_NT_CHUNK_BYTES
        add     edi, MEMCPY_NT_CHUNK_BYTES
        dec     ecx
        jnz     .drain
        pop     esi                             ; back to the source cursor

        dec     ebx
        jnz     .block

        sfence                                  ; order the non-temporal stores before later stores
        emms                                    ; leave MMX state so the caller can use x87 again

.tail:
        ; Copy the bytes that do not fill a whole block (possibly zero).
        mov     ecx, edx
        and     ecx, MEMCPY_NT_BLOCK_BYTES - 1
        rep movsb

        pop     edi
        pop     esi
        pop     ebx
        mov     esp, ebp
        pop     ebp
        ret