;=======================================================================
; Syntax : MASM (ml64), Intel operand order
; Target : x86-64, Microsoft x64 calling convention
;          (integer args in rcx, rdx, r8, r9; further args on the stack)
;=======================================================================

.data

SprShift        equ     4               ; not referenced by the code in this file as shown
SprMulti        equ     16              ; not referenced by the code in this file as shown

.code

;-----------------------------------------------------------------------
; Register arguments
;-----------------------------------------------------------------------
StartSpr        equ     rcx             ; arg 1 - never read by CheckPitchAsmFast
EndSpr          equ     rdx             ; arg 2 - never read by CheckPitchAsmFast
pData           equ     r8              ; arg 3 - base address of the byte image
wBuffer         equ     r9              ; arg 4 - bytes from one image row to the next

;-----------------------------------------------------------------------
; Stack arguments, addressed from rbp = (rsp at entry) - 8.
; The home slots of the four register arguments occupy
; rbp+10h / 18h / 20h / 28h, so argument 5 starts at rbp+30h.
;-----------------------------------------------------------------------
InspLeft        equ     qword ptr [rbp + 30h]   ; arg 5 - first column of the window
InspTop         equ     qword ptr [rbp + 38h]   ; arg 6 - first row of the window
InspRight       equ     qword ptr [rbp + 40h]   ; arg 7 - column at which a row ends
InspBottom      equ     qword ptr [rbp + 48h]   ; arg 8 - row at which the scan stops
DPC_SUPP        equ     qword ptr [rbp + 50h]   ; arg 9 - scaled by SuppScale, subtracted from every lane

;-----------------------------------------------------------------------
; Frame locals (below the four saved GPRs at rbp-08h .. rbp-20h)
;-----------------------------------------------------------------------
ImgAddrLine     equ     qword ptr [rbp - 28h]   ; address of the first pixel of the current row
ImgLineEnd      equ     qword ptr [rbp - 30h]   ; ImgAddrLine + ImgLineLength
ImgLineLength   equ     qword ptr [rbp - 38h]   ; InspRight - InspLeft
iPitch          equ     qword ptr [rbp - 40h]   ; NOTE(review): read but never written - see proc header
SPR             equ     qword ptr [rbp - 48h]   ; NOTE(review): read but never written - see proc header
fSpr            equ     qword ptr [rbp - 50h]   ; NOTE(review): read but never written - see proc header
Result64        equ     xmmword ptr [rbp - 60h] ; spill slot for the two 64-bit partial sums
ResultLo        equ     qword ptr [rbp - 60h]
ResultHi        equ     qword ptr [rbp - 58h]

FRAME_BYTES     equ     0D0h            ; 40h locals + 90h save area for xmm6..xmm14
SuppScale       equ     32              ; original note: "2 pixels * 16 spr = 32"

;-----------------------------------------------------------------------
; General-purpose register roles
;-----------------------------------------------------------------------
iy              equ     r10             ; current row number
rrImgAddr       equ     r11             ; address of the current 16-byte chunk
rrPitch         equ     r12             ; iPitch
rrPitch2        equ     r13             ; iPitch + 1
rrSSpr          equ     r14             ; not used by CheckPitchAsmFast
rrESpr          equ     r15             ; not used by CheckPitchAsmFast

;-----------------------------------------------------------------------
; XMM register roles (xmm0 is kept at zero inside the loop)
;-----------------------------------------------------------------------
xxSupp2         equ     xmm3            ; 8 x word : SuppScale * DPC_SUPP
xxSrc           equ     xmm4            ; running difference, pixels 0..7
xxSrc2          equ     xmm5            ; running difference, pixels 8..15
xxCmp           equ     xmm6            ; scratch, low half
xxCmp2          equ     xmm7            ; scratch, high half
xxSpr           equ     xmm8            ; 8 x word : SPR
xxOne           equ     xmm9            ; 8 x word : 1
xxfSpr          equ     xmm10           ; 8 x word : fSpr
xxsSpr          equ     xmm11           ; 8 x word : SPR - fSpr
xxMinus         equ     xmm12           ; pixels at +1, low half
xxMinus2        equ     xmm13           ; pixels at +1, high half
xxRslt          equ     xmm14           ; 2 x qword accumulators

;-----------------------------------------------------------------------
; CheckPitchAsmFast
;
; Walks a window of an 8-bit image in 16-pixel chunks and accumulates,
; per pixel x (all arithmetic in wrapping 16-bit lanes):
;
;     d = (p[x] + p[x+1]) * SPR
;         - p[x + iPitch]     * fSpr
;         - p[x + iPitch + 1] * SPR
;         - p[x + iPitch + 2] * (SPR - fSpr)
;     sum += saturating_sub_u16(|d|, SuppScale * DPC_SUPP)
;
; In   : rcx = StartSpr (unused), rdx = EndSpr (unused),
;        r8  = pData, r9 = wBuffer,
;        stack args 5..9 = InspLeft, InspTop, InspRight, InspBottom,
;        DPC_SUPP
; Out  : rax = low qword + high qword of the accumulator
; Saved: rbp, r10-r13 and xmm6-xmm14 are restored before returning;
;        rbx, rsi, rdi, r14, r15 are never written.
; Clobb: rax, xmm0, xmm3-xmm5, flags
; Stack: 8 (rbp) + 20h (r10-r13) + FRAME_BYTES; makes no calls.
;
; The frame is built by hand instead of with LOCAL so that rbp is
; exactly entry-rsp - 8 and the [rbp+30h..50h] argument equates are
; valid no matter what prologue the assembler would emit for LOCAL.
; xmm6-xmm14 are non-volatile in the Microsoft x64 convention and are
; therefore saved and restored here.
;
; Both loops test their condition after the body, so the first chunk of
; a row and the first row of the window are always processed. Every
; iteration loads 16 bytes from up to iPitch + 2 bytes past the current
; chunk address.
;
; NOTE(review): iPitch, SPR and fSpr are read below, but nothing in
;   this file as shown ever stores to them, so the returned value is
;   undefined until the code that sets them is supplied. StartSpr and
;   EndSpr are accepted but unused - possibly related; confirm with the
;   caller.
; NOTE(review): no unwind data (PROC FRAME / .pushreg / .allocstack) is
;   emitted for this prologue.
;-----------------------------------------------------------------------
CheckPitchAsmFast PROC

        ; ---- prologue ------------------------------------------------
        push    rbp
        mov     rbp, rsp                        ; rbp = entry rsp - 8
        push    r10                             ; r10/r11 are volatile, but saved to keep
        push    r11                             ;   the routine's existing register contract
        push    r12
        push    r13
        sub     rsp, FRAME_BYTES

        movdqu  xmmword ptr [rsp + 00h], xmm6   ; callee-saved XMM registers
        movdqu  xmmword ptr [rsp + 10h], xmm7
        movdqu  xmmword ptr [rsp + 20h], xmm8
        movdqu  xmmword ptr [rsp + 30h], xmm9
        movdqu  xmmword ptr [rsp + 40h], xmm10
        movdqu  xmmword ptr [rsp + 50h], xmm11
        movdqu  xmmword ptr [rsp + 60h], xmm12
        movdqu  xmmword ptr [rsp + 70h], xmm13
        movdqu  xmmword ptr [rsp + 80h], xmm14

        ; ---- loop-invariant values -----------------------------------
        mov     rrPitch, iPitch
        lea     rrPitch2, [rrPitch + 1]

        mov     eax, 1
        movd    xxOne, rax
        punpcklwd xxOne, xxOne                  ; fill every word lane with 1
        pshufd  xxOne, xxOne, 0

        mov     rax, SPR
        movd    xxSpr, rax
        punpcklwd xxSpr, xxSpr                  ; low word of SPR in every lane
        pshufd  xxSpr, xxSpr, 0

        mov     rax, fSpr
        movd    xxfSpr, rax
        punpcklwd xxfSpr, xxfSpr                ; low word of fSpr in every lane
        pshufd  xxfSpr, xxfSpr, 0

        movdqa  xxsSpr, xxSpr
        psubw   xxsSpr, xxfSpr                  ; sSpr = SPR - fSpr

        imul    rax, DPC_SUPP, SuppScale        ; low 64 bits only; rdx untouched
        movd    xxSupp2, rax
        punpcklwd xxSupp2, xxSupp2              ; low word of the product in every lane
        pshufd  xxSupp2, xxSupp2, 0

        ; ---- build the image address ---------------------------------
        ; ImgAddr = pData + wBuffer * InspTop + InspLeft
        mov     rax, wBuffer
        imul    rax, InspTop
        add     rax, InspLeft
        lea     rrImgAddr, [pData + rax]
        mov     ImgAddrLine, rrImgAddr

        mov     rax, InspRight
        sub     rax, InspLeft
        mov     ImgLineLength, rax
        add     rax, rrImgAddr
        mov     ImgLineEnd, rax

        mov     iy, InspTop
        pxor    xxRslt, xxRslt                  ; accumulator = 0
        pxor    xmm0, xmm0                      ; zero source for the unpacks

Cmp_16:
        ; ---- 1. source pixels: (p[x] + p[x+1]) * SPR -----------------
        movdqu  xxSrc, [rrImgAddr]
        movdqa  xxSrc2, xxSrc
        punpcklbw xxSrc, xmm0                   ; bytes 0..7  -> words
        punpckhbw xxSrc2, xmm0                  ; bytes 8..15 -> words

        movdqu  xxMinus, [rrImgAddr + 1]
        movdqa  xxMinus2, xxMinus
        punpcklbw xxMinus, xmm0
        punpckhbw xxMinus2, xmm0

        paddw   xxSrc, xxMinus
        paddw   xxSrc2, xxMinus2
        pmullw  xxSrc, xxSpr
        pmullw  xxSrc2, xxSpr

        ; ---- 2. compare pixels: - p[x + iPitch] * fSpr ---------------
        movdqu  xxCmp, [rrImgAddr + rrPitch]
        movdqa  xxCmp2, xxCmp
        punpcklbw xxCmp, xmm0
        punpckhbw xxCmp2, xmm0
        pmullw  xxCmp, xxfSpr
        pmullw  xxCmp2, xxfSpr
        psubw   xxSrc, xxCmp
        psubw   xxSrc2, xxCmp2

        ; ----              - p[x + iPitch + 1] * SPR ------------------
        movdqu  xxCmp, [rrImgAddr + rrPitch2]
        movdqa  xxCmp2, xxCmp
        punpcklbw xxCmp, xmm0
        punpckhbw xxCmp2, xmm0
        pmullw  xxCmp, xxSpr
        pmullw  xxCmp2, xxSpr
        psubw   xxSrc, xxCmp
        psubw   xxSrc2, xxCmp2

        ; ----              - p[x + iPitch + 2] * sSpr -----------------
        movdqu  xxCmp, [rrImgAddr + rrPitch2 + 1]
        movdqa  xxCmp2, xxCmp
        punpcklbw xxCmp, xmm0
        punpckhbw xxCmp2, xmm0
        pmullw  xxCmp, xxsSpr
        pmullw  xxCmp2, xxsSpr
        psubw   xxSrc, xxCmp
        psubw   xxSrc2, xxCmp2

        ; ---- 3. |d| - suppression, widen 16 -> 32 --------------------
        pabsw   xxSrc, xxSrc                    ; absolute value
        psubusw xxSrc, xxSupp2                  ; unsigned saturating subtract
        pmaddwd xxSrc, xxOne                    ; add adjacent word pairs into dwords

        pabsw   xxSrc2, xxSrc2                  ; absolute value
        psubusw xxSrc2, xxSupp2
        pmaddwd xxSrc2, xxOne

        ; ---- 4. widen 32 -> 64 and accumulate ------------------------
        movdqa  xxCmp, xxSrc
        punpckldq xxCmp, xmm0                   ; dwords 0,1 zero-extended to qwords
        punpckhdq xxSrc, xmm0                   ; dwords 2,3 zero-extended to qwords
        paddq   xxRslt, xxSrc
        paddq   xxRslt, xxCmp

        movdqa  xxCmp, xxSrc2
        punpckldq xxCmp, xmm0
        punpckhdq xxSrc2, xmm0
        paddq   xxRslt, xxSrc2
        paddq   xxRslt, xxCmp

        ; ---- next 16-byte chunk of this row --------------------------
        add     rrImgAddr, 10h
        cmp     rrImgAddr, ImgLineEnd
        jb      Cmp_16                          ; unsigned: both operands are addresses

        ; ---- next row ------------------------------------------------
        inc     iy
        cmp     iy, InspBottom
        jge     End_Buff

        mov     rax, ImgAddrLine
        add     rax, wBuffer                    ; step one row down
        mov     ImgAddrLine, rax
        mov     rrImgAddr, rax
        add     rax, ImgLineLength
        mov     ImgLineEnd, rax
        jmp     Cmp_16

End_Buff:
        ; ---- fold the two 64-bit partial sums into rax ---------------
        movdqu  Result64, xxRslt
        mov     rax, ResultLo
        add     rax, ResultHi

        ; ---- epilogue ------------------------------------------------
        movdqu  xmm6,  xmmword ptr [rsp + 00h]
        movdqu  xmm7,  xmmword ptr [rsp + 10h]
        movdqu  xmm8,  xmmword ptr [rsp + 20h]
        movdqu  xmm9,  xmmword ptr [rsp + 30h]
        movdqu  xmm10, xmmword ptr [rsp + 40h]
        movdqu  xmm11, xmmword ptr [rsp + 50h]
        movdqu  xmm12, xmmword ptr [rsp + 60h]
        movdqu  xmm13, xmmword ptr [rsp + 70h]
        movdqu  xmm14, xmmword ptr [rsp + 80h]

        add     rsp, FRAME_BYTES
        pop     r13
        pop     r12
        pop     r11
        pop     r10
        pop     rbp
        ret

CheckPitchAsmFast ENDP

;-----------------------------------------------------------------------
; Register aliases that follow the procedure. No code in this file as
; shown refers to them; several re-bind names used above (iy, rrImgAddr,
; rrPitch, xxOne, xxSrc, xxSrc2, xxCmp) to different registers, so they
; only affect code placed after this point.
;-----------------------------------------------------------------------
ix              equ     rsi
iy              equ     rdi

rrBW            equ     r9
rrImgAddr       equ     r10
rrNoise         equ     r11             ; Noise Value at Current Pitch
rrLeast         equ     r12             ; Least Noise Value
rrPitch         equ     r13             ; Pitch at Least Noise Value

xxTemp          equ     xmm1
xxOne           equ     xmm11
xxSrc           equ     xmm2
xxSrc2          equ     xmm3
xxCmp           equ     xmm4
xxRef0          equ     xmm5
xxRef1          equ     xmm6
xxRef2          equ     xmm7
xxRef3          equ     xmm8
xxZeroBack      equ     xmm10
xxNoise         equ     xmm15

end