x86-32.asm
上传用户:xjjlds
上传日期:2015-12-05
资源大小:22823k
文件大小:21k
- .686 ; enable the P6 instruction set (the cmovl/cmovg used below need it)
- .model flat ; 32-bit flat memory model
- .mmx ; accept MMX mnemonics
- .xmm ; accept SSE/SSE2 mnemonics and the xmm registers
- .const ; read-only data follows
- __uvmin DD 0d01502f9r ; -1e+010 as a single-precision hex real; not referenced by any procedure in this file
- __uvmax DD 0501502f9r ; +1e+010 as a single-precision hex real; not referenced by any procedure in this file
- .code ; code follows
-
- ;
- ; memsetd
- ;
- ; Stores the dword held in edx into consecutive dwords starting at [ecx].
- ; The number of dwords is the single stack argument.
- ; The @name@12 decoration and "ret 4" correspond to two register
- ; arguments (ecx, edx) plus one 4-byte stack argument popped by the callee.
- ; Overwrites eax and ecx; edi is saved and restored.
- ;
- @memsetd@12 proc public
- cld ; rep stosd must walk upward
- push edi
- mov eax, edx ; eax = fill value
- mov edi, ecx ; edi = destination
- mov ecx, [esp+8] ; count sits above the saved edi and the return address
- rep stosd
- pop edi
- ret 4 ; discard the stack argument
- @memsetd@12 endp
- ;
- ; SaturateColor
- ;
- ; SSE2 variant: clamps the four signed dwords at [ecx] to 0..255 in place.
- ; [ecx] is accessed with movdqa and therefore has to be 16-byte aligned.
- ; Overwrites xmm0 and xmm1.
- ;
- @SaturateColor_sse2@4 proc public
- movdqa xmm1, [ecx] ; the four signed dwords
- pxor xmm0, xmm0 ; zero: pack filler and unpack partner
- packssdw xmm1, xmm0 ; dwords to words with signed saturation
- packuswb xmm1, xmm0 ; words to bytes with unsigned saturation (0..255)
- punpcklbw xmm1, xmm0 ; bytes back to words, zero-extended
- punpcklwd xmm1, xmm0 ; words back to dwords, zero-extended
- movdqa [ecx], xmm1
- ret
- @SaturateColor_sse2@4 endp
- ; Clamps the signed dword at [esi+ofs] to 0..255 in place.
- ; Expects eax = 0 and edx = 255; leaves the clamped value in ecx.
- saturate_dword_asm macro ofs
- mov ecx, [esi+ofs]
- cmp ecx, eax
- cmovl ecx, eax ; below zero becomes zero
- cmp ecx, edx
- cmovg ecx, edx ; above 255 becomes 255
- mov [esi+ofs], ecx
- endm
-
- ; Scalar variant: clamps the four signed dwords at [ecx] to 0..255 in place.
- ; Overwrites eax, ecx and edx; esi is saved and restored.
- @SaturateColor_asm@4 proc public
- push esi
- mov esi, ecx ; esi = array base, freeing ecx as the scratch value
- xor eax, eax ; lower bound
- mov edx, 000000ffh ; upper bound
- saturate_dword_asm 0
- saturate_dword_asm 4
- saturate_dword_asm 8
- saturate_dword_asm 12
- pop esi
- ret
- @SaturateColor_asm@4 endp
- ;
- ; swizzling
- ;
- punpck macro op, sd0, sd2, s1, s3, d1, d3 ; two low/high interleaves at once: op is the punpckl/punpckh suffix, the other arguments are xmm register numbers
- movdqa @CatStr(xmm, %d1), @CatStr(xmm, %sd0) ; d1 = copy of sd0, kept for the high-half interleave
- pshufd @CatStr(xmm, %d3), @CatStr(xmm, %sd2), 0e4h ; d3 = copy of sd2 (0e4h keeps the dword order unchanged)
- ; below: sd0 and sd2 are overwritten with the low-half results, d1 and d3 receive the high-half results
- @CatStr(punpckl, op) @CatStr(xmm, %sd0), @CatStr(xmm, %s1) ; sd0 = low-half interleave of sd0 with s1
- @CatStr(punpckh, op) @CatStr(xmm, %d1), @CatStr(xmm, %s1) ; d1 = high-half interleave of the original sd0 with s1
- @CatStr(punpckl, op) @CatStr(xmm, %sd2), @CatStr(xmm, %s3) ; sd2 = low-half interleave of sd2 with s3
- @CatStr(punpckh, op) @CatStr(xmm, %d3), @CatStr(xmm, %s3) ; d3 = high-half interleave of the original sd2 with s3
- endm
-
- punpcknb macro ; nibble interleave of xmm0 with xmm1 and of xmm2 with xmm3; uses xmm7 as the mask (callers in this file load it with 0f0f0f0fh in every dword); overwrites xmm0-xmm6
- movdqa xmm4, xmm0 ; xmm4 = copy of xmm0
- pshufd xmm5, xmm1, 0e4h ; xmm5 = copy of xmm1 (identity shuffle)
- psllq xmm1, 4 ; low nibble of every xmm1 byte moves into the high nibble position
- psrlq xmm4, 4 ; high nibble of every xmm0 byte moves into the low nibble position
- movdqa xmm6, xmm7
- pand xmm0, xmm7 ; keep the low nibbles of xmm0
- pandn xmm6, xmm1 ; keep the high nibble positions of the shifted xmm1
- por xmm0, xmm6 ; xmm0 bytes: high nibble = xmm1 low nibble, low nibble = xmm0 low nibble
- movdqa xmm6, xmm7
- pand xmm4, xmm7 ; former high nibbles of xmm0, now in low position
- pandn xmm6, xmm5 ; high nibbles of the original xmm1
- por xmm4, xmm6 ; bytes: high nibble = xmm1 high nibble, low nibble = xmm0 high nibble
- movdqa xmm1, xmm4 ; xmm1 = the high-nibble combination
- movdqa xmm4, xmm2 ; from here the same sequence is applied to xmm2 and xmm3
- pshufd xmm5, xmm3, 0e4h ; xmm5 = copy of xmm3
- psllq xmm3, 4
- psrlq xmm4, 4
- movdqa xmm6, xmm7
- pand xmm2, xmm7
- pandn xmm6, xmm3
- por xmm2, xmm6 ; xmm2 bytes: high nibble = xmm3 low nibble, low nibble = xmm2 low nibble
- movdqa xmm6, xmm7
- pand xmm4, xmm7
- pandn xmm6, xmm5
- por xmm4, xmm6
- movdqa xmm3, xmm4 ; xmm3 = the high-nibble combination of xmm2 and xmm3
- punpck bw, 0, 2, 1, 3, 4, 6 ; byte-interleave xmm0 with xmm1 and xmm2 with xmm3; results land in xmm0, xmm4, xmm2, xmm6
- endm
- ;
- ; unSwizzleBlock32
- ;
- ; ecx = source block (256 bytes, read with aligned 16-byte loads)
- ; edx = destination (eight rows of 32 bytes, aligned 16-byte stores)
- ; stack argument = distance in bytes between destination rows
- ; Overwrites eax, edx, xmm0-xmm4 and xmm6; ebx is saved and restored.
- ;
- ; Helper: turns source vectors blk..blk+3 into two destination rows.
- ; Row A receives the low quadwords of vectors 0,1 and 2,3; row B the high ones.
- unswz32_rowpair macro blk, rowa, rowb
- movdqa xmm0, [ecx+16*(blk+0)]
- movdqa xmm1, [ecx+16*(blk+1)]
- movdqa xmm2, [ecx+16*(blk+2)]
- movdqa xmm3, [ecx+16*(blk+3)]
- punpck qdq, 0, 2, 1, 3, 4, 6
- movdqa [rowa], xmm0
- movdqa [rowa+16], xmm2
- movdqa [rowb], xmm4
- movdqa [rowb+16], xmm6
- endm
-
- @unSwizzleBlock32_sse2@12 proc public
- push ebx
- mov ebx, [esp+8] ; row pitch (above the saved ebx and the return address)
- lea eax, [ebx+ebx*2] ; eax = 3 * pitch
- ; rows 0 and 1
- unswz32_rowpair 0, edx, edx+ebx
- ; rows 2 and 3
- unswz32_rowpair 4, edx+ebx*2, edx+eax
- lea edx, [edx+ebx*4] ; continue four rows further down
- ; rows 4 and 5
- unswz32_rowpair 8, edx, edx+ebx
- ; rows 6 and 7
- unswz32_rowpair 12, edx+ebx*2, edx+eax
- pop ebx
- ret 4
- @unSwizzleBlock32_sse2@12 endp
- ;
- ; unSwizzleBlock16
- ;
- ; ecx = source block (256 bytes, read with aligned 16-byte loads)
- ; edx = destination (eight rows of 32 bytes, aligned 16-byte stores)
- ; stack argument = distance in bytes between destination rows
- ; Overwrites eax, ecx, edx, xmm0-xmm4 and xmm6; ebx is saved and restored.
- ;
- @unSwizzleBlock16_sse2@12 proc public
- push ebx
- mov eax, 4 ; four passes, two rows per pass
- mov ebx, [esp+8] ; row pitch
- align 16
- @@:
- movdqa xmm0, [ecx]
- movdqa xmm1, [ecx+16]
- movdqa xmm2, [ecx+32]
- movdqa xmm3, [ecx+48]
- add ecx, 64 ; the source pointer is not used again in this pass
- ; word, dword and word interleaves regroup the 64 bytes into two rows
- punpck wd, 0, 2, 1, 3, 4, 6
- punpck dq, 0, 4, 2, 6, 1, 3
- punpck wd, 0, 4, 1, 3, 2, 6
- movdqa [edx], xmm0
- movdqa [edx+16], xmm2
- movdqa [edx+ebx], xmm4
- movdqa [edx+ebx+16], xmm6
- lea edx, [edx+ebx*2]
- sub eax, 1
- jnz @B
- pop ebx
- ret 4
- @unSwizzleBlock16_sse2@12 endp
- ; unSwizzleBlock8: reads 256 bytes at ecx and writes sixteen 16-byte rows at edx; the stack argument is the byte distance between rows
- ; unSwizzleBlock8
- ; All loads and stores are aligned 16-byte accesses; overwrites eax, ecx, edx and xmm0-xmm6; ebx is saved and restored
- @unSwizzleBlock8_sse2@12 proc public
- push ebx
- mov ebx, [esp+4+4] ; row pitch: the stack argument, above the saved ebx and the return address
- mov eax, 2 ; two passes; each consumes 128 source bytes and produces eight rows
- align 16
- @@:
- ; col 0, 2 (first 64 bytes of this pass)
- movdqa xmm0, [ecx+16*0]
- movdqa xmm1, [ecx+16*1]
- movdqa xmm4, [ecx+16*2]
- movdqa xmm5, [ecx+16*3]
- punpck bw, 0, 4, 1, 5, 2, 6 ; byte interleave; results in xmm0, xmm2, xmm4, xmm6
- punpck wd, 0, 2, 4, 6, 1, 3 ; word interleave; results in xmm0, xmm1, xmm2, xmm3
- punpck bw, 0, 2, 1, 3, 4, 6 ; byte interleave; results in xmm0, xmm4, xmm2, xmm6
- punpck qdq, 0, 2, 4, 6, 1, 3 ; quadword interleave; the four output rows are xmm0, xmm2, xmm1, xmm3
- pshufd xmm1, xmm1, 0b1h ; swap adjacent dwords in the third row
- pshufd xmm3, xmm3, 0b1h ; swap adjacent dwords in the fourth row
- movdqa [edx], xmm0
- movdqa [edx+ebx], xmm2
- lea edx, [edx+ebx*2] ; advance two rows
- movdqa [edx], xmm1
- movdqa [edx+ebx], xmm3
- lea edx, [edx+ebx*2] ; advance two rows
- ; col 1, 3 (second 64 bytes of this pass; here the first two rows get the dword swap)
- movdqa xmm0, [ecx+16*4]
- movdqa xmm1, [ecx+16*5]
- movdqa xmm4, [ecx+16*6]
- movdqa xmm5, [ecx+16*7]
- punpck bw, 0, 4, 1, 5, 2, 6
- punpck wd, 0, 2, 4, 6, 1, 3
- punpck bw, 0, 2, 1, 3, 4, 6
- punpck qdq, 0, 2, 4, 6, 1, 3
- pshufd xmm0, xmm0, 0b1h ; swap adjacent dwords in the first row
- pshufd xmm2, xmm2, 0b1h ; swap adjacent dwords in the second row
- movdqa [edx], xmm0
- movdqa [edx+ebx], xmm2
- lea edx, [edx+ebx*2]
- movdqa [edx], xmm1
- movdqa [edx+ebx], xmm3
- lea edx, [edx+ebx*2]
- add ecx, 128 ; next 128 source bytes
- dec eax
- jnz @B
- pop ebx
-
- ret 4 ; callee removes the stack argument
- @unSwizzleBlock8_sse2@12 endp
- ; unSwizzleBlock4: reads 256 bytes at ecx and writes sixteen 16-byte rows at edx; the stack argument is the byte distance between rows
- ; unSwizzleBlock4
- ; All loads and stores are aligned 16-byte accesses; overwrites eax, ecx, edx and xmm0-xmm7; ebx is saved and restored
- @unSwizzleBlock4_sse2@12 proc public
- push ebx
- mov eax, 0f0f0f0fh ; low-nibble mask
- movd xmm7, eax
- pshufd xmm7, xmm7, 0 ; replicate the mask into all four dwords; punpcknb reads it from xmm7
- mov ebx, [esp+4+4] ; row pitch: the stack argument, above the saved ebx and the return address
- mov eax, 2 ; two passes; each consumes 128 source bytes and produces eight rows
- align 16
- @@:
- ; col 0, 2 (first 64 bytes of this pass)
- movdqa xmm0, [ecx+16*0]
- movdqa xmm1, [ecx+16*1]
- movdqa xmm4, [ecx+16*2]
- movdqa xmm3, [ecx+16*3]
- punpck dq, 0, 4, 1, 3, 2, 6 ; dword interleave; results in xmm0, xmm2, xmm4, xmm6
- punpck dq, 0, 2, 4, 6, 1, 3 ; dword interleave; results in xmm0, xmm1, xmm2, xmm3
- punpcknb ; nibble interleave of xmm0 with xmm1 and xmm2 with xmm3; results in xmm0, xmm4, xmm2, xmm6
- punpck bw, 0, 2, 4, 6, 1, 3 ; byte interleave; results in xmm0, xmm1, xmm2, xmm3
- punpck wd, 0, 2, 1, 3, 4, 6 ; word interleave; results in xmm0, xmm4, xmm2, xmm6
- pshufd xmm0, xmm0, 0d8h ; 0d8h swaps the two middle dwords
- pshufd xmm2, xmm2, 0d8h
- pshufd xmm4, xmm4, 0d8h
- pshufd xmm6, xmm6, 0d8h
- punpck qdq, 0, 2, 4, 6, 1, 3 ; quadword interleave; the four output rows are xmm0, xmm2, xmm1, xmm3
- pshuflw xmm1, xmm1, 0b1h ; swap adjacent words in the low half of the third row
- pshuflw xmm3, xmm3, 0b1h ; and of the fourth row
- pshufhw xmm1, xmm1, 0b1h ; swap adjacent words in the high half of the third row
- pshufhw xmm3, xmm3, 0b1h ; and of the fourth row
- movdqa [edx], xmm0
- movdqa [edx+ebx], xmm2
- lea edx, [edx+ebx*2] ; advance two rows
- movdqa [edx], xmm1
- movdqa [edx+ebx], xmm3
- lea edx, [edx+ebx*2] ; advance two rows
- ; col 1, 3 (second 64 bytes of this pass; here the first two rows get the word swap)
- movdqa xmm0, [ecx+16*4]
- movdqa xmm1, [ecx+16*5]
- movdqa xmm4, [ecx+16*6]
- movdqa xmm3, [ecx+16*7]
- punpck dq, 0, 4, 1, 3, 2, 6
- punpck dq, 0, 2, 4, 6, 1, 3
- punpcknb
- punpck bw, 0, 2, 4, 6, 1, 3
- punpck wd, 0, 2, 1, 3, 4, 6
- pshufd xmm0, xmm0, 0d8h
- pshufd xmm2, xmm2, 0d8h
- pshufd xmm4, xmm4, 0d8h
- pshufd xmm6, xmm6, 0d8h
- punpck qdq, 0, 2, 4, 6, 1, 3
- pshuflw xmm0, xmm0, 0b1h ; swap adjacent words in the low half of the first row
- pshuflw xmm2, xmm2, 0b1h ; and of the second row
- pshufhw xmm0, xmm0, 0b1h ; swap adjacent words in the high half of the first row
- pshufhw xmm2, xmm2, 0b1h ; and of the second row
- movdqa [edx], xmm0
- movdqa [edx+ebx], xmm2
- lea edx, [edx+ebx*2]
- movdqa [edx], xmm1
- movdqa [edx+ebx], xmm3
- lea edx, [edx+ebx*2]
- add ecx, 128 ; next 128 source bytes
- dec eax
- jnz @B
- pop ebx
-
- ret 4 ; callee removes the stack argument
- @unSwizzleBlock4_sse2@12 endp
- ;
- ; unSwizzleBlock8HP
- ;
- ; ecx = source block (256 bytes, read with aligned 16-byte loads)
- ; edx = destination (eight rows of 8 bytes)
- ; stack argument = distance in bytes between destination rows
- ; Every output byte is bits 24..31 of one source dword.
- ; Overwrites eax, ecx, edx, xmm0-xmm4 and xmm6; ebx is saved and restored.
- ;
- @unSwizzleBlock8HP_sse2@12 proc public
- push ebx
- mov eax, 4 ; four passes, two rows per pass
- mov ebx, [esp+8] ; row pitch
- align 16
- @@:
- movdqa xmm0, [ecx]
- movdqa xmm1, [ecx+16]
- movdqa xmm2, [ecx+32]
- movdqa xmm3, [ecx+48]
- add ecx, 64 ; the source pointer is not used again in this pass
- ; after this interleave xmm0, xmm2 hold the first row and xmm4, xmm6 the second
- punpck qdq, 0, 2, 1, 3, 4, 6
- ; first row: isolate the top byte of each dword and narrow to words
- psrld xmm0, 24
- psrld xmm2, 24
- packssdw xmm0, xmm2
- ; second row likewise
- psrld xmm4, 24
- psrld xmm6, 24
- packssdw xmm4, xmm6
- ; narrow to bytes: low quadword = first row, high quadword = second row
- packuswb xmm0, xmm4
- movlps qword ptr [edx], xmm0
- movhps qword ptr [edx+ebx], xmm0
- lea edx, [edx+ebx*2]
- sub eax, 1
- jnz @B
- pop ebx
- ret 4
- @unSwizzleBlock8HP_sse2@12 endp
- ;
- ; unSwizzleBlock4HLP
- ;
- ; ecx = source block (256 bytes, read with aligned 16-byte loads)
- ; edx = destination (eight rows of 8 bytes)
- ; stack argument = distance in bytes between destination rows
- ; Every output byte is bits 24..27 of one source dword.
- ; Overwrites eax, ecx, edx, xmm0-xmm4, xmm6 and xmm7; ebx is saved and restored.
- ;
- @unSwizzleBlock4HLP_sse2@12 proc public
- push ebx
- mov ebx, [esp+8] ; row pitch
- mov eax, 0f0f0f0fh ; low-nibble mask
- movd xmm7, eax
- pshufd xmm7, xmm7, 0 ; mask replicated into all four dwords
- mov eax, 4 ; four passes, two rows per pass
- align 16
- @@:
- movdqa xmm0, [ecx]
- movdqa xmm1, [ecx+16]
- movdqa xmm2, [ecx+32]
- movdqa xmm3, [ecx+48]
- add ecx, 64 ; the source pointer is not used again in this pass
- ; after this interleave xmm0, xmm2 hold the first row and xmm4, xmm6 the second
- punpck qdq, 0, 2, 1, 3, 4, 6
- ; first row: isolate the top byte of each dword and narrow to words
- psrld xmm0, 24
- psrld xmm2, 24
- packssdw xmm0, xmm2
- ; second row likewise
- psrld xmm4, 24
- psrld xmm6, 24
- packssdw xmm4, xmm6
- ; narrow to bytes, then keep only the low nibble of each byte
- packuswb xmm0, xmm4
- pand xmm0, xmm7
- movlps qword ptr [edx], xmm0
- movhps qword ptr [edx+ebx], xmm0
- lea edx, [edx+ebx*2]
- sub eax, 1
- jnz @B
- pop ebx
- ret 4
- @unSwizzleBlock4HLP_sse2@12 endp
- ;
- ; unSwizzleBlock4HHP
- ;
- ; ecx = source block (256 bytes, read with aligned 16-byte loads)
- ; edx = destination (eight rows of 8 bytes)
- ; stack argument = distance in bytes between destination rows
- ; Every output byte is bits 28..31 of one source dword.
- ; Overwrites eax, ecx, edx, xmm0-xmm4 and xmm6; ebx is saved and restored.
- ;
- @unSwizzleBlock4HHP_sse2@12 proc public
- push ebx
- mov eax, 4 ; four passes, two rows per pass
- mov ebx, [esp+8] ; row pitch
- align 16
- @@:
- movdqa xmm0, [ecx]
- movdqa xmm1, [ecx+16]
- movdqa xmm2, [ecx+32]
- movdqa xmm3, [ecx+48]
- add ecx, 64 ; the source pointer is not used again in this pass
- ; after this interleave xmm0, xmm2 hold the first row and xmm4, xmm6 the second
- punpck qdq, 0, 2, 1, 3, 4, 6
- ; first row: isolate the top nibble of each dword and narrow to words
- psrld xmm0, 28
- psrld xmm2, 28
- packssdw xmm0, xmm2
- ; second row likewise
- psrld xmm4, 28
- psrld xmm6, 28
- packssdw xmm4, xmm6
- ; narrow to bytes: low quadword = first row, high quadword = second row
- packuswb xmm0, xmm4
- movlps qword ptr [edx], xmm0
- movhps qword ptr [edx+ebx], xmm0
- lea edx, [edx+ebx*2]
- sub eax, 1
- jnz @B
- pop ebx
- ret 4
- @unSwizzleBlock4HHP_sse2@12 endp
- ;
- ; unSwizzleBlock4P
- ;
- ; ecx = source block (256 bytes, read with aligned 16-byte loads)
- ; edx = destination (sixteen rows of 32 bytes, aligned 16-byte stores)
- ; stack argument = distance in bytes between destination rows
- ; Every source byte is split into its two nibbles and each nibble is
- ; written out as a byte of its own.
- ; Overwrites eax, edx, xmm0-xmm4, xmm6 and xmm7; esi and edi are saved
- ; and restored.
- ;
- ; Helper: splits the packed nibbles in rlo. rlo keeps the low nibbles,
- ; rhi receives the high nibbles moved down by four bits. rswap names the
- ; one of the two registers whose adjacent dwords are exchanged.
- ; Expects xmm7 to hold 0f0f0f0fh in every dword.
- unswz4p_split macro rlo, rhi, rswap
- movdqa rhi, xmm7
- pandn rhi, rlo
- pand rlo, xmm7
- pshufd rswap, rswap, 0b1h
- psrlq rhi, 4
- endm
-
- ; Helper: converts source vectors blk..blk+3 into four destination rows
- ; at edx. swaplow = 0 exchanges dwords in the high-nibble rows,
- ; swaplow = 1 exchanges them in the low-nibble rows.
- ; Expects esi = pitch and edi = 3 * pitch.
- unswz4p_column macro blk, swaplow
- movdqa xmm0, [ecx+16*(blk+0)]
- movdqa xmm1, [ecx+16*(blk+1)]
- movdqa xmm2, [ecx+16*(blk+2)]
- movdqa xmm3, [ecx+16*(blk+3)]
- punpck bw, 0, 2, 1, 3, 4, 6
- punpck wd, 0, 4, 2, 6, 1, 3
- punpck bw, 0, 4, 1, 3, 2, 6
- if swaplow
- unswz4p_split xmm0, xmm1, xmm0
- unswz4p_split xmm2, xmm3, xmm2
- else
- unswz4p_split xmm0, xmm1, xmm1
- unswz4p_split xmm2, xmm3, xmm3
- endif
- movdqa [edx], xmm0
- movdqa [edx+16], xmm2
- movdqa [edx+esi*2], xmm1
- movdqa [edx+esi*2+16], xmm3
- if swaplow
- unswz4p_split xmm4, xmm1, xmm4
- unswz4p_split xmm6, xmm3, xmm6
- else
- unswz4p_split xmm4, xmm1, xmm1
- unswz4p_split xmm6, xmm3, xmm3
- endif
- movdqa [edx+esi], xmm4
- movdqa [edx+esi+16], xmm6
- movdqa [edx+edi], xmm1
- movdqa [edx+edi+16], xmm3
- endm
-
- @unSwizzleBlock4P_sse2@12 proc public
- push esi
- push edi
- mov eax, 0f0f0f0fh ; low-nibble mask
- movd xmm7, eax
- pshufd xmm7, xmm7, 0 ; mask replicated into all four dwords
- mov esi, [esp+12] ; row pitch (above two saved registers and the return address)
- lea edi, [esi+esi*2] ; edi = 3 * pitch
- ; col 0
- unswz4p_column 0, 0
- lea edx, [edx+esi*4]
- ; col 1
- unswz4p_column 4, 1
- lea edx, [edx+esi*4]
- ; col 2
- unswz4p_column 8, 0
- lea edx, [edx+esi*4]
- ; col 3
- unswz4p_column 12, 1
- pop edi
- pop esi
- ret 4
- @unSwizzleBlock4P_sse2@12 endp
- ;
- ; swizzling
- ;
- ;
- ; SwizzleBlock32
- ;
- ; ecx = destination block (256 bytes, aligned 16-byte accesses)
- ; edx = source (eight rows of 32 bytes, aligned 16-byte loads)
- ; first stack argument = distance in bytes between source rows
- ; second stack argument = 32-bit write mask applied to every dword
- ; With a mask of 0ffffffffh the block is simply overwritten; otherwise
- ; each stored dword is (new and mask) or (old and not mask).
- ; Overwrites eax, ecx, edx and xmm0-xmm7; esi and edi are saved and restored.
- ;
- ; Helper: loads two source rows and pairs up their quadwords.
- ; Results are left in xmm0, xmm2, xmm4, xmm6 in destination order.
- sw32_fetch macro
- movdqa xmm0, [esi]
- movdqa xmm4, [esi+16]
- movdqa xmm1, [esi+edx]
- movdqa xmm5, [esi+edx+16]
- punpck qdq, 0, 4, 1, 5, 2, 6
- endm
-
- ; Helper: merges ra into destination vector slot and rb into slot+1
- ; under the mask in xmm7. Uses xmm3 and xmm5 as scratch.
- sw32_blend2 macro ra, rb, slot
- movdqa xmm3, xmm7
- pshufd xmm5, xmm7, 0e4h
- pandn xmm3, [edi+16*(slot+0)]
- pand ra, xmm7
- por ra, xmm3
- movdqa [edi+16*(slot+0)], ra
- pandn xmm5, [edi+16*(slot+1)]
- pand rb, xmm7
- por rb, xmm5
- movdqa [edi+16*(slot+1)], rb
- endm
-
- @SwizzleBlock32_sse2@16 proc public
- push esi
- push edi
- mov eax, [esp+16] ; write mask
- mov edi, ecx ; edi = destination block
- mov esi, edx ; esi = source rows
- mov edx, [esp+12] ; edx = source pitch
- mov ecx, 4 ; four passes, two source rows per pass
- cmp eax, 0ffffffffh
- jnz SwizzleBlock32_sse2@WM
- ; full mask: plain stores
- align 16
- @@:
- sw32_fetch
- movdqa [edi], xmm0
- movdqa [edi+16], xmm2
- movdqa [edi+32], xmm4
- movdqa [edi+48], xmm6
- lea esi, [esi+edx*2]
- add edi, 64
- dec ecx
- jnz @B
- pop edi
- pop esi
- ret 8
- ; partial mask: merge with what the destination already holds
- SwizzleBlock32_sse2@WM:
- movd xmm7, eax
- pshufd xmm7, xmm7, 0 ; mask replicated into all four dwords
- align 16
- @@:
- sw32_fetch
- sw32_blend2 xmm0, xmm2, 0
- sw32_blend2 xmm4, xmm6, 2
- lea esi, [esi+edx*2]
- add edi, 64
- dec ecx
- jnz @B
- pop edi
- pop esi
- ret 8
- @SwizzleBlock32_sse2@16 endp
- ;
- ; SwizzleBlock16
- ;
- ; ecx = destination block (256 bytes, aligned 16-byte stores)
- ; edx = source (eight rows of 32 bytes, aligned 16-byte loads)
- ; stack argument = distance in bytes between source rows
- ; Overwrites eax, ecx, edx and xmm0-xmm6; ebx is saved and restored.
- ;
- @SwizzleBlock16_sse2@12 proc public
- push ebx
- mov eax, 4 ; four passes, two source rows per pass
- mov ebx, [esp+8] ; source pitch
- align 16
- @@:
- movdqa xmm0, [edx]
- movdqa xmm1, [edx+16]
- movdqa xmm2, [edx+ebx]
- movdqa xmm3, [edx+ebx+16]
- lea edx, [edx+ebx*2] ; both rows are loaded; step to the next pair
- ; interleave the words of each row's two halves, then pair up
- ; the quadwords of the two rows
- punpck wd, 0, 2, 1, 3, 4, 6
- punpck qdq, 0, 4, 2, 6, 1, 5
- movdqa [ecx], xmm0
- movdqa [ecx+16], xmm1
- movdqa [ecx+32], xmm4
- movdqa [ecx+48], xmm5
- add ecx, 64
- sub eax, 1
- jnz @B
- pop ebx
- ret 4
- @SwizzleBlock16_sse2@12 endp
- ;
- ; SwizzleBlock8
- ;
- ; ecx = destination block (256 bytes, aligned 16-byte stores)
- ; edx = source (sixteen rows of 16 bytes, aligned 16-byte loads)
- ; stack argument = distance in bytes between source rows
- ; Overwrites eax, ecx, edx and xmm0-xmm6; ebx is saved and restored.
- ;
- ; Helper: loads the rows at edx and edx+ebx into ra and rb, exchanging
- ; adjacent dwords when swapdw is 1, then advances edx by two rows.
- sw8_rows macro ra, rb, swapdw
- if swapdw
- pshufd ra, [edx], 0b1h
- pshufd rb, [edx+ebx], 0b1h
- else
- movdqa ra, [edx]
- movdqa rb, [edx+ebx]
- endif
- lea edx, [edx+ebx*2]
- endm
-
- ; Helper: interleaves the four rows held in xmm0-xmm3 and stores the
- ; result to destination vectors slot..slot+3.
- sw8_emit macro slot
- punpck bw, 0, 2, 1, 3, 4, 6
- punpck wd, 0, 2, 4, 6, 1, 3
- punpck qdq, 0, 1, 2, 3, 4, 5
- movdqa [ecx+16*(slot+0)], xmm0
- movdqa [ecx+16*(slot+1)], xmm4
- movdqa [ecx+16*(slot+2)], xmm1
- movdqa [ecx+16*(slot+3)], xmm5
- endm
-
- @SwizzleBlock8_sse2@12 proc public
- push ebx
- mov ebx, [esp+8] ; source pitch
- mov eax, 2 ; two passes, eight source rows per pass
- align 16
- @@:
- ; col 0, 2: the second row pair is dword-swapped
- sw8_rows xmm0, xmm2, 0
- sw8_rows xmm1, xmm3, 1
- sw8_emit 0
- ; col 1, 3: the first row pair is dword-swapped
- sw8_rows xmm0, xmm2, 1
- sw8_rows xmm1, xmm3, 0
- sw8_emit 4
- add ecx, 128
- dec eax
- jnz @B
- pop ebx
- ret 4
- @SwizzleBlock8_sse2@12 endp
- ;
- ; SwizzleBlock4
- ;
- ; ecx = destination block (256 bytes, aligned 16-byte stores)
- ; edx = source (sixteen rows of 16 bytes, aligned 16-byte loads)
- ; stack argument = distance in bytes between source rows
- ; Overwrites eax, ecx, edx and xmm0-xmm7; ebx is saved and restored.
- ;
- ; Helper: consumes four source rows and writes destination vectors
- ; slot..slot+3. ra and rb name the two row registers whose adjacent
- ; words are exchanged before the nibble interleave.
- ; Expects xmm7 to hold 0f0f0f0fh in every dword.
- sw4_half macro ra, rb, slot
- movdqa xmm0, [edx]
- movdqa xmm2, [edx+ebx]
- lea edx, [edx+ebx*2]
- movdqa xmm1, [edx]
- movdqa xmm3, [edx+ebx]
- lea edx, [edx+ebx*2]
- pshuflw ra, ra, 0b1h
- pshuflw rb, rb, 0b1h
- pshufhw ra, ra, 0b1h
- pshufhw rb, rb, 0b1h
- punpcknb
- punpck bw, 0, 2, 4, 6, 1, 3
- punpck bw, 0, 2, 1, 3, 4, 6
- punpck qdq, 0, 4, 2, 6, 1, 3
- movdqa [ecx+16*(slot+0)], xmm0
- movdqa [ecx+16*(slot+1)], xmm1
- movdqa [ecx+16*(slot+2)], xmm4
- movdqa [ecx+16*(slot+3)], xmm3
- endm
-
- @SwizzleBlock4_sse2@12 proc public
- push ebx
- mov ebx, [esp+8] ; source pitch
- mov eax, 0f0f0f0fh ; low-nibble mask
- movd xmm7, eax
- pshufd xmm7, xmm7, 0 ; mask replicated into all four dwords
- mov eax, 2 ; two passes, eight source rows per pass
- align 16
- @@:
- ; col 0, 2: rows three and four are word-swapped
- sw4_half xmm1, xmm3, 0
- ; col 1, 3: rows one and two are word-swapped
- sw4_half xmm0, xmm2, 4
- add ecx, 128
- dec eax
- jnz @B
- pop ebx
- ret 4
- @SwizzleBlock4_sse2@12 endp
- ;
- ; swizzling with unaligned reads
- ;
- ;
- ; SwizzleBlock32u
- ;
- ; ecx = destination block (256 bytes, aligned 16-byte accesses)
- ; edx = source (eight rows of 32 bytes, read with unaligned loads)
- ; first stack argument = distance in bytes between source rows
- ; second stack argument = 32-bit write mask applied to every dword
- ; With a mask of 0ffffffffh the block is simply overwritten; otherwise
- ; each stored dword is (new and mask) or (old and not mask).
- ; Overwrites eax, ecx, edx and xmm0-xmm7; esi and edi are saved and restored.
- ;
- ; Helper: loads two source rows and pairs up their quadwords.
- ; Results are left in xmm0, xmm2, xmm4, xmm6 in destination order.
- sw32u_fetch macro
- movdqu xmm0, [esi]
- movdqu xmm4, [esi+16]
- movdqu xmm1, [esi+edx]
- movdqu xmm5, [esi+edx+16]
- punpck qdq, 0, 4, 1, 5, 2, 6
- endm
-
- ; Helper: merges ra into destination vector slot and rb into slot+1
- ; under the mask in xmm7. Uses xmm3 and xmm5 as scratch.
- sw32u_blend2 macro ra, rb, slot
- movdqa xmm3, xmm7
- pshufd xmm5, xmm7, 0e4h
- pandn xmm3, [edi+16*(slot+0)]
- pand ra, xmm7
- por ra, xmm3
- movdqa [edi+16*(slot+0)], ra
- pandn xmm5, [edi+16*(slot+1)]
- pand rb, xmm7
- por rb, xmm5
- movdqa [edi+16*(slot+1)], rb
- endm
-
- @SwizzleBlock32u_sse2@16 proc public
- push esi
- push edi
- mov eax, [esp+16] ; write mask
- mov edi, ecx ; edi = destination block
- mov esi, edx ; esi = source rows
- mov edx, [esp+12] ; edx = source pitch
- mov ecx, 4 ; four passes, two source rows per pass
- cmp eax, 0ffffffffh
- jnz SwizzleBlock32u_sse2@WM
- ; full mask: plain stores
- align 16
- @@:
- sw32u_fetch
- movdqa [edi], xmm0
- movdqa [edi+16], xmm2
- movdqa [edi+32], xmm4
- movdqa [edi+48], xmm6
- lea esi, [esi+edx*2]
- add edi, 64
- dec ecx
- jnz @B
- pop edi
- pop esi
- ret 8
- ; partial mask: merge with what the destination already holds
- SwizzleBlock32u_sse2@WM:
- movd xmm7, eax
- pshufd xmm7, xmm7, 0 ; mask replicated into all four dwords
- align 16
- @@:
- sw32u_fetch
- sw32u_blend2 xmm0, xmm2, 0
- sw32u_blend2 xmm4, xmm6, 2
- lea esi, [esi+edx*2]
- add edi, 64
- dec ecx
- jnz @B
- pop edi
- pop esi
- ret 8
- @SwizzleBlock32u_sse2@16 endp
- ;
- ; SwizzleBlock16u
- ;
- ; ecx = destination block (256 bytes, aligned 16-byte stores)
- ; edx = source (eight rows of 32 bytes, read with unaligned loads)
- ; stack argument = distance in bytes between source rows
- ; Overwrites eax, ecx, edx and xmm0-xmm6; ebx is saved and restored.
- ;
- @SwizzleBlock16u_sse2@12 proc public
- push ebx
- mov eax, 4 ; four passes, two source rows per pass
- mov ebx, [esp+8] ; source pitch
- align 16
- @@:
- movdqu xmm0, [edx]
- movdqu xmm1, [edx+16]
- movdqu xmm2, [edx+ebx]
- movdqu xmm3, [edx+ebx+16]
- lea edx, [edx+ebx*2] ; both rows are loaded; step to the next pair
- ; interleave the words of each row's two halves, then pair up
- ; the quadwords of the two rows
- punpck wd, 0, 2, 1, 3, 4, 6
- punpck qdq, 0, 4, 2, 6, 1, 5
- movdqa [ecx], xmm0
- movdqa [ecx+16], xmm1
- movdqa [ecx+32], xmm4
- movdqa [ecx+48], xmm5
- add ecx, 64
- sub eax, 1
- jnz @B
- pop ebx
- ret 4
- @SwizzleBlock16u_sse2@12 endp
- ;
- ; SwizzleBlock8u
- ;
- ; ecx = destination block (256 bytes, aligned 16-byte stores)
- ; edx = source (sixteen rows of 16 bytes, read with unaligned loads)
- ; stack argument = distance in bytes between source rows
- ; Overwrites eax, ecx, edx and xmm0-xmm6; ebx is saved and restored.
- ;
- ; Helper: loads the rows at edx and edx+ebx into ra and rb, exchanging
- ; adjacent dwords when swapdw is 1, then advances edx by two rows.
- sw8u_rows macro ra, rb, swapdw
- movdqu ra, [edx]
- movdqu rb, [edx+ebx]
- if swapdw
- pshufd ra, ra, 0b1h
- pshufd rb, rb, 0b1h
- endif
- lea edx, [edx+ebx*2]
- endm
-
- ; Helper: interleaves the four rows held in xmm0-xmm3 and stores the
- ; result to destination vectors slot..slot+3.
- sw8u_emit macro slot
- punpck bw, 0, 2, 1, 3, 4, 6
- punpck wd, 0, 2, 4, 6, 1, 3
- punpck qdq, 0, 1, 2, 3, 4, 5
- movdqa [ecx+16*(slot+0)], xmm0
- movdqa [ecx+16*(slot+1)], xmm4
- movdqa [ecx+16*(slot+2)], xmm1
- movdqa [ecx+16*(slot+3)], xmm5
- endm
-
- @SwizzleBlock8u_sse2@12 proc public
- push ebx
- mov ebx, [esp+8] ; source pitch
- mov eax, 2 ; two passes, eight source rows per pass
- align 16
- @@:
- ; col 0, 2: the second row pair is dword-swapped
- sw8u_rows xmm0, xmm2, 0
- sw8u_rows xmm1, xmm3, 1
- sw8u_emit 0
- ; col 1, 3: the first row pair is dword-swapped
- sw8u_rows xmm0, xmm2, 1
- sw8u_rows xmm1, xmm3, 0
- sw8u_emit 4
- add ecx, 128
- dec eax
- jnz @B
- pop ebx
- ret 4
- @SwizzleBlock8u_sse2@12 endp
- ;
- ; SwizzleBlock4u
- ;
- ; ecx = destination block (256 bytes, aligned 16-byte stores)
- ; edx = source (sixteen rows of 16 bytes, read with unaligned loads)
- ; stack argument = distance in bytes between source rows
- ; Overwrites eax, ecx, edx and xmm0-xmm7; ebx is saved and restored.
- ;
- ; Helper: consumes four source rows and writes destination vectors
- ; slot..slot+3. ra and rb name the two row registers whose adjacent
- ; words are exchanged before the nibble interleave.
- ; Expects xmm7 to hold 0f0f0f0fh in every dword.
- sw4u_half macro ra, rb, slot
- movdqu xmm0, [edx]
- movdqu xmm2, [edx+ebx]
- lea edx, [edx+ebx*2]
- movdqu xmm1, [edx]
- movdqu xmm3, [edx+ebx]
- lea edx, [edx+ebx*2]
- pshuflw ra, ra, 0b1h
- pshuflw rb, rb, 0b1h
- pshufhw ra, ra, 0b1h
- pshufhw rb, rb, 0b1h
- punpcknb
- punpck bw, 0, 2, 4, 6, 1, 3
- punpck bw, 0, 2, 1, 3, 4, 6
- punpck qdq, 0, 4, 2, 6, 1, 3
- movdqa [ecx+16*(slot+0)], xmm0
- movdqa [ecx+16*(slot+1)], xmm1
- movdqa [ecx+16*(slot+2)], xmm4
- movdqa [ecx+16*(slot+3)], xmm3
- endm
-
- @SwizzleBlock4u_sse2@12 proc public
- push ebx
- mov ebx, [esp+8] ; source pitch
- mov eax, 0f0f0f0fh ; low-nibble mask
- movd xmm7, eax
- pshufd xmm7, xmm7, 0 ; mask replicated into all four dwords
- mov eax, 2 ; two passes, eight source rows per pass
- align 16
- @@:
- ; col 0, 2: rows three and four are word-swapped
- sw4u_half xmm1, xmm3, 0
- ; col 1, 3: rows one and two are word-swapped
- sw4u_half xmm0, xmm2, 4
- add ecx, 128
- dec eax
- jnz @B
- pop ebx
- ret 4
- @SwizzleBlock4u_sse2@12 endp
- end