processingSSE2.inl
资源名称:estereo2.zip [点击查看]
上传用户:fengshi120
上传日期:2014-07-17
资源大小:6155k
文件大小:9k
源码类别:
3D图形编程
开发平台:
C/C++
- /***************************************************************************
- *
- * Copyright 2000 by David Demirdjian. All rights reserved.
- *
- * Developed by David Demirdjian
- *
- * Permission to use, copy, or modify this software and its documentation
- * for educational and research purposes only and without fee is hereby
- * granted, provided that this copyright notice and the original authors's
- * names appear on all copies and supporting documentation. If individual
- * files are separated from this distribution directory structure, this
- * copyright notice must be included. For any other uses of this software,
- * in original or modified form, including but not limited to distribution
- * in whole or in part, specific prior permission must be obtained from
- * MIT. These programs shall not be used, rewritten, or adapted as the
- * basis of a commercial software or hardware product without first
- * obtaining appropriate licenses from David Demirdjian. The author makes
- * no representations about the suitability of this software for any purpose.
- * It is provided "as is" without express or implied warranty.
- *
- **************************************************************************/
- #include "stereoMatching.h"
- #include "processingmmx.h"
- // ************************************************************
- // ************************************************************
- // *** List of functions (SSE2) for image processing
- // ************************************************************
- // ************************************************************
- // Src1, Src2 and Dest suppose to point on 16-bytes memory block
- inline int ImgSubandAdd_sse2(const unsigned char *Src1, const unsigned char *Src2,
- const unsigned char *Src3, unsigned char *Dest, int l)
- {
- if (l < 8) return 0; // image size must be at least 8 bytes
- __asm
- {
- mov eax, Src1
- mov ebx, Src2
- mov edx, Src3
- mov edi, Dest
- mov ecx, l
- shr ecx, 4
- align 16
- inner_loop:
- movdqa xmm1,[eax] // xmm1=src1
- movdqa xmm2,[ebx] // mm2=src2
- movdqa xmm4,xmm1 // mm4=mm1
- psubusb xmm4,xmm2 // mm4 = src1 - src2
- movdqu xmm3,[edx] // mm3=src3
- psubusb xmm2,xmm1 // mm2 = src2 - src1
- movdqa xmm5,xmm1 // mm5=src1
- por xmm2,xmm4 // mm2=|src1-src2|
- psubusb xmm5,xmm3 // mm4=src1-src3
- psubusb xmm3,xmm1 // mm3=src3-src1
- por xmm3,xmm5 // mm3=|src1-src3|
- paddusb xmm2,xmm3 // mm2 = |src1-src2|+|src1-src3|
- movdqa [edi], xmm2
- add eax,16
- add ebx,16
- add edx,16
- add edi,16
- dec ecx
- jnz inner_loop
- emms
- }
- return 1;
- }
// Accumulates one image row into the running column sums and steps to the
// next row. Meant to be expanded inside an __asm block where:
//   edx       = address of the current row (16 ushort values are read)
//   edi       = distance to the next row, in bytes
//   xmm3/xmm2 = running sums of the low / high 8 words
// The adds saturate at 65535 (paddusw). edx is advanced by edi.
// Every line of the definition ends with a line continuation so that the
// braces and instructions belong to the macro.
#define macro_add_sse2 __asm        \
{                                   \
    __asm paddusw xmm3, [edx]       \
    __asm paddusw xmm2, [edx+16]    \
    __asm add     edx, edi          \
}
- inline void avg_Col_5_sse2(ushort* im, uchar* im_out, int dataSize, int width)
- {
- __asm {
- mov edi, width
- shl edi, 1 // edi = 2*width
- mov eax, dataSize
- mov ecx, im_out
- mov ebx, im
- sub ebx, edi
- sub ebx, edi // ebx = ebx-4*width
- test eax, eax // Is there anything to do?"
- jz end_sum_loop // Jump out if necessary
- row_sum_loop:
- test eax, eax // Is there anything to do?
- jz end_sum_loop // Jump out if necessary
- mov edx, ebx
- add ebx, 32
- // 1
- movdqa xmm3, [edx] // xmm3 = 8 words of im
- movdqa xmm2, [edx+16] // xmm3 = 8 words of im
- add edx, edi
- macro_add_sse2
- macro_add_sse2
- macro_add_sse2
- macro_add_sse2
- // divide results by ...
- psrlw xmm3, 3
- psrlw xmm2, 3
- // convert [xmm2 xmm3] as 8 words
- packuswb xmm3,xmm2
- movdqa [ecx], xmm3
- sub eax, 16 // Update the number of points left
- add ecx, 16 // Update output pointer
- jmp row_sum_loop // Loop
- //Cleanup
- end_sum_loop:
- emms
- }
- }
- inline void avg_Col_7_sse2(ushort* im, uchar* im_out, int dataSize, int width)
- {
- __asm {
- mov edi, width
- shl edi, 1 // edi = 2*width
- mov eax, dataSize
- mov ecx, im_out
- mov ebx, im
- sub ebx, edi
- sub ebx, edi
- sub ebx, edi // ebx = ebx-4*width
- test eax, eax // Is there anything to do?"
- jz end_sum_loop // Jump out if necessary
- row_sum_loop:
- test eax, eax // Is there anything to do?
- jz end_sum_loop // Jump out if necessary
- mov edx, ebx
- add ebx, 32
- // 1
- movdqa xmm3, [edx] // xmm3 = 8 words of im
- movdqa xmm2, [edx+16] // xmm3 = 8 words of im
- add edx, edi
- macro_add_sse2
- macro_add_sse2
- macro_add_sse2
- macro_add_sse2
- macro_add_sse2
- macro_add_sse2
- // divide results by ...
- psrlw xmm3, 3
- psrlw xmm2, 3
- // convert [xmm2 xmm3] as 8 words
- packuswb xmm3,xmm2
- movdqa [ecx], xmm3
- sub eax, 16 // Update the number of points left
- add ecx, 16 // Update output pointer
- jmp row_sum_loop // Loop
- //Cleanup
- end_sum_loop:
- emms
- }
- }
- inline void avg_Col_9_sse2(ushort* im, uchar* im_out, int dataSize, int width)
- {
- __asm {
- mov edi, width
- shl edi, 1 // edi = 2*width
- mov eax, dataSize
- mov ecx, im_out
- mov ebx, im
- sub ebx, edi
- sub ebx, edi
- sub ebx, edi
- sub ebx, edi // ebx = ebx-4*width
- test eax, eax // Is there anything to do?"
- jz end_sum_loop // Jump out if necessary
- row_sum_loop:
- test eax, eax // Is there anything to do?
- jz end_sum_loop // Jump out if necessary
- mov edx, ebx
- add ebx, 32
- // 1
- movdqa xmm3, [edx] // xmm3 = 8 words of im
- movdqa xmm2, [edx+16] // xmm3 = 8 words of im
- add edx, edi
- macro_add_sse2
- macro_add_sse2
- macro_add_sse2
- macro_add_sse2
- macro_add_sse2
- macro_add_sse2
- macro_add_sse2
- macro_add_sse2
- // divide results by ...
- psrlw xmm3, 3
- psrlw xmm2, 3
- // convert [xmm2 xmm3] as 8 words
- packuswb xmm3,xmm2
- movdqa [ecx], xmm3
- sub eax, 16 // Update the number of points left
- add ecx, 16 // Update output pointer
- jmp row_sum_loop // Loop
- //Cleanup
- end_sum_loop:
- emms
- }
- }
- inline void avg_Col_11_sse2(ushort* im, uchar* im_out, int dataSize, int width)
- {
- __asm {
- mov edi, width
- shl edi, 1 // edi = 2*width
- mov eax, dataSize
- mov ecx, im_out
- mov ebx, im
- sub ebx, edi
- sub ebx, edi
- sub ebx, edi
- sub ebx, edi
- sub ebx, edi // ebx = ebx-4*width
- test eax, eax // Is there anything to do?"
- jz end_sum_loop // Jump out if necessary
- row_sum_loop:
- test eax, eax // Is there anything to do?
- jz end_sum_loop // Jump out if necessary
- mov edx, ebx
- add ebx, 32
- // 1
- movdqa xmm3, [edx] // xmm3 = 8 words of im
- movdqa xmm2, [edx+16] // xmm3 = 8 words of im
- add edx, edi
- macro_add_sse2
- macro_add_sse2
- macro_add_sse2
- macro_add_sse2
- macro_add_sse2
- macro_add_sse2
- macro_add_sse2
- macro_add_sse2
- macro_add_sse2
- macro_add_sse2
- // divide results by ...
- psrlw xmm3, 3
- psrlw xmm2, 3
- // convert [xmm2 xmm3] as 8 words
- packuswb xmm3,xmm2
- movdqa [ecx], xmm3
- sub eax, 16 // Update the number of points left
- add ecx, 16 // Update output pointer
- jmp row_sum_loop // Loop
- //Cleanup
- end_sum_loop:
- emms
- }
- }
- inline void avg_Col_13_sse2(ushort* im, uchar* im_out, int dataSize, int width)
- {
- __asm {
- mov edi, width
- shl edi, 1 // edi = 2*width
- mov eax, dataSize
- mov ecx, im_out
- mov ebx, im
- sub ebx, edi
- sub ebx, edi
- sub ebx, edi
- sub ebx, edi
- sub ebx, edi
- sub ebx, edi // ebx = ebx-4*width
- test eax, eax // Is there anything to do?"
- jz end_sum_loop // Jump out if necessary
- row_sum_loop:
- test eax, eax // Is there anything to do?
- jz end_sum_loop // Jump out if necessary
- mov edx, ebx
- add ebx, 32
- // 1
- movdqa xmm3, [edx] // xmm3 = 8 words of im
- movdqa xmm2, [edx+16] // xmm3 = 8 words of im
- add edx, edi
- macro_add_sse2
- macro_add_sse2
- macro_add_sse2
- macro_add_sse2
- macro_add_sse2
- macro_add_sse2
- macro_add_sse2
- macro_add_sse2
- macro_add_sse2
- macro_add_sse2
- macro_add_sse2
- macro_add_sse2
- // divide results by ...
- psrlw xmm3, 3
- psrlw xmm2, 3
- // convert [xmm2 xmm3] as 8 words
- packuswb xmm3,xmm2
- movdqa [ecx], xmm3
- sub eax, 16 // Update the number of points left
- add ecx, 16 // Update output pointer
- jmp row_sum_loop // Loop
- //Cleanup
- end_sum_loop:
- emms
- }
- }
- // apply vertical mask 1/16*[1 1 1 ... 1]^T to 'im'
- // result in 'im_out'
- inline void avg_Col_sse2(ushort* im, uchar* im_out, int dataSize, int width, int sizeMask)
- {
- switch (sizeMask)
- {
- case 5: avg_Col_5_sse2(im,im_out,dataSize,width);
- break;
- case 7: avg_Col_7_sse2(im,im_out,dataSize,width);
- break;
- case 9: avg_Col_9_sse2(im,im_out,dataSize,width);
- break;
- case 11: avg_Col_11_sse2(im,im_out,dataSize,width);
- break;
- case 13: avg_Col_13_sse2(im,im_out,dataSize,width);
- break;
- case 15: avg_Col_15(im,im_out,dataSize,width);
- break;
- case 17: avg_Col_17(im,im_out,dataSize,width);
- break;
- default: if (sizeMask<5) avg_Col_5_sse2(im,im_out,dataSize,width);
- else if (sizeMask>17) avg_Col_17(im,im_out,dataSize,width);
- }
- }