processingMMX.inl
资源名称:estereo2.zip [点击查看]
上传用户:fengshi120
上传日期:2014-07-17
资源大小:6155k
文件大小:54k
源码类别:
3D图形编程
开发平台:
C/C++
- /***************************************************************************
- *
- * Copyright 2000 by David Demirdjian. All rights reserved.
- *
- * Developed by David Demirdjian
- *
- * Permission to use, copy, or modify this software and its documentation
- * for educational and research purposes only and without fee is hereby
- * granted, provided that this copyright notice and the original authors's
- * names appear on all copies and supporting documentation. If individual
- * files are separated from this distribution directory structure, this
- * copyright notice must be included. For any other uses of this software,
- * in original or modified form, including but not limited to distribution
- * in whole or in part, specific prior permission must be obtained from
- * MIT. These programs shall not be used, rewritten, or adapted as the
- * basis of a commercial software or hardware product without first
- * obtaining appropriate licenses from David Demirdjian. The author makes
- * no representations about the suitability of this software for any purpose.
- * It is provided "as is" without express or implied warranty.
- *
- **************************************************************************/
- #include "stereoMatching.h"
- #include "processingMMX.h"
- // ************************************************************
- // ************************************************************
- // *** List of functions (MMX) for image processing
- // ************************************************************
- // ************************************************************
- // shrink images by a factor 'fact'. eg if fact = 2, out will be twice as small as src
- inline void shrinkImages(uchar* dst, const uchar* src, int width, int height, int fact)
- {
- int width_f = width/fact;
- int siz = width*height/(fact*fact);
- if (fact>0) {
- for (int i=0,j=0; i<siz; ++i,++j,++dst,src+=fact) {
- *dst = *src;
- if (j==width_f-1) {
- src+=((fact-1)*width);
- j=0;
- }
- }
- }
- }
- // translate image of 'tx' pixels to the right
- // (or left if tx<0)
- void translateImage(int tx, uchar* data, int siz)
- {
- if (tx==0) return;
- if (tx>0) {
- // dest. must be after src... to avoid overwriting data
- data += (siz-tx);
- uchar* dataDst = data+tx;
- for (int i=0; i<siz-tx; ++i,--dataDst,--data) {
- *dataDst = *data;
- }
- } else { // tx<0
- data -= tx;
- uchar* dataDst = data+tx;
- for (int i=0; i<siz-tx; ++i,++dataDst,++data) {
- *dataDst = *data;
- }
- }
- }
- void normalizeImages(uchar* data1, uchar* data2, uchar* data3, int siz)
- {
- float a1 = pixelMean(data1,siz);
- float a2 = pixelMean(data2,siz);
- float a3 = pixelMean(data3,siz);
- float minI = __min(a1, __min(a2,a3));
- if (a2==minI) {
- multiply(data1, a2/a1, siz);
- multiply(data3, a2/a3, siz);
- } else if (a1==minI) {
- multiply(data2, a1/a2, siz);
- multiply(data3, a1/a3, siz);
- } else {
- multiply(data2, a3/a2, siz);
- multiply(data1, a3/a1, siz);
- }
- }
- void normalizeImages(const uchar* data1, const uchar* data2, const uchar* data3,
- uchar* out1, uchar* out2, uchar* out3, int siz)
- {
- float a1 = pixelMean(data1,siz);
- float a2 = pixelMean(data2,siz);
- float a3 = pixelMean(data3,siz);
- float minI = __min(a1, __min(a2,a3));
- if (a2==minI) {
- multiply(data1, out1, a2/a1, siz);
- multiply(data3, out3, a2/a3, siz);
- copyMMX(out2, data2, siz);
- } else if (a1==minI) {
- multiply(data2, out2, a1/a2, siz);
- multiply(data3, out3, a1/a3, siz);
- copyMMX(out1, data1, siz);
- } else {
- multiply(data2, out2, a3/a2, siz);
- multiply(data1, out1, a3/a1, siz);
- copyMMX(out3, data3, siz);
- }
- }
- void normalizeImages(uchar* data1, uchar* data2, int siz)
- {
- float a1 = pixelMean(data1,siz);
- float a2 = pixelMean(data2,siz);
- // normalize the image which average intensity is the highest
- if (a1>a2)
- multiply(data1, a2/a1, siz);
- else
- multiply(data2, a1/a2, siz);
- }
- void normalizeImages(const uchar* data1, const uchar* data2,
- uchar* out1, uchar* out2, int siz)
- {
- float a1 = pixelMean(data1,siz);
- float a2 = pixelMean(data2,siz);
- // normalize the image which average intensity is the highest
- if (a1>a2) {
- multiply(data1, out1, a2/a1, siz);
- copyMMX(out2, data2, siz);
- } else {
- multiply(data2, out2, a1/a2, siz);
- copyMMX(out1, data1, siz);
- }
- }
- // ImgSubandAdd (3 sources): Dest = saturate(|Src1 - Src2| + |Src1 - Src3|), byte by byte.
// Processes l/8 blocks of 8 bytes (ecx = l >> 3); a trailing l % 8 bytes are not touched.
// Returns 0 (nothing done) when l < 8, otherwise 1.
// Written as MSVC-style inline assembly using MMX registers; emms is issued at the end.
- // TODO? divide the result by 2 (shift)
- inline int ImgSubandAdd(const unsigned char *Src1, const unsigned char *Src2,
- const unsigned char *Src3, unsigned char *Dest, int l)
- {
- if (l < 8) return 0; // image size must be at least 8 bytes
- __asm
- {
- mov eax, Src1
- mov ebx, Src2
- mov edx, Src3
- mov edi, Dest
- mov ecx, l
- shr ecx, 3 // ecx = number of 8-byte blocks
- align 16
- inner_loop:
- movq mm1,[eax] // mm1=src1
- movq mm2,[ebx] // mm2=src2
- movq mm4,mm1 // mm4=src1
- psubusb mm4,mm2 // mm4 = src1 - src2 (unsigned saturating: 0 where src2 > src1)
- movq mm3,[edx] // mm3=src3
- psubusb mm2,mm1 // mm2 = src2 - src1 (unsigned saturating)
- movq mm5,mm1 // mm5=src1
- por mm2,mm4 // mm2=|src1-src2| (one of the two differences is always 0)
- psubusb mm5,mm3 // mm5=src1-src3
- psubusb mm3,mm1 // mm3=src3-src1
- por mm3,mm5 // mm3=|src1-src3|
- paddusb mm2,mm3 // mm2 = |src1-src2|+|src1-src3|, saturated at 255
- movq [edi], mm2
- add eax,8
- add ebx,8
- add edx,8
- add edi,8
- dec ecx
- jnz inner_loop
- emms
- }
- return 1;
- }
- // ImgSubandAdd (2 sources): Dest = |Src1 - Src2|, byte by byte.
// Processes l/8 blocks of 8 bytes (ecx = l >> 3); a trailing l % 8 bytes are not touched.
// Returns 0 (nothing done) when l < 8, otherwise 1.
// NOTE(review): 'Dest' is declared 'const unsigned char *' although the assembly stores
// through it (movq [edi], mm2) - the const looks unintended; confirm against the header.
- // TODO? divide the result by 2 (shift)
- inline int ImgSubandAdd(const unsigned char *Src1, const unsigned char *Src2,
- const unsigned char *Dest, int l)
- {
- if (l < 8) return 0; // image size must be at least 8 bytes
- __asm
- {
- mov eax, Src1
- mov ebx, Src2
- mov edi, Dest
- mov ecx, l
- shr ecx, 3 // ecx = number of 8-byte blocks
- align 16
- inner_loop:
- movq mm1,[eax] // mm1=src1
- movq mm2,[ebx] // mm2=src2
- movq mm4,mm1 // mm4=src1
- psubusb mm4,mm2 // mm4 = src1 - src2 (unsigned saturating: 0 where src2 > src1)
- psubusb mm2,mm1 // mm2 = src2 - src1 (unsigned saturating)
- por mm2,mm4 // mm2=|src1-src2| (one of the two differences is always 0)
- movq [edi], mm2
- add eax,8
- add ebx,8
- add edi,8
- dec ecx
- jnz inner_loop
- emms
- }
- return 1;
- }
// _ABS_DIFF_TRI(Z): one unrolled step of a three-image absolute-difference loop.
// Register contract (MSVC inline assembly, MMX):
//   in : mm1 = src1 block, mm2 = src2 block for this step,
//        mm3 = src3 block already shifted right by one byte,
//        mm7 = bytes of the next src3 quadword to merge into the top of mm3,
//        mm0 = next src3 quadword
//   out: [edi + imageSize] = saturate(|mm1 - mm2| + |mm1 - mm3|), with ebx advanced by
//        'width' and edi by 'imageSize' ('width'/'imageSize' must be visible where the
//        macro is expanded); mm2 reloaded from the new [ebx]; mm7 = mm0 << Z and mm3
//        shifted right by one more byte, ready for the next step.
// Every line of the body ends in a backslash so that the whole __asm block belongs to
// the macro; comments inside the body therefore have to be /* */ comments.
#define _ABS_DIFF_TRI(Z) __asm                                                         \
{                                                                                      \
    __asm movq mm4,mm1          /* mm4 = src1 */                                       \
    __asm add ebx, width        /* next src2 row */                                    \
    __asm add edi, imageSize    /* next output plane */                                \
    __asm por mm3,mm7           /* mm3 = complete shifted src3 for this step */        \
    __asm movq mm7, mm0         /* mm7 = next src3 quadword */                         \
    __asm psubusb mm4,mm2       /* mm4 = src1 - src2 (saturating) */                   \
    __asm psubusb mm2,mm1       /* mm2 = src2 - src1 (saturating) */                   \
    __asm psllq mm7,Z           /* keep the bytes needed by the next step */           \
    __asm movq mm5,mm1          /* mm5 = src1 */                                       \
    __asm por mm4,mm2           /* mm4 = |src1-src2| */                                \
    __asm movq mm2,[ebx]        /* mm2 = src2 + 'width' = new src2 */                  \
    __asm psubusb mm5,mm3       /* mm5 = src1 - src3 */                                \
    __asm movq mm6,mm3          /* mm6 = src3 */                                       \
    __asm psubusb mm6,mm1       /* mm6 = src3 - src1 */                                \
    __asm por mm6,mm5           /* mm6 = |src1-src3| */                                \
    __asm paddusb mm4,mm6       /* mm4 = |src1-src2|+|src1-src3| */                    \
    __asm movq [edi], mm4       /* store; mm1 is still src1 */                         \
    __asm psrlq mm3, 8          /* mm3 = src3 shifted by one more byte, top byte 0 */  \
}
// _ABS_DIFF_TRI_prefetch(Z, X): same step as _ABS_DIFF_TRI(Z), with an additional
// 'prefetcht0 [ebx + X]' issued after ebx has been advanced by 'width'.
// Register contract (MSVC inline assembly, MMX):
//   in : mm1 = src1 block, mm2 = src2 block for this step,
//        mm3 = src3 block already shifted right by one byte,
//        mm7 = bytes of the next src3 quadword to merge into the top of mm3,
//        mm0 = next src3 quadword
//   out: [edi + imageSize] = saturate(|mm1 - mm2| + |mm1 - mm3|), with ebx advanced by
//        'width' and edi by 'imageSize'; mm2 reloaded from the new [ebx];
//        mm7 = mm0 << Z and mm3 shifted right by one more byte.
// Every line of the body ends in a backslash so that the whole __asm block belongs to
// the macro; comments inside the body therefore have to be /* */ comments.
#define _ABS_DIFF_TRI_prefetch(Z, X) __asm                                             \
{                                                                                      \
    __asm movq mm4,mm1          /* mm4 = src1 */                                       \
    __asm add ebx, width        /* next src2 row */                                    \
    __asm add edi, imageSize    /* next output plane */                                \
    __asm por mm3,mm7           /* mm3 = complete shifted src3 for this step */        \
    __asm movq mm7, mm0         /* mm7 = next src3 quadword */                         \
    __asm psubusb mm4,mm2       /* mm4 = src1 - src2 (saturating) */                   \
    __asm psubusb mm2,mm1       /* mm2 = src2 - src1 (saturating) */                   \
    __asm prefetcht0 [ebx + X]  /* cache hint for upcoming src2 data */                \
    __asm psllq mm7,Z           /* keep the bytes needed by the next step */           \
    __asm movq mm5,mm1          /* mm5 = src1 */                                       \
    __asm por mm4,mm2           /* mm4 = |src1-src2| */                                \
    __asm movq mm2,[ebx]        /* mm2 = src2 + 'width' = new src2 */                  \
    __asm psubusb mm5,mm3       /* mm5 = src1 - src3 */                                \
    __asm movq mm6,mm3          /* mm6 = src3 */                                       \
    __asm psubusb mm6,mm1       /* mm6 = src3 - src1 */                                \
    __asm por mm6,mm5           /* mm6 = |src1-src3| */                                \
    __asm paddusb mm4,mm6       /* mm4 = |src1-src2|+|src1-src3| */                    \
    __asm movq [edi], mm4       /* store; mm1 is still src1 */                         \
    __asm psrlq mm3, 8          /* mm3 = src3 shifted by one more byte, top byte 0 */  \
}
- // ImgSubandAdd2: D = saturation0(|S1 - S2| + |S1 - S3|)
- // process 8 disparities at a time
// For every 8-byte block of Src1, eight results are produced (k = 0..7):
//   Dest1 + k*imageSize  <-  saturate(|Src1 - (Src2 + k*width)| + |Src1 - (Src3 + k)|)
// i.e. Src2 is stepped by 'width' bytes and Src3 by one byte per step, and each step is
// written to its own output plane, 'imageSize' bytes apart.
// Processes l/8 blocks (ecx = l >> 3); returns 0 when l < 8, otherwise 1.
// NOTE(review): each iteration loads [edx+8], i.e. the 8 bytes following the current
// Src3 block, so Src3 is read up to 8 bytes past the l bytes processed - confirm the
// caller's buffers allow that.
- //
- // Src1: right
- // Src2: top
- // Src3: left
- //
- // TODO? divide the result by 2 (shift)
- inline int ImgSubandAdd2(const unsigned char *Src1, const unsigned char *Src2,
- const unsigned char *Src3,
- unsigned char* Dest1, int l, int imageSize, int width)
- {
- if (l < 8) return 0; // image size must be at least 8 bytes
- const int back_step1 = 7*width; // undoes the 7 'add ebx,width' of one iteration
- const int back_step2 = 7*imageSize; // undoes the 7 'add edi,imageSize' of one iteration
- __asm
- {
- mov eax, Src1
- mov ebx, Src2
- mov edx, Src3
- mov edi, Dest1
- mov ecx, l
- shr ecx, 3 // ecx = number of 8-byte blocks
- movq mm0,[edx] // mm0=src3
- movq mm0,[edx] // mm0=src3 (same load repeated)
- align 16
- inner_loop:
- movq mm1,[eax] // mm1=src1
- movq mm3,mm0 // mm3=src3
- movq mm2,[ebx] // mm2=src2
- add eax,8
- // -- 1 --------- in : mm1,mm2,mm3 out: mm4=SAD mm2=new mm2 --
- movq mm4,mm1 // mm4=mm1
- add ebx,width
- psubusb mm4,mm2 // mm4 = src1 - src2
- //prefetcht0 [ebx + 32 + 2*320]
- movq mm0,[edx+8] // mm0 = next src3 quadword (supplies the bytes shifted in below)
- psubusb mm2,mm1 // mm2 = src2 - src1
- movq mm5,mm1 // mm5=src1
- por mm4,mm2 // mm4=|src1-src2|
- movq mm2,[ebx] // mm2= src2 + 'width' = new src2
- psubusb mm5,mm3 // mm5=src1-src3
- movq mm6,mm3 // mm6=src3
- psubusb mm6,mm1 // mm6=src3-src1
- movq mm7, mm0
- psrlq mm3, 8 // mm3 = src3 + '1' ... with [x00000000] at the end
- por mm6,mm5 // mm6=|src1-src3|
- paddusb mm4,mm6 // mm4 = |src1-src2|+|src1-src3|
- movq [edi], mm4
- psllq mm7, 56 // here mm1=src1 mm2=NEW src2 mm3=begin of NEWsrc3 mm7=end of NEWsrc3
- // -------------------------------------------------------------
- // - 2 ----------------
- _ABS_DIFF_TRI(48)
- // - 3 ----------------
- _ABS_DIFF_TRI(40)
- // - 4 ----------------
- _ABS_DIFF_TRI(32)
- // _ABS_DIFF_TRI_prefetch(32,24 + 3*320)
- // - 5 ----------------
- _ABS_DIFF_TRI(24)
- // - 6 ----------------
- _ABS_DIFF_TRI(16)
- // - 7 ----------------
- _ABS_DIFF_TRI(8)
- // - 8 ----------------
- movq mm4,mm1 // mm4=mm1
- por mm3,mm7 // here mm2=new src2 mm3=new src3
- psubusb mm4,mm2 // mm4 = src1 - src2
- psubusb mm2,mm1 // mm2 = src2 - src1
- movq mm5,mm1 // mm5=src1
- por mm4,mm2 // mm4=|src1-src2|
- psubusb mm5,mm3 // mm5=src1-src3
- psubusb mm3,mm1 // mm3=src3-src1
- por mm3,mm5 // mm3=|src1-src3|
- paddusb mm4,mm3 // mm4 = |src1-src2|+|src1-src3|
- add edi, imageSize
- movq [edi], mm4 // here mm1=src1
- // -------------------------------------------------------------
- // rewind src2/dest to the first row/plane and move on to the next 8-byte block
- sub ebx, back_step1
- add ebx,8
- add edx,8
- sub edi, back_step2
- add edi,8
- dec ecx
- jnz inner_loop
- emms
- }
- return 1;
- }
// _ABS_DIFF_: one unrolled step of a two-image absolute-difference loop.
// Register contract (MSVC inline assembly, MMX):
//   in : mm1 = src1 block, mm2 = src2 block for this step
//   out: ebx advanced by 'width', edi advanced by 'imageSize' ('width'/'imageSize' must
//        be visible where the macro is expanded); |mm1 - mm2| is stored at the
//        *advanced* [edi]; mm2 is reloaded from the advanced [ebx]; mm1 is unchanged.
// Every line of the body ends in a backslash so that the whole __asm block belongs to
// the macro; comments inside the body therefore have to be /* */ comments.
#define _ABS_DIFF_ __asm                                                    \
{                                                                           \
    __asm movq mm4,mm1          /* mm4 = src1 */                            \
    __asm psubusb mm4,mm2       /* mm4 = src1 - src2 (saturating) */        \
    __asm psubusb mm2,mm1       /* mm2 = src2 - src1 (saturating) */        \
    __asm por mm4,mm2           /* mm4 = |src1-src2| */                     \
    __asm add ebx, width        /* next src2 row */                         \
    __asm add edi, imageSize    /* next output plane */                     \
    __asm movq mm2,[ebx]        /* mm2 = new src2 */                        \
    __asm movq [edi], mm4       /* store; mm1 is still src1 */              \
}
- // ImgSubandAdd2: D = saturation0(|S1 - S2| + |S1 - S3|)
- // process 8 disparities at a time
- // Src1: right
- // Src2: top
- // TODO? divide the result by 2 (shift)
- inline int ImgSubandAdd2_Vert(const unsigned char *Src1, const unsigned char *Src2,
- unsigned char* Dest1, int l, int imageSize, int width)
- {
- if (l < 8) return 0; // image size must be at least 8 bytes
- const int back_step1 = 7*width;
- const int back_step2 = 7*imageSize;
- __asm
- {
- mov eax, Src1
- mov ebx, Src2
- mov edi, Dest1
- mov ecx, l
- shr ecx, 3
- align 16
- inner_loop:
- movq mm1,[eax] // mm1=src1
- movq mm2,[ebx] // mm2=src2
- add eax,8
- // -- 1 --------- in : mm1,mm2,mm3 out: mm4=SAD mm2=new mm2 --
- _ABS_DIFF_
- _ABS_DIFF_
- _ABS_DIFF_
- _ABS_DIFF_
- _ABS_DIFF_
- _ABS_DIFF_
- _ABS_DIFF_
- // - 8 ----------------
- movq mm4,mm1 // mm4=mm1
- psubusb mm4,mm2 // mm4 = src1 - src2
- psubusb mm2,mm1 // mm2 = src2 - src1
- por mm4,mm2 // mm2=|src1-src2|
- add edi, imageSize
- movq [edi], mm4 // here mm1=src1
- // -------------------------------------------------------------
- //
- sub ebx, back_step1
- add ebx,8
- sub edi, back_step2
- add edi,8
- dec ecx
- jnz inner_loop
- emms
- }
- return 1;
- }
// _ABS_DIFF_HORIZ(Z): one unrolled step of a horizontal absolute-difference loop.
// Register contract (MSVC inline assembly, MMX):
//   in : mm1 = src1 block, mm3 = src3 block for this step (already shifted),
//        mm0 = next src3 quadword
//   out: edi advanced by 'imageSize' (must be visible where the macro is expanded);
//        [edi] = |mm1 - mm3|; mm3 = old mm3 shifted right by one byte with the top
//        filled from (mm0 << Z), i.e. src3 for the next step; mm1 is unchanged.
// Every line of the body ends in a backslash so that the whole __asm block belongs to
// the macro; comments inside the body therefore have to be /* */ comments.
#define _ABS_DIFF_HORIZ(Z) __asm                                                       \
{                                                                                      \
    __asm movq mm7, mm0         /* mm7 = next src3 quadword */                         \
    __asm add edi, imageSize    /* next output plane */                                \
    __asm movq mm5,mm1          /* mm5 = src1 */                                       \
    __asm psllq mm7, Z          /* keep the bytes needed by the next step */           \
    __asm psubusb mm5,mm3       /* mm5 = src1 - src3 (saturating) */                   \
    __asm movq mm6,mm3          /* mm6 = src3 */                                       \
    __asm psubusb mm6,mm1       /* mm6 = src3 - src1 (saturating) */                   \
    __asm por mm6,mm5           /* mm6 = |src1-src3| */                                \
    __asm movq [edi], mm6       /* store; mm1 is still src1 */                         \
    __asm psrlq mm3, 8          /* mm3 = src3 shifted by one more byte, top byte 0 */  \
    __asm por mm3,mm7           /* mm3 = new src3 */                                   \
}
- // ImgSubandAdd_Horiz: D = |S1 - S3| (right vs. left image only)
- // process 8 disparities at a time
// For every 8-byte block of rightIm, eight results are produced (k = 0..7):
//   Dest + k*imageSize  <-  |rightIm - (leftIm + k)|
// i.e. the left image is stepped by one byte per disparity and each disparity is
// written to its own output plane, 'imageSize' bytes apart.
// Processes l/8 blocks (ecx = l >> 3); returns 0 when l < 8, otherwise 1.
// The 'width' parameter is not used by this function.
// NOTE(review): each iteration loads [edx+8], i.e. the 8 bytes following the current
// leftIm block, so leftIm is read up to 8 bytes past the l bytes processed - confirm
// the caller's buffers allow that.
- //
- // Src1: right
- // Src2: top (not used here)
- // Src3: left
- //
- // TODO? divide the result by 2 (shift)
- inline int ImgSubandAdd_Horiz(const unsigned char *rightIm, const unsigned char *leftIm,
- unsigned char* Dest, int l, int imageSize, int width)
- {
- if (l < 8) return 0; // image size must be at least 8 bytes
- const int back_step2 = 7*imageSize; // undoes the 7 'add edi,imageSize' of one iteration
- __asm
- {
- mov eax, rightIm
- mov edx, leftIm
- mov edi, Dest
- mov ecx, l
- shr ecx, 3 // ecx = number of 8-byte blocks
- movq mm0,[edx] // mm0=src3
- movq mm0,[edx] // mm0=src3 (same load repeated)
- align 16
- inner_loop:
- movq mm1,[eax] // mm1=src1
- movq mm3,mm0 // mm3=src3
- // -- 1 --------- in : mm1,mm3 out: [edi]=|src1-src3| mm3=new src3 --
- movq mm0,[edx+8] // mm0 = next src3 quadword (supplies the bytes shifted in below)
- add eax,8
- movq mm5,mm1 // mm5=src1
- psubusb mm5,mm3 // mm5=src1-src3
- movq mm6,mm3 // mm6=src3
- psubusb mm6,mm1 // mm6=src3-src1
- movq mm7, mm0
- psrlq mm3, 8 // mm3 = src3 + '1' ... with [x00000000] at the end
- por mm6,mm5 // mm6=|src1-src3|
- movq [edi], mm6
- psllq mm7, 56 // here mm1=src1 mm3=begin of NEWsrc3 mm7=end of NEWsrc3
- por mm3,mm7 // here mm3=new src3
- // - 2 .. 7 ----------- each expansion advances edi by imageSize before storing
- _ABS_DIFF_HORIZ(48)
- _ABS_DIFF_HORIZ(40)
- _ABS_DIFF_HORIZ(32)
- _ABS_DIFF_HORIZ(24)
- _ABS_DIFF_HORIZ(16)
- _ABS_DIFF_HORIZ(8)
- // - 8 ----------------
- movq mm5,mm1 // mm5=src1
- add edi, imageSize
- psubusb mm5,mm3 // mm5=src1-src3
- psubusb mm3,mm1 // mm3=src3-src1
- por mm3,mm5 // mm3=|src1-src3|
- movq [edi], mm3
- // -------------------------------------------------------------
- // rewind dest to the first plane and move on to the next 8-byte block
- add edx,8
- sub edi, back_step2
- add edi,8
- dec ecx
- jnz inner_loop
- emms
- }
- return 1;
- }
- // ----------------------
- // FULL IMAGE, BEST ONLY : Keith's code
// Per byte: where CurrentCorrelation <= BestCorrelation (unsigned compare),
// BestCorrelation takes the current value and Disparity takes CurrentDisparity;
// elsewhere both keep their old values. Eight bytes are handled per iteration.
// Returns 0 (nothing done) unless bytecount is a positive multiple of 8, otherwise 1.
// Note that ties replace the stored best/disparity (the mask is built from a strict
// 'current > best' test).
- inline int findMinimumCorrelation_mmx(
- const unsigned char *CurrentCorrelation,
- unsigned char CurrentDisparity,
- unsigned char *Disparity,
- unsigned char *BestCorrelation, int bytecount)
- {
- if ((bytecount < 8) || ((bytecount % 8) != 0)) {
- return 0;
- }
- __asm {
- // load ecx with the pixelblock count = bytecount / 8
- mov ecx, bytecount
- shr ecx, 3
- // setup mm0 with 8 copies of the disparity constant
- mov al, CurrentDisparity
- mov ah, al
- mov bx, ax
- shl eax, 16
- mov ax, bx
- movd mm0, eax
- movd mm1, eax
- punpckldq mm0, mm1
- // setup mm1 with 8 copies of the xor constant for unsigned => signed conversion
- mov eax, 0x80808080
- movd mm1, eax
- movd mm2, eax
- punpckldq mm1, mm2
- // setup the image pointers
- mov eax, BestCorrelation
- mov esi, CurrentCorrelation
- mov edi, Disparity
- pixel_loop:
- movq mm2, [esi] // current correlation
- movq mm3, [eax] // best correlation
- // check for updates
- movq mm5, mm2 // copy the current correlation
- pxor mm5, mm1 // convert from unsigned range to signed range
- movq mm6, mm3 // copy the best correlation
- pxor mm6, mm1 // convert from unsigned range to signed range
- pcmpgtb mm5, mm6 // mm5 := (current signed> best) mask
- // 1 indicates current > best, so keep best
- // 0 indicates current <= best, so use new value
- // BYPASS
- // this phase adds 8 additional instructions, but could skip 2 writes and 1 read
- // abort remainder if not updating best correlation
- pcmpeqb mm6, mm6 // mm6 = all ones
- pxor mm6, mm5 // mm6 = mm5 xor all ones = not mm5
- // 0 indicates current > best, so keep best
- // 1 indicates current <= best, so use new value
- packsswb mm6, mm6 // pack the four mask words into the lower dword of mm6 (signed saturation)
- // the packed dword is zero only when every mask byte is zero:
- // any mask word with a byte set packs to a non-zero byte => some replaced
- // (signed saturation never turns a non-zero word into zero)
- // all mask words zero => 00000000 => no replacements
- // 
- // we don't need to backup ebx because its not used in this routine
- // movd mm7, ebx // make a backup of eax
- movd ebx, mm6 // get the saturated mask
- test ebx, ebx // test ebx => yields 0 iff no substitutions will occur
- // movd ebx, mm7 // restore ebx
- jz bypass // nothing to replace in this block: skip the stores
- // Update best Correlation
- movq mm6, mm5 // mm6 := mask
- movq mm7, mm5 // mm7 := mask
- pand mm6, mm3 // best correlation values to keep
- pandn mm7, mm2 // current correlation value to move to best correlation
- por mm6, mm7 // merge values
- movq [eax], mm6 // store values
- // update disparity
- movq mm2, [edi] // get disparity map
- movq mm6, mm5 // mm6 := mask
- pand mm5, mm2 // select disparity map values to keep
- pandn mm6, mm0 // select current disparity values to move to disparity map
- por mm5, mm6 // merge values
- movq [edi], mm5 // store values
- bypass:
- add eax, 8
- add esi, 8
- add edi, 8
- dec ecx
- jnz pixel_loop
- emms;
- }
- return 1;
- }
- /*int initMinimumCorrelation(
- const unsigned char *CurrentCorrelation,
- unsigned char disparityInit,
- unsigned char *Disparity,
- unsigned char *BestCorrelation,
- unsigned char *SecondCorrelation,
- int bytecount)
- {
- for (int i=0; i<bytecount; ++i)
- {
- BestCorrelation[i]=255;
- SecondCorrelation[i]=255;
- Disparity[i]=0;
- }
- return 0;
- }*/
- inline int initMinimumCorrelation(
- const unsigned char *CurrentCorrelation,
- unsigned char disparityInit,
- unsigned char *Disparity,
- unsigned char *BestCorrelation,
- unsigned char *SecondCorrelation,
- int bytecount)
- {
- if ((bytecount < 8) || ((bytecount % 8) != 0)) {
- return 0;
- }
- __asm {
- // setup mm0 with 8 copies of the disparity constant
- mov al, disparityInit
- mov ah, al
- mov bx, ax
- shl eax, 16
- mov ax, bx
- movd mm0, eax
- movd mm1, eax
- punpckldq mm0, mm1
- // load ecx with the pixelblock count = bytecount / 8
- mov ecx, bytecount
- shr ecx, 3
- mov eax, BestCorrelation
- mov ebx, SecondCorrelation
- mov esi, CurrentCorrelation
- mov edx, Disparity
- pixel_loop:
- movq mm1, [esi]
- movq [eax], mm1 // Best = Current
- movq [ebx], mm1 // Second = Current
- movq [edx], mm0 // Disparity = disparityInit
- add eax, 8
- add ebx, 8
- add edx, 8
- add esi, 8
- dec ecx
- jnz pixel_loop
- jmp done
- done:
- emms;
- }
- }
// Plain C++ reference for the best + second-best minimum search.
// For every byte i in [0, bytecount):
//   - if Current < Best   : Second takes the old Best, Best takes Current and
//                           Disparity takes CurrentDisparity;
//   - else if Current < Second : Second takes Current (the score is not a new best,
//                           but it is better than the recorded runner-up);
//   - otherwise nothing changes.
// The second case keeps SecondCorrelation a true runner-up, as the MMX variant of this
// search does; without it a score lying between Best and Second would be lost.
// Always returns 1.
inline int findMinimumCorrelation(
    const unsigned char *CurrentCorrelation,
    unsigned char CurrentDisparity,
    unsigned char *Disparity,
    unsigned char *BestCorrelation,
    unsigned char *SecondCorrelation,
    int bytecount)
{
    for (int i = 0; i < bytecount; ++i) {
        const unsigned char current = CurrentCorrelation[i];
        if (current < BestCorrelation[i]) {
            // New best: the previous best is pushed down to second place.
            Disparity[i] = CurrentDisparity;
            SecondCorrelation[i] = BestCorrelation[i];
            BestCorrelation[i] = current;
        } else if (current < SecondCorrelation[i]) {
            // Not a new best, but a better runner-up.
            SecondCorrelation[i] = current;
        }
    }
    return 1;
}
- // ----------------------
- // FULL IMAGE, BEST+SECOND .. Keith's code
// Per byte (unsigned compares), eight bytes per iteration:
//   1. if no byte of the block has current <= second, the block is skipped entirely;
//   2. otherwise second' = (current > second) ? second : current;
//   3. if no byte has current <= best, second' is stored and the block is done;
//   4. otherwise, where current <= best: Second = old Best, Best = current,
//      Disparity = CurrentDisparity; elsewhere Second = second' and Best/Disparity
//      keep their values.
// Returns 0 (nothing done) unless bytecount is a positive multiple of 8, otherwise 1.
- inline int findMinimumCorrelation_mmx(
- const unsigned char *CurrentCorrelation,
- unsigned char CurrentDisparity,
- unsigned char *Disparity,
- unsigned char *BestCorrelation,
- unsigned char *SecondCorrelation,
- int bytecount)
- {
- if ((bytecount < 8) || ((bytecount % 8) != 0)) {
- return 0;
- }
- __asm {
- // load ecx with the pixelblock count = bytecount / 8
- mov ecx, bytecount
- shr ecx, 3
- // setup mm0 with 8 copies of the disparity constant
- mov al, CurrentDisparity
- mov ah, al
- mov bx, ax
- shl eax, 16
- mov ax, bx
- movd mm0, eax
- movd mm1, eax
- punpckldq mm0, mm1
- // setup mm1 with 8 copies of the xor constant for unsigned => signed conversion
- mov eax, 0x80808080
- movd mm1, eax
- movd mm2, eax
- punpckldq mm1, mm2
- // setup the image pointers
- mov eax, BestCorrelation
- mov ebx, SecondCorrelation
- mov esi, CurrentCorrelation
- mov edi, Disparity
- pixel_loop:
- movq mm2, [esi] // current correlation
- movq mm4, [ebx] // second correlation
- // convert the current correlation from unsigned range to signed range
- movq mm5, mm2 // copy the current correlation
- pxor mm5, mm1 // convert from unsigned range to signed range
- movq mm7, mm5 // copy converted to mm7
- // check for second correlation updates
- movq mm6, mm4 // copy second best correlation
- pxor mm6, mm1 // convert from unsigned range to signed range
- pcmpgtb mm7, mm6 // mm7 := (current signed> second best) mask
- // BYPASS 1
- // skip remainder if second correlation is not to be updated
- // this phase adds an additional 8 instructions, but it could save 1 memory read and 3 writes
- pcmpeqb mm6, mm6 // mm6 = all ones
- pxor mm6, mm7 // mm6 = mm7 xor all ones = not mm7
- // 0 indicates current > second, so keep old value
- // 1 indicates current <= second, so use new value
- packsswb mm6, mm6 // pack the four mask words into the lower dword of mm6 (signed saturation)
- // the packed dword is zero only when every mask byte is zero:
- // any mask word with a byte set packs to a non-zero byte => some replaced
- // (signed saturation never turns a non-zero word into zero)
- // all mask words zero => 00000000 => no replacements
- // 
- // don't need to backup edx because its not used in this routine
- // movd mm3, edx // make a backup of edx
- movd edx, mm6 // get the saturated mask
- test edx, edx // test edx => yields 0 iff no replacements will occur
- // movd edx, mm3 // restore edx
- jz bypass1
- // direct update second correlation (get values from current)
- // mm7 already has mask
- // movq mm6, mm7 // mm6 := mask
- // pand mm6, mm4 // second correlation values to keep
- // pandn mm7, mm2 // current correlation values to move to second correlation
- // por mm6, mm7 // merge value => direct updated second correlation
- // movq mm4, mm6 // store values (*** this instruction could be eliminated!)
- pand mm4, mm7 // second correlation values to keep
- pandn mm7, mm2 // current correlation values to move to second correlation
- por mm4, mm7 // merge value => direct updated second correlation
- // check for best correlation updates
- movq mm3, [eax] // best correlation
- // mm5 has converted current correlation
- movq mm6, mm3 // copy the best correlation
- pxor mm6, mm1 // convert from unsigned range to signed range
- pcmpgtb mm5, mm6 // mm5 := (current signed> best) mask
- // 1 indicates current > best, so keep best
- // 0 indicates current <= best, so use new value
- // BYPASS 2
- // this phase adds 8 additional instructions, but could skip 2 writes and 1 read
- // abort remainder if not updating best correlation
- pcmpeqb mm6, mm6 // mm6 = all ones
- pxor mm6, mm5 // mm6 = mm5 xor all ones = not mm5
- // 0 indicates current > best, so keep best
- // 1 indicates current <= best, so use new value
- packsswb mm6, mm6 // pack the four mask words into the lower dword of mm6 (signed saturation)
- // the packed dword is zero only when every mask byte is zero:
- // any mask word with a byte set packs to a non-zero byte => some replaced
- // (signed saturation never turns a non-zero word into zero)
- // all mask words zero => 00000000 => no replacements
- // 
- // don't need to backup edx because its not used in this routine
- // movd mm7, edx // make a backup of edx
- movd edx, mm6 // get the saturated mask
- test edx, edx // test edx => yields 0 iff no substitutions will occur
- // movd edx, mm7 // restore edx
- jz bypass2 // no new best: just store mm4 (updated second correlation) to [ebx]
- // indirect update second correlation (pushed down from best)
- movq mm6, mm5 // mm6 := mask
- movq mm7, mm5 // mm7 := mask
- pand mm6, mm4 // second correlation values to keep
- pandn mm7, mm3 // best correlations to move to second correlation
- por mm6, mm7 // merge values
- movq [ebx], mm6 // store values
- // direct Update best Correlation
- movq mm6, mm5 // mm6 := mask
- movq mm7, mm5 // mm7 := mask
- pand mm6, mm3 // best correlation values to keep
- pandn mm7, mm2 // current correlation value to move to best correlation
- por mm6, mm7 // merge values
- movq [eax], mm6 // store values
- // update disparity
- movq mm2, [edi] // get disparity map
- movq mm6, mm5 // mm6 := mask
- pand mm5, mm2 // select disparity map values to keep
- pandn mm6, mm0 // select current disparity values to move to disparity map
- por mm5, mm6 // merge values
- movq [edi], mm5 // store values
- bypass1:
- next_pixel:
- add eax, 8
- add ebx, 8
- add esi, 8
- add edi, 8
- dec ecx
- jnz pixel_loop
- jmp done
- bypass2:
- movq [ebx], mm4;
- jmp next_pixel
- done:
- emms;
- }
- return 1;
- }
- inline void sum_Row(uchar* im, unsigned short* im_out, int rowSize, int maskSize)
- {
- im += maskSize/2;
- im_out += maskSize/2;
- for (int i=0; i<rowSize; ++i) {
- int s=0;
- for (int j=-maskSize/2; j<=maskSize/2; ++j) {
- s+=*(im+j);
- }
- *im_out=s/maskSize;
- ++im;++im_out;
- }
- }
- inline void sum_Row_mmx(uchar* im, unsigned short* im_out, int rowSize, int maskSize)
- {
- sum_Row_5_mmx(im, im_out, rowSize);
- for (int i=0; i<(maskSize-5)/2; ++i)
- sum_Row_5_mmx(im_out, im_out, rowSize);
- }
- inline void sum_Row_mmx(unsigned short* im, unsigned short* im_out, int rowSize, int maskSize)
- {
- sum_Row_5_mmx(im, im_out, rowSize);
- for (int i=0; i<(maskSize-5)/2; ++i)
- sum_Row_5_mmx(im_out, im_out, rowSize);
- }
// aim_Sum_Words_In_MM1: running sums of the four 16-bit words held in mm1.
// With mm1 = [w3 w2 w1 w0] (w0 in the lowest word) on entry, on exit:
//   mm3 = [w0+w1+w2+w3  w0+w1+w2  w0+w1  w0]      (sums growing from the low word)
//   mm4 = [w3  w2+w3  w1+w2+w3  w0+w1+w2+w3]      (sums growing from the high word)
// mm1 and mm2 are used as scratch and are destroyed.
// Every line of the body ends in a backslash so that the whole __asm block belongs to
// the macro; comments inside the body therefore have to be /* */ comments.
#define aim_Sum_Words_In_MM1 __asm                                          \
{                                                                           \
    __asm movq mm4, mm1         /* mm4 = [w3 w2 w1 w0] */                   \
    __asm movq mm2, mm1                                                     \
    __asm movq mm3, mm1         /* mm3 = [w3 w2 w1 w0] */                   \
    __asm psllq mm1, 16         /* mm1 = [w2 w1 w0 0] */                    \
    __asm psrlq mm2, 16         /* mm2 = [0 w3 w2 w1] */                    \
    __asm paddw mm4, mm2                                                    \
    __asm paddw mm3, mm1                                                    \
    __asm psrlq mm2, 16         /* mm2 = [0 0 w3 w2] */                     \
    __asm psllq mm1, 16         /* mm1 = [w1 w0 0 0] */                     \
    __asm paddw mm4, mm2                                                    \
    __asm psrlq mm2, 16         /* mm2 = [0 0 0 w3] */                      \
    __asm paddw mm3, mm1                                                    \
    __asm psllq mm1, 16         /* mm1 = [w0 0 0 0] */                      \
    __asm paddw mm4, mm2                                                    \
    __asm paddw mm3, mm1                                                    \
}
- // apply the mask [1 1 1 1 1] to the 1-D array im (bytes)
- // output : im_out (words)
// Output word k holds the sum of input bytes k-2 .. k+2; the first two output words
// are written as 0. Eight input bytes are consumed per step and 'rowSize' is
// decremented by 8 until it is exactly 0, so rowSize must be 0 or a positive multiple
// of 8 (any other value never satisfies the exit test).
- inline void sum_Row_5_mmx(uchar* im, unsigned short* im_out, int rowSize)
- {
- // temp: for debugging
- //return sum_Row_5(im,im_out,rowSize);
- __asm {
- mov eax, rowSize
- mov ebx, im
- mov ecx, im_out
- pxor mm6, mm6 // mm6 = 0, used to zero-extend bytes to words
- //Process the first quad word, but save only the second result"
- test eax, eax // Is there anything to do?"
- jz end_sum_loop // Jump out if necessary
- //Process low word
- movq mm1, [ebx] // Copy...
- punpcklbw mm1, mm6 // Expand low word bytes into words // mm1 =[D C B A]
- aim_Sum_Words_In_MM1
- //Store the result Only in the accumulator
- movq mm7, mm4 // Update accumulator mm4=[D C+D B+C+D A+B+C+D]
- //Process high word
- movq mm1, [ebx] // Copy...
- punpckhbw mm1, mm6 // Expand high word bytes into words // mm1 =[H G F E]
- add ebx, 8 // Update input pointer
- aim_Sum_Words_In_MM1
- //Add to the previous data ...
- // mm3=[E+F+G+H E+F+G E+F E]
- // mm4=[H G+H F+G+H E+F+G+H]
- paddw mm7, mm3 // The current word of the accum // mm7=[D+E+F+G+H C+D+E+F+G B+C+D+E+F A+B+C+D+E]
- // translate everything to 2 words on the left
- movq mm1, mm7 // mm1 = [D+E+F+G+H C+D+E+F+G B+C+D+E+F A+B+C+D+E]
- psrlq mm1, 32 // mm1 = [0 0 D+E+F+G+H C+D+E+F+G]
- movq mm0, mm1 // mm0 = [0 0 D+E+F+G+H C+D+E+F+G], carried into the first loop pass
- psllq mm7, 32 // mm7 = [B+C+D+E+F A+B+C+D+E 0 0]
- movq [ecx], mm7 // Store the final result
- add ecx, 8 // Update output pointer
- movq mm7, mm4 // Update accumulator mm4=[H G+H F+G+H E+F+G+H]
- sub eax, 8 // Update the number of points left
- // Start the loop
- row_sum_loop:
- test eax, eax // Is there anything to do?
- jz end_sum_loop // Jump out if necessary
- movq mm1, [ebx] // Load data
- //Process low word
- punpcklbw mm1, mm6 // Expand low word bytes into words
- aim_Sum_Words_In_MM1
- //Add to the previous data
- //prefetcht1 [ecx+16]
- paddw mm7, mm3 // The current word of the accum
- // translate everything to 2 words on the left
- // mm0 = [0 0 D C] mm7 = [H G F E] ----> mm7=[0 0 H G] [ecx]=[F E D C]
- punpckldq mm0, mm7 // mm0 = [F E D C]
- movq [ecx], mm0
- sub eax, 8 // Update the number of points left
- movq mm0, mm4 // Update accumulator
- psrlq mm7, 32 // mm7 = [0 0 H G]
- //Process high word
- movq mm1, [ebx] // Copy...
- punpckhbw mm1, mm6 // Expand high word bytes into words
- aim_Sum_Words_In_MM1
- //Add to the previous data
- paddw mm0, mm3 // The current word of the accum
- // translate everything to 2 words on the left
- // mm7 = [0 0 D C] mm0 = [H G F E] ----> mm0=[0 0 H G] [ecx+8]=[F E D C]
- punpckldq mm7, mm0 // mm7 = [F E D C]
- add ebx, 8 // Update input pointer
- movq [ecx+8], mm7
- psrlq mm0, 32 // mm0 = [0 0 H G]
- movq mm7, mm4 // Update accumulator
- add ecx, 16 // Update output pointer
- jmp row_sum_loop // Loop
- //Cleanup
- end_sum_loop:
- emms
- }
- }
- // apply the mask (1/4)*[1 1 1 1 1] to the 1-D array im (words)
- // output : im_out (words)
// Output word k is built from input words k-2 .. k+2; the first two output words are
// written as 0. Eight input words are consumed per step and 'rowSize' is decremented
// by 8 until it is exactly 0, so rowSize must be 0 or a positive multiple of 8 (any
// other value never satisfies the exit test).
// NOTE(review): the division by 4 (psrlw ..., 2) is applied only inside the loop; the
// sums produced by the start-up section before row_sum_loop (output words 2..5) are
// stored without it - confirm whether that is intended.
- inline void sum_Row_5_mmx(ushort* im, ushort* im_out, int rowSize)
- {
- // temp: for debugging
- //return sum_Row_5(im,im_out,rowSize);
- __asm {
- mov eax, rowSize
- mov ebx, im
- mov ecx, im_out
- //Process the first quad word, but save only the second result"
- test eax, eax // Is there anything to do?"
- jz end_sum_loop // Jump out if necessary
- movq mm1, [ebx] // Load data (4 words)
- add ebx, 8 // Update input pointer
- //Process low word
- aim_Sum_Words_In_MM1
- //Store the result Only in the accumulator
- movq mm7, mm4 // Update accumulator
- //Process high word
- movq mm1, [ebx] // Copy...
- aim_Sum_Words_In_MM1
- add ebx, 8
- //Add to the previous data
- paddw mm7, mm3 // The current word of the accum
- // translate everything to 2 words on the left
- movq mm1, mm7 // mm1 = [D+E+F+G+H C+D+E+F+G B+C+D+E+F A+B+C+D+E]
- psrlq mm1, 32 // mm1 = [0 0 D+E+F+G+H C+D+E+F+G]
- movq mm0, mm1 // mm0 = [0 0 D+E+F+G+H C+D+E+F+G]
- psllq mm7, 32 // mm7 = [B+C+D+E+F A+B+C+D+E 0 0]
- movq [ecx], mm7 // Store the final result
- movq mm7, mm4 // Update accumulator
- add ecx, 8 // Update output pointer
- sub eax, 8 // Update the number of points left
- // Start the loop
- row_sum_loop:
- test eax, eax // Is there anything to do?
- jz end_sum_loop // Jump out if necessary
- movq mm1, [ebx] // Load data
- aim_Sum_Words_In_MM1
- //Add to the previous data
- //prefetcht0 [ecx + 32]
- //prefetcht0 [ebx + 48]
- paddw mm7, mm3 // The current word of the accum
- psrlw mm7, 2 // divide result by 4
- // translate everything to 2 words on the left
- // mm0 = [0 0 D C] mm7 = [H G F E] ----> mm7 =[0 0 H G] [ecx]=[F E D C]
- punpckldq mm0, mm7 // mm0 = [F E D C]
- movq [ecx], mm0
- sub eax, 8 // Update the number of points left
- movq mm0, mm4 // Update accumulator
- psrlq mm7, 32 // mm7 =[0 0 H G]
- //Process high word
- movq mm1, [ebx+8] // Copy...
- aim_Sum_Words_In_MM1
- //Add to the previous data
- paddw mm0, mm3 // The current word of the accum
- psrlw mm0, 2 // divide result by 4
- // translate everything to 2 words on the left
- // mm7 = [0 0 D C] mm0 = [H G F E] ----> mm0=[0 0 H G] [ecx+8]=[F E D C]
- punpckldq mm7, mm0 // mm7 = [F E D C]
- add ebx, 16 // Update input pointer
- movq [ecx+8], mm7
- psrlq mm0, 32 // mm0 = [0 0 H G]
- movq mm7, mm4 // Update accumulator
- add ecx, 16 // Update output pointer
- jmp row_sum_loop // Loop
- //Cleanup
- end_sum_loop:
- emms
- }
- }
- template<class T> void sum_Row_5(T* im, ushort* im_out, int rowSize)
- {
- im += 2;
- im_out +=2;
- int s = 0;
- for (int i=0; i<rowSize-5; ++i, ++im, ++im_out) {
- s = *(im-2);
- s += *(im-1);
- s += *(im);
- s += *(im+1);
- s += *(im+2);
- *im_out = s/5;
- }
- }
- inline void avg_Col(ushort* im, uchar* im_out, int dataSize, int width, int sizeMask)
- {
- int offset = width*(sizeMask/2);
- im += offset;
- im_out += offset;
- for (int i=0; i<dataSize-width*sizeMask; ++i, ++im, ++im_out) {
- int s = 0;
- for (int j=-sizeMask/2; j<=sizeMask/2; ++j) s += *(im+j*width);
- *im_out = s/(sizeMask);
- }
- }
- // Applies a vertical box filter of about 'sizeMask' rows to 'im', result in 'im_out'.
- // Dispatches to the fixed-size MMX helpers avg_Col_5 ... avg_Col_17.
- // Those helpers divide by a power of two (shift by 3 for 5/7/9 rows, shift by 4 for
- // 11..17 rows), not by sizeMask itself, so the result is only an approximate mean.
- // Supported sizes are the odd values 5..17:
- //   - an even sizeMask is rounded up to the next odd value (previously even values
- //     between 6 and 16 matched no case and silently left im_out unwritten),
- //   - values below 5 use the 5-row filter, values above 17 use the 17-row filter.
- inline void avg_Col_mmx(ushort* im, uchar* im_out, int dataSize, int width, int sizeMask)
- {
-     // temp: for debugging
-     //return avg_Col(im,im_out,dataSize,width,sizeMask);
-     int rows = sizeMask | 1;            // even -> next odd, odd unchanged
-     if (rows < 5)  rows = 5;
-     if (rows > 17) rows = 17;
-     switch (rows)
-     {
-     case 5:  avg_Col_5(im,im_out,dataSize,width);
-         break;
-     case 7:  avg_Col_7(im,im_out,dataSize,width);
-         break;
-     case 9:  avg_Col_9(im,im_out,dataSize,width);
-         break;
-     case 11: avg_Col_11(im,im_out,dataSize,width);
-         break;
-     case 13: avg_Col_13(im,im_out,dataSize,width);
-         break;
-     case 15: avg_Col_15(im,im_out,dataSize,width);
-         break;
-     default: avg_Col_17(im,im_out,dataSize,width);   // rows == 17
-         break;
-     }
- }
- // One accumulation step of the column filters: adds (unsigned, saturating) the 8 words
- // at [edx] to the accumulators mm3 (words 0..3) and mm2 (words 4..7), then moves edx
- // down one row (edi holds the row stride in bytes).
- // Every line of the macro must end with a line continuation; without them the macro
- // expands to a bare __asm and the braces are left as stray file-scope code.
- #define macro_add __asm \
- { \
-     __asm paddusw mm3, [edx] \
-     __asm paddusw mm2, [edx+8] \
-     __asm add edx, edi \
- }
- // Vertical 5-row box filter (MMX).
- // For each output index i: adds, with unsigned saturation, the five words
- // im[i-2*width], im[i-width], im[i], im[i+width], im[i+2*width], shifts the sum right
- // by 3 (a division by 8, not by 5) and stores it in im_out[i] as a saturated byte.
- // Eight outputs are produced per pass. Only complete groups of 8 are processed, so a
- // remainder of dataSize%8 elements is left untouched; the original loop only stopped
- // when the counter hit exactly 0 and ran past the buffers otherwise.
- // Reads start two rows before 'im' and end two rows after the processed range.
- // Needs the macro_add helper macro.
- inline void avg_Col_5(ushort* im, uchar* im_out, int dataSize, int width)
- {
-     const ushort* firstRow = im - 2*width;   // top row of the 5-row window
-     __asm {
-         mov edi, width
-         shl edi, 1              // edi = row stride in bytes (2*width)
-         mov eax, dataSize       // eax = outputs still to produce
-         mov ecx, im_out
-         mov ebx, firstRow
-     row_sum_loop:
-         cmp eax, 8              // stop when fewer than 8 outputs remain
-         jl end_sum_loop
-         mov edx, ebx            // edx walks down the rows of the window
-         add ebx, 16             // next group of 8 input words
-         movq mm3, [edx]         // mm3 = words 0..3 of the top row
-         movq mm2, [edx+8]       // mm2 = words 4..7 of the top row
-         add edx, edi
-         // add the remaining 4 rows
-         macro_add
-         macro_add
-         macro_add
-         macro_add
-         psrlw mm3, 3            // divide by 8
-         psrlw mm2, 3
-         packuswb mm3, mm2       // 8 words -> 8 saturated bytes
-         movq [ecx], mm3
-         sub eax, 8              // 8 outputs done
-         add ecx, 8              // advance output pointer
-         jmp row_sum_loop
-     end_sum_loop:
-         emms
-     }
- }
- // Vertical 7-row box filter (MMX).
- // For each output index i: adds, with unsigned saturation, the seven words
- // im[i-3*width] ... im[i+3*width], shifts the sum right by 3 (a division by 8,
- // not by 7) and stores it in im_out[i] as a saturated byte.
- // Eight outputs are produced per pass. Only complete groups of 8 are processed, so a
- // remainder of dataSize%8 elements is left untouched; the original loop only stopped
- // when the counter hit exactly 0 and ran past the buffers otherwise.
- // Reads start three rows before 'im' and end three rows after the processed range.
- // Needs the macro_add helper macro.
- inline void avg_Col_7(ushort* im, uchar* im_out, int dataSize, int width)
- {
-     const ushort* firstRow = im - 3*width;   // top row of the 7-row window
-     __asm {
-         mov edi, width
-         shl edi, 1              // edi = row stride in bytes (2*width)
-         mov eax, dataSize       // eax = outputs still to produce
-         mov ecx, im_out
-         mov ebx, firstRow
-     row_sum_loop:
-         cmp eax, 8              // stop when fewer than 8 outputs remain
-         jl end_sum_loop
-         mov edx, ebx            // edx walks down the rows of the window
-         add ebx, 16             // next group of 8 input words
-         movq mm3, [edx]         // mm3 = words 0..3 of the top row
-         movq mm2, [edx+8]       // mm2 = words 4..7 of the top row
-         add edx, edi
-         // add the remaining 6 rows
-         macro_add
-         macro_add
-         macro_add
-         macro_add
-         macro_add
-         macro_add
-         psrlw mm3, 3            // divide by 8
-         psrlw mm2, 3
-         packuswb mm3, mm2       // 8 words -> 8 saturated bytes
-         movq [ecx], mm3
-         sub eax, 8              // 8 outputs done
-         add ecx, 8              // advance output pointer
-         jmp row_sum_loop
-     end_sum_loop:
-         emms
-     }
- }
- // Vertical 9-row box filter (MMX).
- // For each output index i: adds, with unsigned saturation, the nine words
- // im[i-4*width] ... im[i+4*width], shifts the sum right by 3 (a division by 8,
- // not by 9) and stores it in im_out[i] as a saturated byte.
- // Eight outputs are produced per pass. Only complete groups of 8 are processed, so a
- // remainder of dataSize%8 elements is left untouched; the original loop only stopped
- // when the counter hit exactly 0 and ran past the buffers otherwise.
- // Reads start four rows before 'im' and end four rows after the processed range.
- // Needs the macro_add helper macro.
- inline void avg_Col_9(ushort* im, uchar* im_out, int dataSize, int width)
- {
-     const ushort* firstRow = im - 4*width;   // top row of the 9-row window
-     __asm {
-         mov edi, width
-         shl edi, 1              // edi = row stride in bytes (2*width)
-         mov eax, dataSize       // eax = outputs still to produce
-         mov ecx, im_out
-         mov ebx, firstRow
-     row_sum_loop:
-         cmp eax, 8              // stop when fewer than 8 outputs remain
-         jl end_sum_loop
-         mov edx, ebx            // edx walks down the rows of the window
-         add ebx, 16             // next group of 8 input words
-         movq mm3, [edx]         // mm3 = words 0..3 of the top row
-         movq mm2, [edx+8]       // mm2 = words 4..7 of the top row
-         add edx, edi
-         // add the remaining 8 rows
-         macro_add
-         macro_add
-         macro_add
-         macro_add
-         macro_add
-         macro_add
-         macro_add
-         macro_add
-         psrlw mm3, 3            // divide by 8
-         psrlw mm2, 3
-         packuswb mm3, mm2       // 8 words -> 8 saturated bytes
-         movq [ecx], mm3
-         sub eax, 8              // 8 outputs done
-         add ecx, 8              // advance output pointer
-         jmp row_sum_loop
-     end_sum_loop:
-         emms
-     }
- }
- // Vertical 11-row box filter (MMX).
- // For each output index i: adds, with unsigned saturation, the eleven words
- // im[i-5*width] ... im[i+5*width], shifts the sum right by 4 (a division by 16,
- // not by 11) and stores it in im_out[i] as a saturated byte.
- // Eight outputs are produced per pass. Only complete groups of 8 are processed, so a
- // remainder of dataSize%8 elements is left untouched; the original loop only stopped
- // when the counter hit exactly 0 and ran past the buffers otherwise.
- // Reads start five rows before 'im' and end five rows after the processed range.
- // Needs the macro_add helper macro.
- inline void avg_Col_11(ushort* im, uchar* im_out, int dataSize, int width)
- {
-     const ushort* firstRow = im - 5*width;   // top row of the 11-row window
-     __asm {
-         mov edi, width
-         shl edi, 1              // edi = row stride in bytes (2*width)
-         mov eax, dataSize       // eax = outputs still to produce
-         mov ecx, im_out
-         mov ebx, firstRow
-     row_sum_loop:
-         cmp eax, 8              // stop when fewer than 8 outputs remain
-         jl end_sum_loop
-         mov edx, ebx            // edx walks down the rows of the window
-         add ebx, 16             // next group of 8 input words
-         movq mm3, [edx]         // mm3 = words 0..3 of the top row
-         movq mm2, [edx+8]       // mm2 = words 4..7 of the top row
-         add edx, edi
-         // add the remaining 10 rows
-         macro_add
-         macro_add
-         macro_add
-         macro_add
-         macro_add
-         macro_add
-         macro_add
-         macro_add
-         macro_add
-         macro_add
-         psrlw mm3, 4            // divide by 16
-         psrlw mm2, 4
-         packuswb mm3, mm2       // 8 words -> 8 saturated bytes
-         movq [ecx], mm3
-         sub eax, 8              // 8 outputs done
-         add ecx, 8              // advance output pointer
-         jmp row_sum_loop
-     end_sum_loop:
-         emms
-     }
- }
- // Vertical 13-row box filter (MMX).
- // For each output index i: adds, with unsigned saturation, the thirteen words
- // im[i-6*width] ... im[i+6*width], shifts the sum right by 4 (a division by 16,
- // not by 13) and stores it in im_out[i] as a saturated byte.
- // Eight outputs are produced per pass. Only complete groups of 8 are processed, so a
- // remainder of dataSize%8 elements is left untouched; the original loop only stopped
- // when the counter hit exactly 0 and ran past the buffers otherwise.
- // Reads start six rows before 'im' and end six rows after the processed range.
- // Needs the macro_add helper macro.
- inline void avg_Col_13(ushort* im, uchar* im_out, int dataSize, int width)
- {
-     const ushort* firstRow = im - 6*width;   // top row of the 13-row window
-     __asm {
-         mov edi, width
-         shl edi, 1              // edi = row stride in bytes (2*width)
-         mov eax, dataSize       // eax = outputs still to produce
-         mov ecx, im_out
-         mov ebx, firstRow
-     row_sum_loop:
-         cmp eax, 8              // stop when fewer than 8 outputs remain
-         jl end_sum_loop
-         mov edx, ebx            // edx walks down the rows of the window
-         add ebx, 16             // next group of 8 input words
-         movq mm3, [edx]         // mm3 = words 0..3 of the top row
-         movq mm2, [edx+8]       // mm2 = words 4..7 of the top row
-         add edx, edi
-         // add the remaining 12 rows
-         macro_add
-         macro_add
-         macro_add
-         macro_add
-         macro_add
-         macro_add
-         macro_add
-         macro_add
-         macro_add
-         macro_add
-         macro_add
-         macro_add
-         psrlw mm3, 4            // divide by 16
-         psrlw mm2, 4
-         packuswb mm3, mm2       // 8 words -> 8 saturated bytes
-         movq [ecx], mm3
-         sub eax, 8              // 8 outputs done
-         add ecx, 8              // advance output pointer
-         jmp row_sum_loop
-     end_sum_loop:
-         emms
-     }
- }
- // Vertical 15-row box filter (MMX).
- // For each output index i: adds, with unsigned saturation, the fifteen words
- // im[i-7*width] ... im[i+7*width], shifts the sum right by 4 (a division by 16,
- // not by 15) and stores it in im_out[i] as a saturated byte.
- // Eight outputs are produced per pass. Only complete groups of 8 are processed, so a
- // remainder of dataSize%8 elements is left untouched; the original loop only stopped
- // when the counter hit exactly 0 and ran past the buffers otherwise.
- // Reads start seven rows before 'im' and end seven rows after the processed range.
- // Needs the macro_add helper macro.
- inline void avg_Col_15(ushort* im, uchar* im_out, int dataSize, int width)
- {
-     const ushort* firstRow = im - 7*width;   // top row of the 15-row window
-     __asm {
-         mov edi, width
-         shl edi, 1              // edi = row stride in bytes (2*width)
-         mov eax, dataSize       // eax = outputs still to produce
-         mov ecx, im_out
-         mov ebx, firstRow
-     row_sum_loop:
-         cmp eax, 8              // stop when fewer than 8 outputs remain
-         jl end_sum_loop
-         mov edx, ebx            // edx walks down the rows of the window
-         add ebx, 16             // next group of 8 input words
-         movq mm3, [edx]         // mm3 = words 0..3 of the top row
-         movq mm2, [edx+8]       // mm2 = words 4..7 of the top row
-         add edx, edi
-         // add the remaining 14 rows
-         macro_add
-         macro_add
-         macro_add
-         macro_add
-         macro_add
-         macro_add
-         macro_add
-         macro_add
-         macro_add
-         macro_add
-         macro_add
-         macro_add
-         macro_add
-         macro_add
-         psrlw mm3, 4            // divide by 16
-         psrlw mm2, 4
-         packuswb mm3, mm2       // 8 words -> 8 saturated bytes
-         movq [ecx], mm3
-         sub eax, 8              // 8 outputs done
-         add ecx, 8              // advance output pointer
-         jmp row_sum_loop
-     end_sum_loop:
-         emms
-     }
- }
- inline void avg_Col_17(ushort* im, uchar* im_out, int dataSize, int width)
- {
- __asm {
- mov edi, width
- shl edi, 1 // edi = 2*width
- mov eax, dataSize
- mov ecx, im_out
- mov ebx, im
- sub ebx, edi
- sub ebx, edi
- sub ebx, edi
- sub ebx, edi
- sub ebx, edi
- sub ebx, edi
- sub ebx, edi
- sub ebx, edi // ebx = ebx-16*width
- test eax, eax // Is there anything to do?"
- jz end_sum_loop // Jump out if necessary
- row_sum_loop:
- test eax, eax // Is there anything to do?
- jz end_sum_loop // Jump out if necessary
- mov edx, ebx
- add ebx, 16
- // 1
- movq mm3, [edx] // mm3 = 4 words of im
- movq mm2, [edx+8] // mm2 = next 4 words of im
- add edx, edi
- macro_add
- macro_add
- macro_add
- macro_add
- macro_add
- macro_add
- macro_add
- macro_add
- macro_add
- macro_add
- macro_add
- macro_add
- macro_add
- macro_add
- macro_add
- macro_add
- // divide results by ...
- psrlw mm3, 4
- psrlw mm2, 4
- // convert [mm2 mm3] as 8 bytes
- packuswb mm3,mm2
- movq [ecx], mm3
- sub eax, 8 // Update the number of points left
- add ecx, 8 // Update output pointer
- jmp row_sum_loop // Loop
- //Cleanup
- end_sum_loop:
- emms
- }
- }
- // Vertical 5-row sum (MMX), word input, byte output.
- // For each output index i: adds, with unsigned saturation, the five words
- // im[i-2*width] ... im[i+2*width] and stores the sum in im_out[i] as a byte.
- // The narrowing uses packuswb, which reads the words as signed: sums above 255
- // saturate to 255, but a sum of 0x8000 or more would be stored as 0.
- // Eight outputs are produced per pass. Only complete groups of 8 are processed, so a
- // remainder of dataSize%8 elements is left untouched; the original loop only stopped
- // when the counter hit exactly 0 and ran past the buffers otherwise.
- // Needs the macro_add helper macro.
- inline void add_Col_5_wb(ushort* im, uchar* im_out, int dataSize, int width)
- {
-     const ushort* firstRow = im - 2*width;   // top row of the 5-row window
-     __asm {
-         mov edi, width
-         shl edi, 1              // edi = row stride in bytes (2*width)
-         mov eax, dataSize       // eax = outputs still to produce
-         mov ecx, im_out
-         mov ebx, firstRow
-     row_sum_loop:
-         cmp eax, 8              // stop when fewer than 8 outputs remain
-         jl end_sum_loop
-         mov edx, ebx            // edx walks down the rows of the window
-         add ebx, 16             // next group of 8 input words
-         movq mm3, [edx]         // mm3 = words 0..3 of the top row
-         movq mm2, [edx+8]       // mm2 = words 4..7 of the top row
-         add edx, edi
-         // add the remaining 4 rows
-         macro_add
-         macro_add
-         macro_add
-         macro_add
-         packuswb mm3, mm2       // 8 words -> 8 saturated bytes
-         movq [ecx], mm3
-         sub eax, 8              // 8 outputs done
-         add ecx, 8              // advance output pointer (bytes)
-         jmp row_sum_loop
-     end_sum_loop:
-         emms
-     }
- }
- // Vertical 5-row sum (MMX), word input, word output.
- // For each output index i: adds, with unsigned saturation at 65535, the five words
- // im[i-2*width] ... im[i+2*width] and stores the sum in im_out[i].
- // Eight outputs are produced per pass. Only complete groups of 8 are processed, so a
- // remainder of dataSize%8 elements is left untouched; the original loop only stopped
- // when the counter hit exactly 0 and ran past the buffers otherwise.
- // Needs the macro_add helper macro.
- inline void add_Col_5_ww(ushort* im, ushort* im_out, int dataSize, int width)
- {
-     const ushort* firstRow = im - 2*width;   // top row of the 5-row window
-     __asm {
-         mov edi, width
-         shl edi, 1              // edi = row stride in bytes (2*width)
-         mov eax, dataSize       // eax = outputs still to produce
-         mov ecx, im_out
-         mov ebx, firstRow
-     row_sum_loop:
-         cmp eax, 8              // stop when fewer than 8 outputs remain
-         jl end_sum_loop
-         mov edx, ebx            // edx walks down the rows of the window
-         add ebx, 16             // next group of 8 input words
-         movq mm3, [edx]         // mm3 = words 0..3 of the top row
-         movq mm2, [edx+8]       // mm2 = words 4..7 of the top row
-         add edx, edi
-         // add the remaining 4 rows
-         macro_add
-         macro_add
-         macro_add
-         macro_add
-         movq [ecx], mm3         // store the 8 sums as words
-         movq [ecx+8], mm2
-         sub eax, 8              // 8 outputs done
-         add ecx, 16             // advance output pointer (8 words)
-         jmp row_sum_loop
-     end_sum_loop:
-         emms
-     }
- }
- // Invalidates disparities whose best and second-best scores are too close.
- // For each byte i (8 bytes per pass):
- //     diff = secondScores[i] - bestScores[i], clamped at 0 (unsigned saturation)
- //     disp[i] is kept when diff > thresh, otherwise it is set to undefined_val.
- // Only complete groups of 8 are processed, so a remainder of dataSize%8 bytes is left
- // untouched; the original loop only stopped when the counter hit exactly 0 and ran
- // past the buffers otherwise.
- // NOTE(review): pcmpgtb compares signed bytes, so a diff of 128 or more is seen as
- // negative and the disparity is replaced - confirm that score differences stay below 128.
- inline void compareBestAndSecond(uchar* bestScores, uchar* secondScores, char thresh,
-                                  uchar undefined_val,
-                                  uchar* disp, int dataSize)
- {
-     __asm {
-         // mm0 = 8 copies of 'thresh'
-         mov al, thresh
-         mov ah, al
-         mov bx, ax
-         shl eax, 16
-         mov ax, bx
-         movd mm0, eax
-         movd mm1, eax
-         punpckldq mm0, mm1
-         // mm7 = 8 copies of 'undefined_val'
-         mov al, undefined_val
-         mov ah, al
-         mov bx, ax
-         shl eax, 16
-         mov ax, bx
-         movd mm7, eax
-         movd mm1, eax
-         punpckldq mm7, mm1
-         mov eax, dataSize       // eax = bytes still to process
-         mov ebx, bestScores
-         mov ecx, secondScores
-         mov edx, disp
-     comp_loop:
-         cmp eax, 8              // stop when fewer than 8 bytes remain
-         jl end_loop
-         movq mm2, [ecx]
-         psubusb mm2, [ebx]      // mm2 = secondScores - bestScores (saturating)
-         movq mm3, [edx]         // mm3 = disp
-         pcmpgtb mm2, mm0        // mm2 = xFF where diff > thresh (signed), x00 otherwise
-         pand mm3, mm2           // keep disp where the mask is set
-         pandn mm2, mm7          // undefined_val where the mask is clear
-         por mm3, mm2
-         movq [edx], mm3
-         sub eax, 8              // 8 bytes done
-         add ebx, 8
-         add ecx, 8
-         add edx, 8
-         jmp comp_loop
-     end_loop:
-         emms
-     }
- }
- // Copies the windowWidth x windowHeight window whose top-left corner is (x0, y0) of
- // the source image (row length 'width') into imDest, rows stored back to back.
- // Rows are copied 8 bytes at a time: windowWidth must be a multiple of 8.
- // Nothing is copied when windowWidth < 8 or windowHeight <= 0; the original let the
- // loop counters wrap around in those cases. 'height' is not used.
- inline void cropImage(const uchar* imSrc, int width, int height,
-                       uchar* imDest, int x0, int y0, int windowWidth, int windowHeight)
- {
-     int w8 = windowWidth/8;                  // 8-byte groups per window row
-     if (w8 <= 0 || windowHeight <= 0) return;
-     int step = width-windowWidth;            // bytes skipped in the source after each row
-     const uchar* srcNewOrigin = imSrc+x0+y0*width;
-     __asm {
-         mov ecx, windowHeight   // ecx = rows left
-         mov eax, srcNewOrigin
-         mov ebx, imDest
-     row_loop:
-         mov edx, w8             // edx = 8-byte groups left in this row
-     col_loop:
-         movq mm1, [eax]
-         movq [ebx], mm1
-         add eax, 8
-         add ebx, 8
-         dec edx
-         jnz col_loop
-         add eax, step           // jump to the start of the next source row
-         dec ecx
-         jnz row_loop
-         emms
-     }
- }
- // Returns the average value of the imageSize bytes starting at 'im'.
- // Complete groups of 8 bytes are summed with MMX and the remaining imageSize%8 bytes
- // with a scalar loop. The original skipped that remainder but still divided by
- // imageSize, and wrapped its loop counter when imageSize < 8.
- // Returns 0 when imageSize <= 0. The sum is accumulated in 32-bit integers.
- inline float pixelMean(const uchar* im, int imageSize)
- {
-     if (imageSize <= 0) return 0.0f;
-     int sum = 0;
-     const int blocks = imageSize >> 3;       // complete 8-byte groups
-     if (blocks > 0) {
-         __asm {
-             mov ecx, blocks
-             mov eax, im
-             pxor mm7, mm7       // mm7 = two 32-bit accumulators
-             pxor mm0, mm0       // mm0 = 0, used to widen
-         pixel_loop:
-             movq mm1, [eax]
-             movq mm2, mm1
-             punpcklbw mm2, mm0  // bytes 0..3 -> words
-             punpckhbw mm1, mm0  // bytes 4..7 -> words
-             paddw mm2, mm1      // 4 word partial sums
-             movq mm1, mm2
-             punpcklwd mm2, mm0  // words -> dwords
-             punpckhwd mm1, mm0
-             paddd mm2, mm1
-             paddd mm7, mm2
-             add eax, 8
-             dec ecx
-             jnz pixel_loop
-             // fold the two accumulators into 'sum'
-             movd ebx, mm7
-             psrlq mm7, 32
-             movd edx, mm7
-             add ebx, edx
-             mov sum, ebx
-             emms
-         }
-     }
-     for (int i = blocks*8; i < imageSize; ++i) sum += im[i];
-     return sum / (float)imageSize;
- }
- // -------------------------------------------------------------
- // apply mask:
- //     if mask[i] == undefined_val, im[i] is left unchanged
- //     otherwise,                   im[i] = mask[i]
- // Complete groups of 8 bytes are handled with MMX, the remaining imageSize%8 bytes
- // with a scalar loop.
- // The original OR-ed the mask into the kept pixels, so with a non-zero undefined_val
- // the unchanged pixels came out as im[i] | undefined_val; the selection below is exact.
- inline void overrideImageMMX(uchar* im, const uchar* mask, uchar undefined_val, int imageSize)
- {
-     if (imageSize <= 0) return;
-     const int blocks = imageSize >> 3;       // complete 8-byte groups
-     if (blocks > 0) {
-         __asm {
-             // mm0 = 8 copies of 'undefined_val'
-             mov al, undefined_val
-             mov ah, al
-             mov bx, ax
-             shl eax, 16
-             mov ax, bx
-             movd mm0, eax
-             movd mm1, eax
-             punpckldq mm0, mm1
-             mov ecx, blocks
-             mov eax, im
-             mov ebx, mask
-         pixel_loop:
-             movq mm1, [eax]     // mm1 = im
-             movq mm2, [ebx]     // mm2 = mask
-             movq mm3, mm2
-             pcmpeqb mm3, mm0    // mm3 = xFF where mask == undefined_val, x00 otherwise
-             pand mm1, mm3       // im where the mask is undefined
-             pandn mm3, mm2      // mask where the mask is defined
-             por mm3, mm1
-             movq [eax], mm3
-             add eax, 8
-             add ebx, 8
-             dec ecx
-             jnz pixel_loop
-             emms
-         }
-     }
-     for (int i = blocks*8; i < imageSize; ++i) {
-         if (mask[i] != undefined_val) im[i] = mask[i];
-     }
- }
- // Scalar mask application: every pixel whose mask value differs from undefined_val
- // is replaced by that mask value; the other pixels of 'im' are left unchanged.
- inline void overrideImage(uchar* im, const uchar* mask, uchar undefined_val, int imageSize)
- {
-     for (int k = 0; k < imageSize; ++k) {
-         const uchar m = mask[k];
-         if (m == undefined_val) continue;
-         im[k] = m;
-     }
- }
- // Element-wise integer division: result[k] = im[k] / div[k], or 0 where div[k] is 0.
- // The quotient is narrowed to uchar with a plain cast, so only its low 8 bits are kept.
- inline void divide( ushort* im, uchar* div, uchar* result, int imageSize)
- {
-     for (int k = 0; k < imageSize; ++k) {
-         const uchar d = div[k];
-         if (d != 0) result[k] = (uchar)(im[k] / d);
-         else        result[k] = 0;
-     }
- }
- // 5x5 sum filter, word output.
- // Runs the horizontal pass sum_Row_5_mmx from 'im' into the scratch buffer 'buff',
- // then the vertical pass add_Col_5_ww from 'buff' into 'im_out'. The vertical pass
- // skips the first two rows and is given dataSize - 4*width elements.
- inline void sum_5x5_mmx( uchar* im, ushort* im_out, int dataSize, int width, ushort* buff)
- {
-     const int border = 2*width;              // two rows
-     sum_Row_5_mmx(im, buff, dataSize);
-     add_Col_5_ww(buff + border, im_out + border, dataSize - 2*border, width);
- }
- // 5x5 sum filter, byte output.
- // Runs the horizontal pass sum_Row_5_mmx from 'im' into the scratch buffer 'buff',
- // then the vertical pass add_Col_5_wb from 'buff' into 'im_out'. The vertical pass
- // skips the first two rows and is given dataSize - 4*width elements.
- inline void sum_5x5_mmx( uchar* im, uchar* im_out, int dataSize, int width, ushort* buff)
- {
-     const int border = 2*width;              // two rows
-     sum_Row_5_mmx(im, buff, dataSize);
-     add_Col_5_wb(buff + border, im_out + border, dataSize - 2*border, width);
- }
- // Writes a 0/1 validity map: im_out[k] is 1 where im[k] differs from undefined_val
- // and 0 where it equals it.
- inline void binarize(uchar* im, uchar* im_out, uchar undefined_val, int dataSize)
- {
-     for (int k = 0; k < dataSize; ++k) {
-         im_out[k] = (im[k] == undefined_val) ? 0 : 1;
-     }
- }
- // Clears im_out[k] to 0 wherever im[k] equals undefined_val.
- // All other entries of im_out are left as they are.
- inline void set_undefined_to_zero(uchar* im, uchar* im_out, uchar undefined_val, int dataSize)
- {
-     for (int k = 0; k < dataSize; ++k) {
-         if (im[k] != undefined_val) continue;
-         im_out[k] = 0;
-     }
- }
- // Sets im_out[k] to undefined_val wherever im[k] is 0.
- // All other entries of im_out are left as they are.
- inline void set_zero_to_undefined(uchar* im, uchar* im_out, uchar undefined_val, int dataSize)
- {
-     for (int k = 0; k < dataSize; ++k) {
-         if (im[k] != 0) continue;
-         im_out[k] = undefined_val;
-     }
- }
- // Copies dataSize bytes from imSrc to imDest.
- // Complete groups of 8 bytes are moved with MMX, the remaining dataSize%8 bytes with a
- // scalar loop. The original skipped that remainder and wrapped its loop counter when
- // dataSize < 8. Does nothing when dataSize <= 0.
- inline void copyMMX(void* imDest, const void* imSrc, int dataSize)
- {
-     if (dataSize <= 0) return;
-     const int blocks = dataSize >> 3;        // complete 8-byte groups
-     if (blocks > 0) {
-         __asm {
-             mov ecx, blocks
-             mov eax, imSrc
-             mov ebx, imDest
-         pixel_loop:
-             movq mm1, [eax]
-             movq [ebx], mm1
-             add eax, 8
-             add ebx, 8
-             dec ecx
-             jnz pixel_loop
-             emms
-         }
-     }
-     const unsigned char* srcBytes = (const unsigned char*)imSrc;
-     unsigned char* dstBytes = (unsigned char*)imDest;
-     for (int i = blocks*8; i < dataSize; ++i) dstBytes[i] = srcBytes[i];
- }
- // Copies dataSize bytes from imSrc to imDest using 16-byte moves.
- // movdqa is an aligned move: both imSrc and imDest must be 16-byte aligned.
- // Complete groups of 16 bytes are moved with movdqa, the remaining dataSize%16 bytes
- // with a scalar loop. The original skipped that remainder and wrapped its loop counter
- // when dataSize < 16. Does nothing when dataSize <= 0.
- inline void copySSE(void* imDest, const void* imSrc, int dataSize)
- {
-     if (dataSize <= 0) return;
-     const int blocks = dataSize >> 4;        // complete 16-byte groups
-     if (blocks > 0) {
-         __asm {
-             mov ecx, blocks
-             mov eax, imSrc
-             mov ebx, imDest
-         pixel_loop:
-             movdqa xmm1, [eax]
-             movdqa [ebx], xmm1
-             add eax, 16
-             add ebx, 16
-             dec ecx
-             jnz pixel_loop
-             emms                // epilogue kept from the original
-         }
-     }
-     const unsigned char* srcBytes = (const unsigned char*)imSrc;
-     unsigned char* dstBytes = (unsigned char*)imDest;
-     for (int i = blocks*16; i < dataSize; ++i) dstBytes[i] = srcBytes[i];
- }
- // Fills imDest[0 .. dataSize-1] with 'value'.
- // movaps is an aligned store: imDest must be 16-byte aligned.
- // Complete groups of 4 floats are written with movaps, the remaining dataSize%4 floats
- // with a scalar loop. The original skipped that remainder and wrapped its loop counter
- // when dataSize < 4. Does nothing when dataSize <= 0.
- inline void setMMX(float* imDest, const float value, int dataSize)
- {
-     if (dataSize <= 0) return;
-     const int quads = dataSize >> 2;         // complete groups of 4 floats
-     if (quads > 0) {
-         __asm {
-             // make 4 copies of the constant 'value' in xmm0
-             movss xmm0, value
-             movss xmm1, xmm0
-             unpcklps xmm0, xmm1
-             movlhps xmm0, xmm0
-             mov ecx, quads
-             mov ebx, imDest
-         pixel_loop:
-             movaps [ebx], xmm0
-             add ebx, 16
-             dec ecx
-             jnz pixel_loop
-             emms                // epilogue kept from the original
-         }
-     }
-     for (int i = quads*4; i < dataSize; ++i) imDest[i] = value;
- }
- // Fills imDest[0 .. dataSize-1] with 'value'.
- // Complete groups of 8 bytes are written with MMX, the remaining dataSize%8 bytes with
- // a scalar loop. The original skipped that remainder and wrapped its loop counter when
- // dataSize < 8. Does nothing when dataSize <= 0.
- inline void setMMX(char* imDest, const char value, int dataSize)
- {
-     if (dataSize <= 0) return;
-     const int blocks = dataSize >> 3;        // complete 8-byte groups
-     if (blocks > 0) {
-         __asm {
-             // setup mm0 with 8 copies of 'value'
-             mov al, value
-             mov ah, al
-             mov bx, ax
-             shl eax, 16
-             mov ax, bx
-             movd mm0, eax
-             movd mm1, eax
-             punpckldq mm0, mm1
-             mov ecx, blocks
-             mov ebx, imDest
-         pixel_loop:
-             movq [ebx], mm0
-             add ebx, 8
-             dec ecx
-             jnz pixel_loop
-             emms
-         }
-     }
-     for (int i = blocks*8; i < dataSize; ++i) imDest[i] = value;
- }
- /*
- void copyRGBAtoRGB(const uchar* imSrc, uchar* imred,uchar* imgreen,uchar* imblue, int dataSize)
- {
- __asm {
- mov esi, dataSize
- shr esi, 3
- mov eax, imSrc
- mov ebx, imred
- mov ecx, imred
- mov edx, imred
- pixel_loop:
- movq mm1, [eax]
- movq [ebx], mm1
- add eax, 8
- add ebx, 8
- add ecx, 8
- add edx, 8
- dec esi
- jnz pixel_loop
- jmp done
- done:
- emms
- }
- }*/
- // Multiplies the bytes of 'im' by 'fact', in place.
- // Each group of 8 bytes is widened to 32-bit integers, converted to float, multiplied
- // by 'fact', converted back with cvtps2pi (rounding as selected in MXCSR) and packed
- // to bytes with saturation.
- // Only complete groups of 8 bytes are processed: the last imageSize%8 bytes are left
- // unchanged, and nothing is done when imageSize < 8 (the original wrapped its loop
- // counter in that case).
- // The bytes are widened against mm7, the register that is actually cleared; the
- // original unpacked against mm0, which is never initialised in this function.
- inline void multiply(uchar* im, float fact, int imageSize)
- {
-     if (imageSize < 8) return;
-     __asm {
-         mov ecx, imageSize
-         shr ecx, 3              // ecx = number of 8-byte groups (>= 1)
-         // make 4 copies of the constant 'fact' in xmm0
-         movss xmm0, fact
-         movss xmm1, xmm0
-         unpcklps xmm0, xmm1
-         movlhps xmm0, xmm0
-         mov eax, im
-         pxor mm7, mm7           // mm7 = 0
-     pixel_loop:
-         movq mm1, [eax]
-         movq mm2, mm1
-         punpcklbw mm2, mm7      // mm2 = bytes 0..3 as words
-         punpckhbw mm1, mm7      // mm1 = bytes 4..7 as words
-         movq mm3, mm2
-         punpckhwd mm3, mm7      // mm3 = bytes 2,3 as dwords
-         punpcklwd mm2, mm7      // mm2 = bytes 0,1 as dwords
-         movq mm4, mm1
-         punpcklwd mm4, mm7      // mm4 = bytes 4,5 as dwords
-         punpckhwd mm1, mm7      // mm1 = bytes 6,7 as dwords
-         // here, the 8 bytes are in d-words [mm1 mm4 mm3 mm2]
-         // -------- bytes 0..3
-         cvtpi2ps xmm3, mm3      // put mm3 in low part of xmm3
-         cvtpi2ps xmm2, mm2      // put mm2 in low part of xmm2
-         movlhps xmm2, xmm3      // xmm2 = [xmm3(low part) xmm2(low part)]
-         mulps xmm2, xmm0
-         cvtps2pi mm2, xmm2      // convert low 2 floats from xmm2 to mm2
-         movhlps xmm3, xmm2      // move high 2 floats from xmm2 to low 2 floats of xmm3
-         cvtps2pi mm3, xmm3      // convert low 2 floats from xmm3 to mm3
-         packssdw mm2, mm3       // mm2 = 4 words for bytes 0..3
-         // -------- bytes 4..7
-         cvtpi2ps xmm4, mm4
-         cvtpi2ps xmm1, mm1
-         movlhps xmm4, xmm1
-         mulps xmm4, xmm0
-         cvtps2pi mm4, xmm4      // convert low 2 floats from xmm4 to mm4
-         movhlps xmm1, xmm4      // move high 2 floats from xmm4 to low 2 floats of xmm1
-         cvtps2pi mm1, xmm1      // convert low 2 floats from xmm1 to mm1
-         packssdw mm4, mm1       // mm4 = 4 words for bytes 4..7
-         // --------
-         packuswb mm2, mm4       // 8 words -> 8 saturated bytes
-         movq [eax], mm2
-         add eax, 8
-         dec ecx
-         jnz pixel_loop
-         emms
-     }
- }
- // Multiplies the bytes of 'imSrc' by 'fact' and stores the result in 'imDest'.
- // Each group of 8 bytes is widened to 32-bit integers, converted to float, multiplied
- // by 'fact', converted back with cvtps2pi (rounding as selected in MXCSR) and packed
- // to bytes with saturation.
- // Only complete groups of 8 bytes are processed: the last imageSize%8 bytes of imDest
- // are not written, and nothing is done when imageSize < 8 (the original wrapped its
- // loop counter in that case).
- // The bytes are widened against mm7, the register that is actually cleared; the
- // original unpacked against mm0, which is never initialised in this function.
- inline void multiply(const uchar* imSrc, uchar* imDest, float fact, int imageSize)
- {
-     if (imageSize < 8) return;
-     __asm {
-         mov ecx, imageSize
-         shr ecx, 3              // ecx = number of 8-byte groups (>= 1)
-         // make 4 copies of the constant 'fact' in xmm0
-         movss xmm0, fact
-         movss xmm1, xmm0
-         unpcklps xmm0, xmm1
-         movlhps xmm0, xmm0
-         mov eax, imSrc
-         mov ebx, imDest
-         pxor mm7, mm7           // mm7 = 0
-     pixel_loop:
-         movq mm1, [eax]
-         movq mm2, mm1
-         punpcklbw mm2, mm7      // mm2 = bytes 0..3 as words
-         punpckhbw mm1, mm7      // mm1 = bytes 4..7 as words
-         movq mm3, mm2
-         punpckhwd mm3, mm7      // mm3 = bytes 2,3 as dwords
-         punpcklwd mm2, mm7      // mm2 = bytes 0,1 as dwords
-         movq mm4, mm1
-         punpcklwd mm4, mm7      // mm4 = bytes 4,5 as dwords
-         punpckhwd mm1, mm7      // mm1 = bytes 6,7 as dwords
-         // here, the 8 bytes are in d-words [mm1 mm4 mm3 mm2]
-         // -------- bytes 0..3
-         cvtpi2ps xmm3, mm3      // put mm3 in low part of xmm3
-         cvtpi2ps xmm2, mm2      // put mm2 in low part of xmm2
-         movlhps xmm2, xmm3      // xmm2 = [xmm3(low part) xmm2(low part)]
-         mulps xmm2, xmm0
-         cvtps2pi mm2, xmm2      // convert low 2 floats from xmm2 to mm2
-         movhlps xmm3, xmm2      // move high 2 floats from xmm2 to low 2 floats of xmm3
-         cvtps2pi mm3, xmm3      // convert low 2 floats from xmm3 to mm3
-         packssdw mm2, mm3       // mm2 = 4 words for bytes 0..3
-         // -------- bytes 4..7
-         cvtpi2ps xmm4, mm4
-         cvtpi2ps xmm1, mm1
-         movlhps xmm4, xmm1
-         mulps xmm4, xmm0
-         cvtps2pi mm4, xmm4      // convert low 2 floats from xmm4 to mm4
-         movhlps xmm1, xmm4      // move high 2 floats from xmm4 to low 2 floats of xmm1
-         cvtps2pi mm1, xmm1      // convert low 2 floats from xmm1 to mm1
-         packssdw mm4, mm1       // mm4 = 4 words for bytes 4..7
-         // --------
-         packuswb mm2, mm4       // 8 words -> 8 saturated bytes
-         movq [ebx], mm2
-         add eax, 8
-         add ebx, 8
-         dec ecx
-         jnz pixel_loop
-         emms
-     }
- }