pnggccrd.c
- }
- else if (pixel_bytes == 8)
- {
- for (i = width; i; i--)
- {
- png_byte v[8];
- int j;
- png_memcpy(v, sptr, 8);
- for (j = 0; j < png_pass_inc[pass]; j++)
- {
- png_memcpy(dp, v, 8);
- dp -= 8;
- }
- sptr -= 8;
- }
- }
- else /* GRR: should never be reached */
- {
- for (i = width; i; i--)
- {
- png_byte v[8];
- int j;
- png_memcpy(v, sptr, pixel_bytes);
- for (j = 0; j < png_pass_inc[pass]; j++)
- {
- png_memcpy(dp, v, pixel_bytes);
- dp -= pixel_bytes;
- }
- sptr -= pixel_bytes;
- }
- }
- } /* end if (MMX not supported) */
- break;
- }
- } /* end switch (row_info->pixel_depth) */
- row_info->width = final_width;
- row_info->rowbytes = PNG_ROWBYTES(row_info->pixel_depth,final_width);
- }
- } /* end png_do_read_interlace() */
- #endif /* PNG_HAVE_ASSEMBLER_READ_INTERLACE */
- #endif /* PNG_READ_INTERLACING_SUPPORTED */
- #if defined(PNG_HAVE_ASSEMBLER_READ_FILTER_ROW)
- #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
- // These variables are utilized in the functions below. They are declared
- // globally here to ensure alignment on 8-byte boundaries.
- union uAll {
- long long use;
- double align;
- } _LBCarryMask = {0x0101010101010101LL},
- _HBClearMask = {0x7f7f7f7f7f7f7f7fLL},
- _ActiveMask, _ActiveMask2, _ActiveMaskEnd, _ShiftBpp, _ShiftRem;
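- // Illustrative note (added; not part of the original pnggccrd.c): the
- // long-long members force 8-byte alignment so movq can load these masks
- // directly. _HBClearMask and _LBCarryMask implement per-byte halving,
- // since MMX has no per-byte shift: shifting the whole quadword right by
- // 1 leaks each byte's lsb into bit 7 of the byte below it, so bit 7 is
- // cleared afterward. A scalar model of the same trick:
- #if 0
- static unsigned long long
- halve_bytes(unsigned long long q)    /* hypothetical helper */
- {
-    /* halves all 8 packed bytes at once; the 0x7f mask (_HBClearMask)
-     * clears the bits that crossed byte boundaries during the shift */
-    return (q >> 1) & 0x7f7f7f7f7f7f7f7fLL;
- }
- #endif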
- #ifdef PNG_THREAD_UNSAFE_OK
- //===========================================================================//
- // //
- // P N G _ R E A D _ F I L T E R _ R O W _ M M X _ A V G //
- // //
- //===========================================================================//
- // Optimized code for PNG Average filter decoder
- static void /* PRIVATE */
- png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
- png_bytep prev_row)
- {
- int bpp;
- int dummy_value_c; // fix 'forbidden register 2 (cx) was spilled' error
- int dummy_value_S;
- int dummy_value_D;
- bpp = (row_info->pixel_depth + 7) >> 3; // get # bytes per pixel
- _FullLength = row_info->rowbytes; // # of bytes to filter
- __asm__ __volatile__ (
- // initialize address pointers and offset
- #ifdef __PIC__
- "pushl %%ebx \n\t" // save index to Global Offset Table
- #endif
- //pre "movl row, %%edi \n\t" // edi: Avg(x)
- "xorl %%ebx, %%ebx \n\t" // ebx: x
- "movl %%edi, %%edx \n\t"
- //pre "movl prev_row, %%esi \n\t" // esi: Prior(x)
- //pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
- "subl %%ecx, %%edx \n\t" // edx: Raw(x-bpp)
- "xorl %%eax,%%eax \n\t"
- // Compute the Raw value for the first bpp bytes
- // Raw(x) = Avg(x) + (Prior(x)/2)
- "avg_rlp: \n\t"
- "movb (%%esi,%%ebx,),%%al \n\t" // load al with Prior(x)
- "incl %%ebx \n\t"
- "shrb %%al \n\t" // divide by 2
- "addb -1(%%edi,%%ebx,),%%al \n\t" // add Avg(x); -1 to offset inc ebx
- //pre "cmpl bpp, %%ebx \n\t" // (bpp is preloaded into ecx)
- "cmpl %%ecx, %%ebx \n\t"
- "movb %%al,-1(%%edi,%%ebx,) \n\t" // write Raw(x); -1 to offset inc ebx
- "jb avg_rlp \n\t" // mov does not affect flags
- // get # of bytes to alignment
- "movl %%edi, _dif \n\t" // take start of row
- "addl %%ebx, _dif \n\t" // add bpp
- "addl $0xf, _dif \n\t" // add 7+8 to incr past alignment bdry
- "andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
- "subl %%edi, _dif \n\t" // subtract from start => value ebx at
- "jz avg_go \n\t" // alignment
- // fix alignment
- // Compute the Raw value for the bytes up to the alignment boundary
- // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
- "xorl %%ecx, %%ecx \n\t"
- "avg_lp1: \n\t"
- "xorl %%eax, %%eax \n\t"
- "movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
- "movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
- "addw %%cx, %%ax \n\t"
- "incl %%ebx \n\t"
- "shrw %%ax \n\t" // divide by 2
- "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
- "cmpl _dif, %%ebx \n\t" // check if at alignment boundary
- "movb %%al, -1(%%edi,%%ebx,) \n\t" // write Raw(x); -1 to offset inc ebx
- "jb avg_lp1 \n\t" // repeat until at alignment boundary
- "avg_go: \n\t"
- "movl _FullLength, %%eax \n\t"
- "movl %%eax, %%ecx \n\t"
- "subl %%ebx, %%eax \n\t" // subtract alignment fix
- "andl $0x00000007, %%eax \n\t" // calc bytes over mult of 8
- "subl %%eax, %%ecx \n\t" // drop over bytes from original length
- "movl %%ecx, _MMXLength \n\t"
- #ifdef __PIC__
- "popl %%ebx \n\t" // restore index to Global Offset Table
- #endif
- : "=c" (dummy_value_c), // output regs (dummy)
- "=S" (dummy_value_S),
- "=D" (dummy_value_D)
- : "0" (bpp), // ecx // input regs
- "1" (prev_row), // esi
- "2" (row) // edi
- : "%eax", "%edx" // clobber list
- #ifndef __PIC__
- , "%ebx"
- #endif
- // GRR: INCLUDE "memory" as clobbered? (_dif, _MMXLength)
- // (seems to work fine without...)
- );
- // now do the math for the rest of the row
- switch (bpp)
- {
- case 3:
- {
- _ActiveMask.use = 0x0000000000ffffffLL;
- _ShiftBpp.use = 24; // == 3 * 8
- _ShiftRem.use = 40; // == 64 - 24
- __asm__ __volatile__ (
- // re-init address pointers and offset
- "movq _ActiveMask, %%mm7 nt"
- "movl _dif, %%ecx nt" // ecx: x = offset to
- "movq _LBCarryMask, %%mm5 nt" // alignment boundary
- // preload "movl row, %%edi nt" // edi: Avg(x)
- "movq _HBClearMask, %%mm4 nt"
- // preload "movl prev_row, %%esi nt" // esi: Prior(x)
- // prime the pump: load the first Raw(x-bpp) data set
- "movq -8(%%edi,%%ecx,), %%mm2 nt" // load previous aligned 8 bytes
- // (correct pos. in loop below)
- "avg_3lp: nt"
- "movq (%%edi,%%ecx,), %%mm0 nt" // load mm0 with Avg(x)
- "movq %%mm5, %%mm3 nt"
- "psrlq _ShiftRem, %%mm2 nt" // correct position Raw(x-bpp)
- // data
- "movq (%%esi,%%ecx,), %%mm1 nt" // load mm1 with Prior(x)
- "movq %%mm7, %%mm6 nt"
- "pand %%mm1, %%mm3 nt" // get lsb for each prev_row byte
- "psrlq $1, %%mm1 nt" // divide prev_row bytes by 2
- "pand %%mm4, %%mm1 nt" // clear invalid bit 7 of each
- // byte
- "paddb %%mm1, %%mm0 nt" // add (Prev_row/2) to Avg for
- // each byte
- // add 1st active group (Raw(x-bpp)/2) to average with LBCarry
- "movq %%mm3, %%mm1 nt" // now use mm1 for getting
- // LBCarrys
- "pand %%mm2, %%mm1 nt" // get LBCarrys for each byte
- // where both
- // lsb's were == 1 (only valid for active group)
- "psrlq $1, %%mm2 nt" // divide raw bytes by 2
- "pand %%mm4, %%mm2 nt" // clear invalid bit 7 of each
- // byte
- "paddb %%mm1, %%mm2 nt" // add LBCarrys to (Raw(x-bpp)/2)
- // for each byte
- "pand %%mm6, %%mm2 nt" // leave only Active Group 1
- // bytes to add to Avg
- "paddb %%mm2, %%mm0 nt" // add (Raw/2) + LBCarrys to
- // Avg for each Active
- // byte
- // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
- "psllq _ShiftBpp, %%mm6 nt" // shift the mm6 mask to cover
- // bytes 3-5
- "movq %%mm0, %%mm2 nt" // mov updated Raws to mm2
- "psllq _ShiftBpp, %%mm2 nt" // shift data to pos. correctly
- "movq %%mm3, %%mm1 nt" // now use mm1 for getting
- // LBCarrys
- "pand %%mm2, %%mm1 nt" // get LBCarrys for each byte
- // where both
- // lsb's were == 1 (only valid for active group)
- "psrlq $1, %%mm2 nt" // divide raw bytes by 2
- "pand %%mm4, %%mm2 nt" // clear invalid bit 7 of each
- // byte
- "paddb %%mm1, %%mm2 nt" // add LBCarrys to (Raw(x-bpp)/2)
- // for each byte
- "pand %%mm6, %%mm2 nt" // leave only Active Group 2
- // bytes to add to Avg
- "paddb %%mm2, %%mm0 nt" // add (Raw/2) + LBCarrys to
- // Avg for each Active
- // byte
- // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
- "psllq _ShiftBpp, %%mm6 nt" // shift mm6 mask to cover last
- // two
- // bytes
- "movq %%mm0, %%mm2 nt" // mov updated Raws to mm2
- "psllq _ShiftBpp, %%mm2 nt" // shift data to pos. correctly
- // Data only needs to be shifted once here to
- // get the correct x-bpp offset.
- "movq %%mm3, %%mm1 nt" // now use mm1 for getting
- // LBCarrys
- "pand %%mm2, %%mm1 nt" // get LBCarrys for each byte
- // where both
- // lsb's were == 1 (only valid for active group)
- "psrlq $1, %%mm2 nt" // divide raw bytes by 2
- "pand %%mm4, %%mm2 nt" // clear invalid bit 7 of each
- // byte
- "paddb %%mm1, %%mm2 nt" // add LBCarrys to (Raw(x-bpp)/2)
- // for each byte
- "pand %%mm6, %%mm2 nt" // leave only Active Group 2
- // bytes to add to Avg
- "addl $8, %%ecx nt"
- "paddb %%mm2, %%mm0 nt" // add (Raw/2) + LBCarrys to
- // Avg for each Active
- // byte
- // now ready to write back to memory
- "movq %%mm0, -8(%%edi,%%ecx,) nt"
- // move updated Raw(x) to use as Raw(x-bpp) for next loop
- "cmpl _MMXLength, %%ecx nt"
- "movq %%mm0, %%mm2 nt" // mov updated Raw(x) to mm2
- "jb avg_3lp nt"
- : "=S" (dummy_value_S), // output regs (dummy)
- "=D" (dummy_value_D)
- : "0" (prev_row), // esi // input regs
- "1" (row) // edi
- : "%ecx" // clobber list
- #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
- , "%mm0", "%mm1", "%mm2", "%mm3"
- , "%mm4", "%mm5", "%mm6", "%mm7"
- #endif
- );
- }
- break; // end 3 bpp
- case 6:
- case 4:
- //case 7: // who wrote this? PNG doesn't support 5 or 7 bytes/pixel
- //case 5: // GRR BOGUS
- {
- _ActiveMask.use = 0xffffffffffffffffLL; // use shift below to clear
- // appropriate inactive bytes
- _ShiftBpp.use = bpp << 3;
- _ShiftRem.use = 64 - _ShiftBpp.use;
- __asm__ __volatile__ (
- "movq _HBClearMask, %%mm4 nt"
- // re-init address pointers and offset
- "movl _dif, %%ecx nt" // ecx: x = offset to
- // alignment boundary
- // load _ActiveMask and clear all bytes except for 1st active group
- "movq _ActiveMask, %%mm7 nt"
- // preload "movl row, %%edi nt" // edi: Avg(x)
- "psrlq _ShiftRem, %%mm7 nt"
- // preload "movl prev_row, %%esi nt" // esi: Prior(x)
- "movq %%mm7, %%mm6 nt"
- "movq _LBCarryMask, %%mm5 nt"
- "psllq _ShiftBpp, %%mm6 nt" // create mask for 2nd active
- // group
- // prime the pump: load the first Raw(x-bpp) data set
- "movq -8(%%edi,%%ecx,), %%mm2 nt" // load previous aligned 8 bytes
- // (we correct pos. in loop below)
- "avg_4lp: nt"
- "movq (%%edi,%%ecx,), %%mm0 nt"
- "psrlq _ShiftRem, %%mm2 nt" // shift data to pos. correctly
- "movq (%%esi,%%ecx,), %%mm1 nt"
- // add (Prev_row/2) to average
- "movq %%mm5, %%mm3 nt"
- "pand %%mm1, %%mm3 nt" // get lsb for each prev_row byte
- "psrlq $1, %%mm1 nt" // divide prev_row bytes by 2
- "pand %%mm4, %%mm1 nt" // clear invalid bit 7 of each
- // byte
- "paddb %%mm1, %%mm0 nt" // add (Prev_row/2) to Avg for
- // each byte
- // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
- "movq %%mm3, %%mm1 nt" // now use mm1 for getting
- // LBCarrys
- "pand %%mm2, %%mm1 nt" // get LBCarrys for each byte
- // where both
- // lsb's were == 1 (only valid for active group)
- "psrlq $1, %%mm2 nt" // divide raw bytes by 2
- "pand %%mm4, %%mm2 nt" // clear invalid bit 7 of each
- // byte
- "paddb %%mm1, %%mm2 nt" // add LBCarrys to (Raw(x-bpp)/2)
- // for each byte
- "pand %%mm7, %%mm2 nt" // leave only Active Group 1
- // bytes to add to Avg
- "paddb %%mm2, %%mm0 nt" // add (Raw/2) + LBCarrys to Avg
- // for each Active
- // byte
- // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
- "movq %%mm0, %%mm2 nt" // mov updated Raws to mm2
- "psllq _ShiftBpp, %%mm2 nt" // shift data to pos. correctly
- "addl $8, %%ecx nt"
- "movq %%mm3, %%mm1 nt" // now use mm1 for getting
- // LBCarrys
- "pand %%mm2, %%mm1 nt" // get LBCarrys for each byte
- // where both
- // lsb's were == 1 (only valid for active group)
- "psrlq $1, %%mm2 nt" // divide raw bytes by 2
- "pand %%mm4, %%mm2 nt" // clear invalid bit 7 of each
- // byte
- "paddb %%mm1, %%mm2 nt" // add LBCarrys to (Raw(x-bpp)/2)
- // for each byte
- "pand %%mm6, %%mm2 nt" // leave only Active Group 2
- // bytes to add to Avg
- "paddb %%mm2, %%mm0 nt" // add (Raw/2) + LBCarrys to
- // Avg for each Active
- // byte
- "cmpl _MMXLength, %%ecx nt"
- // now ready to write back to memory
- "movq %%mm0, -8(%%edi,%%ecx,) nt"
- // prep Raw(x-bpp) for next loop
- "movq %%mm0, %%mm2 nt" // mov updated Raws to mm2
- "jb avg_4lp nt"
- : "=S" (dummy_value_S), // output regs (dummy)
- "=D" (dummy_value_D)
- : "0" (prev_row), // esi // input regs
- "1" (row) // edi
- : "%ecx" // clobber list
- #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
- , "%mm0", "%mm1", "%mm2", "%mm3"
- , "%mm4", "%mm5", "%mm6", "%mm7"
- #endif
- );
- }
- break; // end 4,6 bpp
- case 2:
- {
- _ActiveMask.use = 0x000000000000ffffLL;
- _ShiftBpp.use = 16; // == 2 * 8
- _ShiftRem.use = 48; // == 64 - 16
- __asm__ __volatile__ (
- // load _ActiveMask
- "movq _ActiveMask, %%mm7 nt"
- // re-init address pointers and offset
- "movl _dif, %%ecx nt" // ecx: x = offset to alignment
- // boundary
- "movq _LBCarryMask, %%mm5 nt"
- // preload "movl row, %%edi nt" // edi: Avg(x)
- "movq _HBClearMask, %%mm4 nt"
- // preload "movl prev_row, %%esi nt" // esi: Prior(x)
- // prime the pump: load the first Raw(x-bpp) data set
- "movq -8(%%edi,%%ecx,), %%mm2 nt" // load previous aligned 8 bytes
- // (we correct pos. in loop below)
- "avg_2lp: nt"
- "movq (%%edi,%%ecx,), %%mm0 nt"
- "psrlq _ShiftRem, %%mm2 nt" // shift data to pos. correctly
- "movq (%%esi,%%ecx,), %%mm1 nt" // (GRR BUGFIX: was psllq)
- // add (Prev_row/2) to average
- "movq %%mm5, %%mm3 nt"
- "pand %%mm1, %%mm3 nt" // get lsb for each prev_row byte
- "psrlq $1, %%mm1 nt" // divide prev_row bytes by 2
- "pand %%mm4, %%mm1 nt" // clear invalid bit 7 of each
- // byte
- "movq %%mm7, %%mm6 nt"
- "paddb %%mm1, %%mm0 nt" // add (Prev_row/2) to Avg for
- // each byte
- // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
- "movq %%mm3, %%mm1 nt" // now use mm1 for getting
- // LBCarrys
- "pand %%mm2, %%mm1 nt" // get LBCarrys for each byte
- // where both
- // lsb's were == 1 (only valid
- // for active group)
- "psrlq $1, %%mm2 nt" // divide raw bytes by 2
- "pand %%mm4, %%mm2 nt" // clear invalid bit 7 of each
- // byte
- "paddb %%mm1, %%mm2 nt" // add LBCarrys to (Raw(x-bpp)/2)
- // for each byte
- "pand %%mm6, %%mm2 nt" // leave only Active Group 1
- // bytes to add to Avg
- "paddb %%mm2, %%mm0 nt" // add (Raw/2) + LBCarrys to Avg
- // for each Active byte
- // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
- "psllq _ShiftBpp, %%mm6 nt" // shift the mm6 mask to cover
- // bytes 2 & 3
- "movq %%mm0, %%mm2 nt" // mov updated Raws to mm2
- "psllq _ShiftBpp, %%mm2 nt" // shift data to pos. correctly
- "movq %%mm3, %%mm1 nt" // now use mm1 for getting
- // LBCarrys
- "pand %%mm2, %%mm1 nt" // get LBCarrys for each byte
- // where both
- // lsb's were == 1 (only valid
- // for active group)
- "psrlq $1, %%mm2 nt" // divide raw bytes by 2
- "pand %%mm4, %%mm2 nt" // clear invalid bit 7 of each
- // byte
- "paddb %%mm1, %%mm2 nt" // add LBCarrys to (Raw(x-bpp)/2)
- // for each byte
- "pand %%mm6, %%mm2 nt" // leave only Active Group 2
- // bytes to add to Avg
- "paddb %%mm2, %%mm0 nt" // add (Raw/2) + LBCarrys to
- // Avg for each Active byte
- // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
- "psllq _ShiftBpp, %%mm6 nt" // shift the mm6 mask to cover
- // bytes 4 & 5
- "movq %%mm0, %%mm2 nt" // mov updated Raws to mm2
- "psllq _ShiftBpp, %%mm2 nt" // shift data to pos. correctly
- "movq %%mm3, %%mm1 nt" // now use mm1 for getting
- // LBCarrys
- "pand %%mm2, %%mm1 nt" // get LBCarrys for each byte
- // where both lsb's were == 1
- // (only valid for active group)
- "psrlq $1, %%mm2 nt" // divide raw bytes by 2
- "pand %%mm4, %%mm2 nt" // clear invalid bit 7 of each
- // byte
- "paddb %%mm1, %%mm2 nt" // add LBCarrys to (Raw(x-bpp)/2)
- // for each byte
- "pand %%mm6, %%mm2 nt" // leave only Active Group 2
- // bytes to add to Avg
- "paddb %%mm2, %%mm0 nt" // add (Raw/2) + LBCarrys to
- // Avg for each Active byte
- // add 4th active group (Raw(x-bpp)/2) to average with _LBCarry
- "psllq _ShiftBpp, %%mm6 nt" // shift the mm6 mask to cover
- // bytes 6 & 7
- "movq %%mm0, %%mm2 nt" // mov updated Raws to mm2
- "psllq _ShiftBpp, %%mm2 nt" // shift data to pos. correctly
- "addl $8, %%ecx nt"
- "movq %%mm3, %%mm1 nt" // now use mm1 for getting
- // LBCarrys
- "pand %%mm2, %%mm1 nt" // get LBCarrys for each byte
- // where both
- // lsb's were == 1 (only valid
- // for active group)
- "psrlq $1, %%mm2 nt" // divide raw bytes by 2
- "pand %%mm4, %%mm2 nt" // clear invalid bit 7 of each
- // byte
- "paddb %%mm1, %%mm2 nt" // add LBCarrys to (Raw(x-bpp)/2)
- // for each byte
- "pand %%mm6, %%mm2 nt" // leave only Active Group 2
- // bytes to add to Avg
- "paddb %%mm2, %%mm0 nt" // add (Raw/2) + LBCarrys to
- // Avg for each Active byte
- "cmpl _MMXLength, %%ecx nt"
- // now ready to write back to memory
- "movq %%mm0, -8(%%edi,%%ecx,) nt"
- // prep Raw(x-bpp) for next loop
- "movq %%mm0, %%mm2 nt" // mov updated Raws to mm2
- "jb avg_2lp nt"
- : "=S" (dummy_value_S), // output regs (dummy)
- "=D" (dummy_value_D)
- : "0" (prev_row), // esi // input regs
- "1" (row) // edi
- : "%ecx" // clobber list
- #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
- , "%mm0", "%mm1", "%mm2", "%mm3"
- , "%mm4", "%mm5", "%mm6", "%mm7"
- #endif
- );
- }
- break; // end 2 bpp
- case 1:
- {
- __asm__ __volatile__ (
- // re-init address pointers and offset
- #ifdef __PIC__
- "pushl %%ebx \n\t" // save Global Offset Table index
- #endif
- "movl _dif, %%ebx \n\t" // ebx: x = offset to alignment
- // boundary
- // preload "movl row, %%edi \n\t" // edi: Avg(x)
- "cmpl _FullLength, %%ebx \n\t" // test if offset at end of array
- "jnb avg_1end \n\t"
- // do Avg decode for remaining bytes
- // preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
- "movl %%edi, %%edx \n\t"
- // preload "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
- "subl %%ecx, %%edx \n\t" // edx: Raw(x-bpp)
- "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx
- // in loop below
- "avg_1lp: \n\t"
- // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
- "xorl %%eax, %%eax \n\t"
- "movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
- "movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
- "addw %%cx, %%ax \n\t"
- "incl %%ebx \n\t"
- "shrw %%ax \n\t" // divide by 2
- "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset
- // inc ebx
- "cmpl _FullLength, %%ebx \n\t" // check if at end of array
- "movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x);
- // mov does not affect flags; -1 to offset inc ebx
- "jb avg_1lp \n\t"
- "avg_1end: \n\t"
- #ifdef __PIC__
- "popl %%ebx \n\t" // Global Offset Table index
- #endif
- : "=c" (dummy_value_c), // output regs (dummy)
- "=S" (dummy_value_S),
- "=D" (dummy_value_D)
- : "0" (bpp), // ecx // input regs
- "1" (prev_row), // esi
- "2" (row) // edi
- : "%eax", "%edx" // clobber list
- #ifndef __PIC__
- , "%ebx"
- #endif
- );
- }
- return; // end 1 bpp
- case 8:
- {
- __asm__ __volatile__ (
- // re-init address pointers and offset
- "movl _dif, %%ecx \n\t" // ecx: x == offset to alignment
- "movq _LBCarryMask, %%mm5 \n\t" // boundary
- // preload "movl row, %%edi \n\t" // edi: Avg(x)
- "movq _HBClearMask, %%mm4 \n\t"
- // preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
- // prime the pump: load the first Raw(x-bpp) data set
- "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
- // (NO NEED to correct pos. in loop below)
- "avg_8lp: \n\t"
- "movq (%%edi,%%ecx,), %%mm0 \n\t"
- "movq %%mm5, %%mm3 \n\t"
- "movq (%%esi,%%ecx,), %%mm1 \n\t"
- "addl $8, %%ecx \n\t"
- "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
- "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
- "pand %%mm2, %%mm3 \n\t" // get LBCarrys for each byte
- // where both lsb's were == 1
- "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
- "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7, each byte
- "paddb %%mm3, %%mm0 \n\t" // add LBCarrys to Avg, each byte
- "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7, each byte
- "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg, each
- "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) to Avg for each
- "cmpl _MMXLength, %%ecx \n\t"
- "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
- "movq %%mm0, %%mm2 \n\t" // reuse as Raw(x-bpp)
- "jb avg_8lp \n\t"
- : "=S" (dummy_value_S), // output regs (dummy)
- "=D" (dummy_value_D)
- : "0" (prev_row), // esi // input regs
- "1" (row) // edi
- : "%ecx" // clobber list
- #if 0 /* %mm0, ..., %mm5 not supported by gcc 2.7.2.3 or egcs 1.1 */
- , "%mm0", "%mm1", "%mm2"
- , "%mm3", "%mm4", "%mm5"
- #endif
- );
- }
- break; // end 8 bpp
- default: // bpp greater than 8 (!= 1,2,3,4,[5],6,[7],8)
- {
- #ifdef PNG_DEBUG
- // GRR: PRINT ERROR HERE: SHOULD NEVER BE REACHED
- png_debug(1,
- "Internal logic error in pnggccrd (png_read_filter_row_mmx_avg())\n");
- #endif
- #if 0
- __asm__ __volatile__ (
- "movq _LBCarryMask, %%mm5 nt"
- // re-init address pointers and offset
- "movl _dif, %%ebx nt" // ebx: x = offset to
- // alignment boundary
- "movl row, %%edi nt" // edi: Avg(x)
- "movq _HBClearMask, %%mm4 nt"
- "movl %%edi, %%edx nt"
- "movl prev_row, %%esi nt" // esi: Prior(x)
- "subl bpp, %%edx nt" // edx: Raw(x-bpp)
- "avg_Alp: nt"
- "movq (%%edi,%%ebx,), %%mm0 nt"
- "movq %%mm5, %%mm3 nt"
- "movq (%%esi,%%ebx,), %%mm1 nt"
- "pand %%mm1, %%mm3 nt" // get lsb for each prev_row byte
- "movq (%%edx,%%ebx,), %%mm2 nt"
- "psrlq $1, %%mm1 nt" // divide prev_row bytes by 2
- "pand %%mm2, %%mm3 nt" // get LBCarrys for each byte
- // where both lsb's were == 1
- "psrlq $1, %%mm2 nt" // divide raw bytes by 2
- "pand %%mm4, %%mm1 nt" // clear invalid bit 7 of each
- // byte
- "paddb %%mm3, %%mm0 nt" // add LBCarrys to Avg for each
- // byte
- "pand %%mm4, %%mm2 nt" // clear invalid bit 7 of each
- // byte
- "paddb %%mm1, %%mm0 nt" // add (Prev_row/2) to Avg for
- // each byte
- "addl $8, %%ebx nt"
- "paddb %%mm2, %%mm0 nt" // add (Raw/2) to Avg for each
- // byte
- "cmpl _MMXLength, %%ebx nt"
- "movq %%mm0, -8(%%edi,%%ebx,) nt"
- "jb avg_Alp nt"
- : // FIXASM: output regs/vars go here, e.g.: "=m" (memory_var)
- : // FIXASM: input regs, e.g.: "c" (count), "S" (src), "D" (dest)
- : "%ebx", "%edx", "%edi", "%esi" // CHECKASM: clobber list
- );
- #endif /* 0 - NEVER REACHED */
- }
- break;
- } // end switch (bpp)
- __asm__ __volatile__ (
- // MMX acceleration complete; now do clean-up
- // check if any remaining bytes left to decode
- #ifdef __PIC__
- "pushl %%ebx \n\t" // save index to Global Offset Table
- #endif
- "movl _MMXLength, %%ebx \n\t" // ebx: x == offset bytes after MMX
- //pre "movl row, %%edi \n\t" // edi: Avg(x)
- "cmpl _FullLength, %%ebx \n\t" // test if offset at end of array
- "jnb avg_end \n\t"
- // do Avg decode for remaining bytes
- //pre "movl prev_row, %%esi \n\t" // esi: Prior(x)
- "movl %%edi, %%edx \n\t"
- //pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
- "subl %%ecx, %%edx \n\t" // edx: Raw(x-bpp)
- "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx below
- "avg_lp2: \n\t"
- // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
- "xorl %%eax, %%eax \n\t"
- "movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
- "movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
- "addw %%cx, %%ax \n\t"
- "incl %%ebx \n\t"
- "shrw %%ax \n\t" // divide by 2
- "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
- "cmpl _FullLength, %%ebx \n\t" // check if at end of array
- "movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x) [mov does not
- "jb avg_lp2 \n\t" // affect flags; -1 to offset inc ebx]
- "avg_end: \n\t"
- "EMMS \n\t" // end MMX; prep for poss. FP instrs.
- #ifdef __PIC__
- "popl %%ebx \n\t" // restore index to Global Offset Table
- #endif
- : "=c" (dummy_value_c), // output regs (dummy)
- "=S" (dummy_value_S),
- "=D" (dummy_value_D)
- : "0" (bpp), // ecx // input regs
- "1" (prev_row), // esi
- "2" (row) // edi
- : "%eax", "%edx" // clobber list
- #ifndef __PIC__
- , "%ebx"
- #endif
- );
- } /* end png_read_filter_row_mmx_avg() */
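- // Reference sketch (added for clarity; not part of the original file):
- // the plain-C equivalent of the Average defilter above. The helper
- // name avg_defilter_c is hypothetical.
- #if 0
- static void
- avg_defilter_c(unsigned char *row, const unsigned char *prev_row,
-    unsigned long rowbytes, int bpp)
- {
-    unsigned long i;
-    for (i = 0; i < rowbytes; i++)
-    {
-       // Raw(x-bpp) is taken as 0 for the first bpp bytes of the row
-       int left = (i < (unsigned long)bpp) ? 0 : row[i - bpp];
-       // Raw(x) = Avg(x) + floor((Raw(x-bpp) + Prior(x)) / 2), mod 256
-       row[i] = (unsigned char)(row[i] + ((left + prev_row[i]) >> 1));
-    }
- }
- #endif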
- #endif
- #ifdef PNG_THREAD_UNSAFE_OK
- //===========================================================================//
- // //
- // P N G _ R E A D _ F I L T E R _ R O W _ M M X _ P A E T H //
- // //
- //===========================================================================//
- // Optimized code for PNG Paeth filter decoder
- static void /* PRIVATE */
- png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
- png_bytep prev_row)
- {
- int bpp;
- int dummy_value_c; // fix 'forbidden register 2 (cx) was spilled' error
- int dummy_value_S;
- int dummy_value_D;
- bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
- _FullLength = row_info->rowbytes; // # of bytes to filter
- __asm__ __volatile__ (
- #ifdef __PIC__
- "pushl %%ebx \n\t" // save index to Global Offset Table
- #endif
- "xorl %%ebx, %%ebx \n\t" // ebx: x offset
- //pre "movl row, %%edi \n\t"
- "xorl %%edx, %%edx \n\t" // edx: x-bpp offset
- //pre "movl prev_row, %%esi \n\t"
- "xorl %%eax, %%eax \n\t"
- // Compute the Raw value for the first bpp bytes
- // Note: the formula works out to be always
- // Paeth(x) = Raw(x) + Prior(x) where x < bpp
- "paeth_rlp: \n\t"
- "movb (%%edi,%%ebx,), %%al \n\t"
- "addb (%%esi,%%ebx,), %%al \n\t"
- "incl %%ebx \n\t"
- //pre "cmpl bpp, %%ebx \n\t" // (bpp is preloaded into ecx)
- "cmpl %%ecx, %%ebx \n\t"
- "movb %%al, -1(%%edi,%%ebx,) \n\t"
- "jb paeth_rlp \n\t"
- // get # of bytes to alignment
- "movl %%edi, _dif \n\t" // take start of row
- "addl %%ebx, _dif \n\t" // add bpp
- "xorl %%ecx, %%ecx \n\t"
- "addl $0xf, _dif \n\t" // add 7 + 8 to incr past alignment
- // boundary
- "andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
- "subl %%edi, _dif \n\t" // subtract from start ==> value ebx
- // at alignment
- "jz paeth_go \n\t"
- // fix alignment
- "paeth_lp1: \n\t"
- "xorl %%eax, %%eax \n\t"
- // pav = p - a = (a + b - c) - a = b - c
- "movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
- "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
- "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
- "movl %%eax, _patemp \n\t" // Save pav for later use
- "xorl %%eax, %%eax \n\t"
- // pbv = p - b = (a + b - c) - b = a - c
- "movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
- "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
- "movl %%eax, %%ecx \n\t"
- // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
- "addl _patemp, %%eax \n\t" // pcv = pav + pbv
- // pc = abs(pcv)
- "testl $0x80000000, %%eax \n\t"
- "jz paeth_pca \n\t"
- "negl %%eax \n\t" // reverse sign of neg values
- "paeth_pca: \n\t"
- "movl %%eax, _pctemp \n\t" // save pc for later use
- // pb = abs(pbv)
- "testl $0x80000000, %%ecx \n\t"
- "jz paeth_pba \n\t"
- "negl %%ecx \n\t" // reverse sign of neg values
- "paeth_pba: \n\t"
- "movl %%ecx, _pbtemp \n\t" // save pb for later use
- // pa = abs(pav)
- "movl _patemp, %%eax \n\t"
- "testl $0x80000000, %%eax \n\t"
- "jz paeth_paa \n\t"
- "negl %%eax \n\t" // reverse sign of neg values
- "paeth_paa: \n\t"
- "movl %%eax, _patemp \n\t" // save pa for later use
- // test if pa <= pb
- "cmpl %%ecx, %%eax \n\t"
- "jna paeth_abb \n\t"
- // pa > pb; now test if pb <= pc
- "cmpl _pctemp, %%ecx \n\t"
- "jna paeth_bbc \n\t"
- // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
- "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
- "jmp paeth_paeth \n\t"
- "paeth_bbc: \n\t"
- // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
- "movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
- "jmp paeth_paeth \n\t"
- "paeth_abb: \n\t"
- // pa <= pb; now test if pa <= pc
- "cmpl _pctemp, %%eax \n\t"
- "jna paeth_abc \n\t"
- // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
- "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
- "jmp paeth_paeth \n\t"
- "paeth_abc: \n\t"
- // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
- "movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
- "paeth_paeth: \n\t"
- "incl %%ebx \n\t"
- "incl %%edx \n\t"
- // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
- "addb %%cl, -1(%%edi,%%ebx,) \n\t"
- "cmpl _dif, %%ebx \n\t"
- "jb paeth_lp1 \n\t"
- "paeth_go: \n\t"
- "movl _FullLength, %%ecx \n\t"
- "movl %%ecx, %%eax \n\t"
- "subl %%ebx, %%eax \n\t" // subtract alignment fix
- "andl $0x00000007, %%eax \n\t" // calc bytes over mult of 8
- "subl %%eax, %%ecx \n\t" // drop over bytes from original length
- "movl %%ecx, _MMXLength \n\t"
- #ifdef __PIC__
- "popl %%ebx \n\t" // restore index to Global Offset Table
- #endif
- : "=c" (dummy_value_c), // output regs (dummy)
- "=S" (dummy_value_S),
- "=D" (dummy_value_D)
- : "0" (bpp), // ecx // input regs
- "1" (prev_row), // esi
- "2" (row) // edi
- : "%eax", "%edx" // clobber list
- #ifndef __PIC__
- , "%ebx"
- #endif
- );
- // now do the math for the rest of the row
- switch (bpp)
- {
- case 3:
- {
- _ActiveMask.use = 0x0000000000ffffffLL;
- _ActiveMaskEnd.use = 0xffff000000000000LL;
- _ShiftBpp.use = 24; // == bpp(3) * 8
- _ShiftRem.use = 40; // == 64 - 24
- __asm__ __volatile__ (
- "movl _dif, %%ecx nt"
- // preload "movl row, %%edi nt"
- // preload "movl prev_row, %%esi nt"
- "pxor %%mm0, %%mm0 nt"
- // prime the pump: load the first Raw(x-bpp) data set
- "movq -8(%%edi,%%ecx,), %%mm1 nt"
- "paeth_3lp: nt"
- "psrlq _ShiftRem, %%mm1 nt" // shift last 3 bytes to 1st
- // 3 bytes
- "movq (%%esi,%%ecx,), %%mm2 nt" // load b=Prior(x)
- "punpcklbw %%mm0, %%mm1 nt" // unpack High bytes of a
- "movq -8(%%esi,%%ecx,), %%mm3 nt" // prep c=Prior(x-bpp) bytes
- "punpcklbw %%mm0, %%mm2 nt" // unpack High bytes of b
- "psrlq _ShiftRem, %%mm3 nt" // shift last 3 bytes to 1st
- // 3 bytes
- // pav = p - a = (a + b - c) - a = b - c
- "movq %%mm2, %%mm4 nt"
- "punpcklbw %%mm0, %%mm3 nt" // unpack High bytes of c
- // pbv = p - b = (a + b - c) - b = a - c
- "movq %%mm1, %%mm5 nt"
- "psubw %%mm3, %%mm4 nt"
- "pxor %%mm7, %%mm7 nt"
- // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
- "movq %%mm4, %%mm6 nt"
- "psubw %%mm3, %%mm5 nt"
- // pa = abs(p-a) = abs(pav)
- // pb = abs(p-b) = abs(pbv)
- // pc = abs(p-c) = abs(pcv)
- "pcmpgtw %%mm4, %%mm0 nt" // create mask pav bytes < 0
- "paddw %%mm5, %%mm6 nt"
- "pand %%mm4, %%mm0 nt" // only pav bytes < 0 in mm7
- "pcmpgtw %%mm5, %%mm7 nt" // create mask pbv bytes < 0
- "psubw %%mm0, %%mm4 nt"
- "pand %%mm5, %%mm7 nt" // only pbv bytes < 0 in mm0
- "psubw %%mm0, %%mm4 nt"
- "psubw %%mm7, %%mm5 nt"
- "pxor %%mm0, %%mm0 nt"
- "pcmpgtw %%mm6, %%mm0 nt" // create mask pcv bytes < 0
- "pand %%mm6, %%mm0 nt" // only pav bytes < 0 in mm7
- "psubw %%mm7, %%mm5 nt"
- "psubw %%mm0, %%mm6 nt"
- // test pa <= pb
- "movq %%mm4, %%mm7 nt"
- "psubw %%mm0, %%mm6 nt"
- "pcmpgtw %%mm5, %%mm7 nt" // pa > pb?
- "movq %%mm7, %%mm0 nt"
- // use mm7 mask to merge pa & pb
- "pand %%mm7, %%mm5 nt"
- // use mm0 mask copy to merge a & b
- "pand %%mm0, %%mm2 nt"
- "pandn %%mm4, %%mm7 nt"
- "pandn %%mm1, %%mm0 nt"
- "paddw %%mm5, %%mm7 nt"
- "paddw %%mm2, %%mm0 nt"
- // test ((pa <= pb)? pa:pb) <= pc
- "pcmpgtw %%mm6, %%mm7 nt" // pab > pc?
- "pxor %%mm1, %%mm1 nt"
- "pand %%mm7, %%mm3 nt"
- "pandn %%mm0, %%mm7 nt"
- "paddw %%mm3, %%mm7 nt"
- "pxor %%mm0, %%mm0 nt"
- "packuswb %%mm1, %%mm7 nt"
- "movq (%%esi,%%ecx,), %%mm3 nt" // load c=Prior(x-bpp)
- "pand _ActiveMask, %%mm7 nt"
- "movq %%mm3, %%mm2 nt" // load b=Prior(x) step 1
- "paddb (%%edi,%%ecx,), %%mm7 nt" // add Paeth predictor with Raw(x)
- "punpcklbw %%mm0, %%mm3 nt" // unpack High bytes of c
- "movq %%mm7, (%%edi,%%ecx,) nt" // write back updated value
- "movq %%mm7, %%mm1 nt" // now mm1 will be used as
- // Raw(x-bpp)
- // now do Paeth for 2nd set of bytes (3-5)
- "psrlq _ShiftBpp, %%mm2 nt" // load b=Prior(x) step 2
- "punpcklbw %%mm0, %%mm1 nt" // unpack High bytes of a
- "pxor %%mm7, %%mm7 nt"
- "punpcklbw %%mm0, %%mm2 nt" // unpack High bytes of b
- // pbv = p - b = (a + b - c) - b = a - c
- "movq %%mm1, %%mm5 nt"
- // pav = p - a = (a + b - c) - a = b - c
- "movq %%mm2, %%mm4 nt"
- "psubw %%mm3, %%mm5 nt"
- "psubw %%mm3, %%mm4 nt"
- // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
- // pav + pbv = pbv + pav
- "movq %%mm5, %%mm6 nt"
- "paddw %%mm4, %%mm6 nt"
- // pa = abs(p-a) = abs(pav)
- // pb = abs(p-b) = abs(pbv)
- // pc = abs(p-c) = abs(pcv)
- "pcmpgtw %%mm5, %%mm0 nt" // create mask pbv bytes < 0
- "pcmpgtw %%mm4, %%mm7 nt" // create mask pav bytes < 0
- "pand %%mm5, %%mm0 nt" // only pbv bytes < 0 in mm0
- "pand %%mm4, %%mm7 nt" // only pav bytes < 0 in mm7
- "psubw %%mm0, %%mm5 nt"
- "psubw %%mm7, %%mm4 nt"
- "psubw %%mm0, %%mm5 nt"
- "psubw %%mm7, %%mm4 nt"
- "pxor %%mm0, %%mm0 nt"
- "pcmpgtw %%mm6, %%mm0 nt" // create mask pcv bytes < 0
- "pand %%mm6, %%mm0 nt" // only pav bytes < 0 in mm7
- "psubw %%mm0, %%mm6 nt"
- // test pa <= pb
- "movq %%mm4, %%mm7 nt"
- "psubw %%mm0, %%mm6 nt"
- "pcmpgtw %%mm5, %%mm7 nt" // pa > pb?
- "movq %%mm7, %%mm0 nt"
- // use mm7 mask to merge pa & pb
- "pand %%mm7, %%mm5 nt"
- // use mm0 mask copy to merge a & b
- "pand %%mm0, %%mm2 nt"
- "pandn %%mm4, %%mm7 nt"
- "pandn %%mm1, %%mm0 nt"
- "paddw %%mm5, %%mm7 nt"
- "paddw %%mm2, %%mm0 nt"
- // test ((pa <= pb)? pa:pb) <= pc
- "pcmpgtw %%mm6, %%mm7 nt" // pab > pc?
- "movq (%%esi,%%ecx,), %%mm2 nt" // load b=Prior(x)
- "pand %%mm7, %%mm3 nt"
- "pandn %%mm0, %%mm7 nt"
- "pxor %%mm1, %%mm1 nt"
- "paddw %%mm3, %%mm7 nt"
- "pxor %%mm0, %%mm0 nt"
- "packuswb %%mm1, %%mm7 nt"
- "movq %%mm2, %%mm3 nt" // load c=Prior(x-bpp) step 1
- "pand _ActiveMask, %%mm7 nt"
- "punpckhbw %%mm0, %%mm2 nt" // unpack High bytes of b
- "psllq _ShiftBpp, %%mm7 nt" // shift bytes to 2nd group of
- // 3 bytes
- // pav = p - a = (a + b - c) - a = b - c
- "movq %%mm2, %%mm4 nt"
- "paddb (%%edi,%%ecx,), %%mm7 nt" // add Paeth predictor with Raw(x)
- "psllq _ShiftBpp, %%mm3 nt" // load c=Prior(x-bpp) step 2
- "movq %%mm7, (%%edi,%%ecx,) nt" // write back updated value
- "movq %%mm7, %%mm1 nt"
- "punpckhbw %%mm0, %%mm3 nt" // unpack High bytes of c
- "psllq _ShiftBpp, %%mm1 nt" // shift bytes
- // now mm1 will be used as Raw(x-bpp)
- // now do Paeth for 3rd, and final, set of bytes (6-7)
- "pxor %%mm7, %%mm7 nt"
- "punpckhbw %%mm0, %%mm1 nt" // unpack High bytes of a
- "psubw %%mm3, %%mm4 nt"
- // pbv = p - b = (a + b - c) - b = a - c
- "movq %%mm1, %%mm5 nt"
- // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
- "movq %%mm4, %%mm6 nt"
- "psubw %%mm3, %%mm5 nt"
- "pxor %%mm0, %%mm0 nt"
- "paddw %%mm5, %%mm6 nt"
- // pa = abs(p-a) = abs(pav)
- // pb = abs(p-b) = abs(pbv)
- // pc = abs(p-c) = abs(pcv)
- "pcmpgtw %%mm4, %%mm0 nt" // create mask pav bytes < 0
- "pcmpgtw %%mm5, %%mm7 nt" // create mask pbv bytes < 0
- "pand %%mm4, %%mm0 nt" // only pav bytes < 0 in mm7
- "pand %%mm5, %%mm7 nt" // only pbv bytes < 0 in mm0
- "psubw %%mm0, %%mm4 nt"
- "psubw %%mm7, %%mm5 nt"
- "psubw %%mm0, %%mm4 nt"
- "psubw %%mm7, %%mm5 nt"
- "pxor %%mm0, %%mm0 nt"
- "pcmpgtw %%mm6, %%mm0 nt" // create mask pcv bytes < 0
- "pand %%mm6, %%mm0 nt" // only pav bytes < 0 in mm7
- "psubw %%mm0, %%mm6 nt"
- // test pa <= pb
- "movq %%mm4, %%mm7 nt"
- "psubw %%mm0, %%mm6 nt"
- "pcmpgtw %%mm5, %%mm7 nt" // pa > pb?
- "movq %%mm7, %%mm0 nt"
- // use mm0 mask copy to merge a & b
- "pand %%mm0, %%mm2 nt"
- // use mm7 mask to merge pa & pb
- "pand %%mm7, %%mm5 nt"
- "pandn %%mm1, %%mm0 nt"
- "pandn %%mm4, %%mm7 nt"
- "paddw %%mm2, %%mm0 nt"
- "paddw %%mm5, %%mm7 nt"
- // test ((pa <= pb)? pa:pb) <= pc
- "pcmpgtw %%mm6, %%mm7 nt" // pab > pc?
- "pand %%mm7, %%mm3 nt"
- "pandn %%mm0, %%mm7 nt"
- "paddw %%mm3, %%mm7 nt"
- "pxor %%mm1, %%mm1 nt"
- "packuswb %%mm7, %%mm1 nt"
- // step ecx to next set of 8 bytes and repeat loop til done
- "addl $8, %%ecx nt"
- "pand _ActiveMaskEnd, %%mm1 nt"
- "paddb -8(%%edi,%%ecx,), %%mm1 nt" // add Paeth predictor with
- // Raw(x)
- "cmpl _MMXLength, %%ecx nt"
- "pxor %%mm0, %%mm0 nt" // pxor does not affect flags
- "movq %%mm1, -8(%%edi,%%ecx,) nt" // write back updated value
- // mm1 will be used as Raw(x-bpp) next loop
- // mm3 ready to be used as Prior(x-bpp) next loop
- "jb paeth_3lp nt"
- : "=S" (dummy_value_S), // output regs (dummy)
- "=D" (dummy_value_D)
- : "0" (prev_row), // esi // input regs
- "1" (row) // edi
- : "%ecx" // clobber list
- #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
- , "%mm0", "%mm1", "%mm2", "%mm3"
- , "%mm4", "%mm5", "%mm6", "%mm7"
- #endif
- );
- }
- break; // end 3 bpp
- case 6:
- //case 7: // GRR BOGUS
- //case 5: // GRR BOGUS
- {
- _ActiveMask.use = 0x00000000ffffffffLL;
- _ActiveMask2.use = 0xffffffff00000000LL;
- _ShiftBpp.use = bpp << 3; // == bpp * 8
- _ShiftRem.use = 64 - _ShiftBpp.use;
- __asm__ __volatile__ (
- "movl _dif, %%ecx nt"
- // preload "movl row, %%edi nt"
- // preload "movl prev_row, %%esi nt"
- // prime the pump: load the first Raw(x-bpp) data set
- "movq -8(%%edi,%%ecx,), %%mm1 nt"
- "pxor %%mm0, %%mm0 nt"
- "paeth_6lp: nt"
- // must shift to position Raw(x-bpp) data
- "psrlq _ShiftRem, %%mm1 nt"
- // do first set of 4 bytes
- "movq -8(%%esi,%%ecx,), %%mm3 nt" // read c=Prior(x-bpp) bytes
- "punpcklbw %%mm0, %%mm1 nt" // unpack Low bytes of a
- "movq (%%esi,%%ecx,), %%mm2 nt" // load b=Prior(x)
- "punpcklbw %%mm0, %%mm2 nt" // unpack Low bytes of b
- // must shift to position Prior(x-bpp) data
- "psrlq _ShiftRem, %%mm3 nt"
- // pav = p - a = (a + b - c) - a = b - c
- "movq %%mm2, %%mm4 nt"
- "punpcklbw %%mm0, %%mm3 nt" // unpack Low bytes of c
- // pbv = p - b = (a + b - c) - b = a - c
- "movq %%mm1, %%mm5 nt"
- "psubw %%mm3, %%mm4 nt"
- "pxor %%mm7, %%mm7 nt"
- // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
- "movq %%mm4, %%mm6 nt"
- "psubw %%mm3, %%mm5 nt"
- // pa = abs(p-a) = abs(pav)
- // pb = abs(p-b) = abs(pbv)
- // pc = abs(p-c) = abs(pcv)
- "pcmpgtw %%mm4, %%mm0 nt" // create mask pav bytes < 0
- "paddw %%mm5, %%mm6 nt"
- "pand %%mm4, %%mm0 nt" // only pav bytes < 0 in mm7
- "pcmpgtw %%mm5, %%mm7 nt" // create mask pbv bytes < 0
- "psubw %%mm0, %%mm4 nt"
- "pand %%mm5, %%mm7 nt" // only pbv bytes < 0 in mm0
- "psubw %%mm0, %%mm4 nt"
- "psubw %%mm7, %%mm5 nt"
- "pxor %%mm0, %%mm0 nt"
- "pcmpgtw %%mm6, %%mm0 nt" // create mask pcv bytes < 0
- "pand %%mm6, %%mm0 nt" // only pav bytes < 0 in mm7
- "psubw %%mm7, %%mm5 nt"
- "psubw %%mm0, %%mm6 nt"
- // test pa <= pb
- "movq %%mm4, %%mm7 nt"
- "psubw %%mm0, %%mm6 nt"
- "pcmpgtw %%mm5, %%mm7 nt" // pa > pb?
- "movq %%mm7, %%mm0 nt"
- // use mm7 mask to merge pa & pb
- "pand %%mm7, %%mm5 nt"
- // use mm0 mask copy to merge a & b
- "pand %%mm0, %%mm2 nt"
- "pandn %%mm4, %%mm7 nt"
- "pandn %%mm1, %%mm0 nt"
- "paddw %%mm5, %%mm7 nt"
- "paddw %%mm2, %%mm0 nt"
- // test ((pa <= pb)? pa:pb) <= pc
- "pcmpgtw %%mm6, %%mm7 nt" // pab > pc?
- "pxor %%mm1, %%mm1 nt"
- "pand %%mm7, %%mm3 nt"
- "pandn %%mm0, %%mm7 nt"
- "paddw %%mm3, %%mm7 nt"
- "pxor %%mm0, %%mm0 nt"
- "packuswb %%mm1, %%mm7 nt"
- "movq -8(%%esi,%%ecx,), %%mm3 nt" // load c=Prior(x-bpp)
- "pand _ActiveMask, %%mm7 nt"
- "psrlq _ShiftRem, %%mm3 nt"
- "movq (%%esi,%%ecx,), %%mm2 nt" // load b=Prior(x) step 1
- "paddb (%%edi,%%ecx,), %%mm7 nt" // add Paeth predictor and Raw(x)
- "movq %%mm2, %%mm6 nt"
- "movq %%mm7, (%%edi,%%ecx,) nt" // write back updated value
- "movq -8(%%edi,%%ecx,), %%mm1 nt"
- "psllq _ShiftBpp, %%mm6 nt"
- "movq %%mm7, %%mm5 nt"
- "psrlq _ShiftRem, %%mm1 nt"
- "por %%mm6, %%mm3 nt"
- "psllq _ShiftBpp, %%mm5 nt"
- "punpckhbw %%mm0, %%mm3 nt" // unpack High bytes of c
- "por %%mm5, %%mm1 nt"
- // do second set of 4 bytes
- "punpckhbw %%mm0, %%mm2 nt" // unpack High bytes of b
- "punpckhbw %%mm0, %%mm1 nt" // unpack High bytes of a
- // pav = p - a = (a + b - c) - a = b - c
- "movq %%mm2, %%mm4 nt"
- // pbv = p - b = (a + b - c) - b = a - c
- "movq %%mm1, %%mm5 nt"
- "psubw %%mm3, %%mm4 nt"
- "pxor %%mm7, %%mm7 nt"
- // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
- "movq %%mm4, %%mm6 nt"
- "psubw %%mm3, %%mm5 nt"
- // pa = abs(p-a) = abs(pav)
- // pb = abs(p-b) = abs(pbv)
- // pc = abs(p-c) = abs(pcv)
- "pcmpgtw %%mm4, %%mm0 nt" // create mask pav bytes < 0
- "paddw %%mm5, %%mm6 nt"
- "pand %%mm4, %%mm0 nt" // only pav bytes < 0 in mm7
- "pcmpgtw %%mm5, %%mm7 nt" // create mask pbv bytes < 0
- "psubw %%mm0, %%mm4 nt"
- "pand %%mm5, %%mm7 nt" // only pbv bytes < 0 in mm0
- "psubw %%mm0, %%mm4 nt"
- "psubw %%mm7, %%mm5 nt"
- "pxor %%mm0, %%mm0 nt"
- "pcmpgtw %%mm6, %%mm0 nt" // create mask pcv bytes < 0
- "pand %%mm6, %%mm0 nt" // only pav bytes < 0 in mm7
- "psubw %%mm7, %%mm5 nt"
- "psubw %%mm0, %%mm6 nt"
- // test pa <= pb
- "movq %%mm4, %%mm7 nt"
- "psubw %%mm0, %%mm6 nt"
- "pcmpgtw %%mm5, %%mm7 nt" // pa > pb?
- "movq %%mm7, %%mm0 nt"
- // use mm7 mask to merge pa & pb
- "pand %%mm7, %%mm5 nt"
- // use mm0 mask copy to merge a & b
- "pand %%mm0, %%mm2 nt"
- "pandn %%mm4, %%mm7 nt"
- "pandn %%mm1, %%mm0 nt"
- "paddw %%mm5, %%mm7 nt"
- "paddw %%mm2, %%mm0 nt"
- // test ((pa <= pb)? pa:pb) <= pc
- "pcmpgtw %%mm6, %%mm7 nt" // pab > pc?
- "pxor %%mm1, %%mm1 nt"
- "pand %%mm7, %%mm3 nt"
- "pandn %%mm0, %%mm7 nt"
- "pxor %%mm1, %%mm1 nt"
- "paddw %%mm3, %%mm7 nt"
- "pxor %%mm0, %%mm0 nt"
- // step ecx to next set of 8 bytes and repeat loop til done
- "addl $8, %%ecx nt"
- "packuswb %%mm7, %%mm1 nt"
- "paddb -8(%%edi,%%ecx,), %%mm1 nt" // add Paeth predictor with Raw(x)
- "cmpl _MMXLength, %%ecx nt"
- "movq %%mm1, -8(%%edi,%%ecx,) nt" // write back updated value
- // mm1 will be used as Raw(x-bpp) next loop
- "jb paeth_6lp nt"
- : "=S" (dummy_value_S), // output regs (dummy)
- "=D" (dummy_value_D)
- : "0" (prev_row), // esi // input regs
- "1" (row) // edi
- : "%ecx" // clobber list
- #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
- , "%mm0", "%mm1", "%mm2", "%mm3"
- , "%mm4", "%mm5", "%mm6", "%mm7"
- #endif
- );
- }
- break; // end 6 bpp
- case 4:
- {
- _ActiveMask.use = 0x00000000ffffffffLL;
- __asm__ __volatile__ (
- "movl _dif, %%ecx nt"
- // preload "movl row, %%edi nt"
- // preload "movl prev_row, %%esi nt"
- "pxor %%mm0, %%mm0 nt"
- // prime the pump: load the first Raw(x-bpp) data set
- "movq -8(%%edi,%%ecx,), %%mm1 nt" // only time should need to read
- // a=Raw(x-bpp) bytes
- "paeth_4lp: nt"
- // do first set of 4 bytes
- "movq -8(%%esi,%%ecx,), %%mm3 nt" // read c=Prior(x-bpp) bytes
- "punpckhbw %%mm0, %%mm1 nt" // unpack Low bytes of a
- "movq (%%esi,%%ecx,), %%mm2 nt" // load b=Prior(x)
- "punpcklbw %%mm0, %%mm2 nt" // unpack High bytes of b
- // pav = p - a = (a + b - c) - a = b - c
- "movq %%mm2, %%mm4 nt"
- "punpckhbw %%mm0, %%mm3 nt" // unpack High bytes of c
- // pbv = p - b = (a + b - c) - b = a - c
- "movq %%mm1, %%mm5 nt"
- "psubw %%mm3, %%mm4 nt"
- "pxor %%mm7, %%mm7 nt"
- // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
- "movq %%mm4, %%mm6 nt"
- "psubw %%mm3, %%mm5 nt"
- // pa = abs(p-a) = abs(pav)
- // pb = abs(p-b) = abs(pbv)
- // pc = abs(p-c) = abs(pcv)
- "pcmpgtw %%mm4, %%mm0 nt" // create mask pav bytes < 0
- "paddw %%mm5, %%mm6 nt"
- "pand %%mm4, %%mm0 nt" // only pav bytes < 0 in mm7
- "pcmpgtw %%mm5, %%mm7 nt" // create mask pbv bytes < 0
- "psubw %%mm0, %%mm4 nt"
- "pand %%mm5, %%mm7 nt" // only pbv bytes < 0 in mm0
- "psubw %%mm0, %%mm4 nt"
- "psubw %%mm7, %%mm5 nt"
- "pxor %%mm0, %%mm0 nt"
- "pcmpgtw %%mm6, %%mm0 nt" // create mask pcv bytes < 0
- "pand %%mm6, %%mm0 nt" // only pav bytes < 0 in mm7
- "psubw %%mm7, %%mm5 nt"
- "psubw %%mm0, %%mm6 nt"
- // test pa <= pb
- "movq %%mm4, %%mm7 nt"
- "psubw %%mm0, %%mm6 nt"
- "pcmpgtw %%mm5, %%mm7 nt" // pa > pb?
- "movq %%mm7, %%mm0 nt"
- // use mm7 mask to merge pa & pb
- "pand %%mm7, %%mm5 nt"
- // use mm0 mask copy to merge a & b
- "pand %%mm0, %%mm2 nt"
- "pandn %%mm4, %%mm7 nt"
- "pandn %%mm1, %%mm0 nt"
- "paddw %%mm5, %%mm7 nt"
- "paddw %%mm2, %%mm0 nt"
- // test ((pa <= pb)? pa:pb) <= pc
- "pcmpgtw %%mm6, %%mm7 nt" // pab > pc?
- "pxor %%mm1, %%mm1 nt"
- "pand %%mm7, %%mm3 nt"
- "pandn %%mm0, %%mm7 nt"
- "paddw %%mm3, %%mm7 nt"
- "pxor %%mm0, %%mm0 nt"
- "packuswb %%mm1, %%mm7 nt"
- "movq (%%esi,%%ecx,), %%mm3 nt" // load c=Prior(x-bpp)
- "pand _ActiveMask, %%mm7 nt"
- "movq %%mm3, %%mm2 nt" // load b=Prior(x) step 1
- "paddb (%%edi,%%ecx,), %%mm7 nt" // add Paeth predictor with Raw(x)
- "punpcklbw %%mm0, %%mm3 nt" // unpack High bytes of c
- "movq %%mm7, (%%edi,%%ecx,) nt" // write back updated value
- "movq %%mm7, %%mm1 nt" // now mm1 will be used as Raw(x-bpp)
- // do second set of 4 bytes
- "punpckhbw %%mm0, %%mm2 nt" // unpack Low bytes of b
- "punpcklbw %%mm0, %%mm1 nt" // unpack Low bytes of a
- // pav = p - a = (a + b - c) - a = b - c
- "movq %%mm2, %%mm4 nt"
- // pbv = p - b = (a + b - c) - b = a - c
- "movq %%mm1, %%mm5 nt"
- "psubw %%mm3, %%mm4 nt"
- "pxor %%mm7, %%mm7 nt"
- // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
- "movq %%mm4, %%mm6 nt"
- "psubw %%mm3, %%mm5 nt"
- // pa = abs(p-a) = abs(pav)
- // pb = abs(p-b) = abs(pbv)
- // pc = abs(p-c) = abs(pcv)
- "pcmpgtw %%mm4, %%mm0 nt" // create mask pav bytes < 0
- "paddw %%mm5, %%mm6 nt"
- "pand %%mm4, %%mm0 nt" // only pav bytes < 0 in mm7
- "pcmpgtw %%mm5, %%mm7 nt" // create mask pbv bytes < 0
- "psubw %%mm0, %%mm4 nt"
- "pand %%mm5, %%mm7 nt" // only pbv bytes < 0 in mm0
- "psubw %%mm0, %%mm4 nt"
- "psubw %%mm7, %%mm5 nt"
- "pxor %%mm0, %%mm0 nt"
- "pcmpgtw %%mm6, %%mm0 nt" // create mask pcv bytes < 0
- "pand %%mm6, %%mm0 nt" // only pav bytes < 0 in mm7
- "psubw %%mm7, %%mm5 nt"
- "psubw %%mm0, %%mm6 nt"
- // test pa <= pb
- "movq %%mm4, %%mm7 nt"
- "psubw %%mm0, %%mm6 nt"
- "pcmpgtw %%mm5, %%mm7 nt" // pa > pb?
- "movq %%mm7, %%mm0 nt"
- // use mm7 mask to merge pa & pb
- "pand %%mm7, %%mm5 nt"
- // use mm0 mask copy to merge a & b
- "pand %%mm0, %%mm2 nt"
- "pandn %%mm4, %%mm7 nt"
- "pandn %%mm1, %%mm0 nt"
- "paddw %%mm5, %%mm7 nt"
- "paddw %%mm2, %%mm0 nt"
- // test ((pa <= pb)? pa:pb) <= pc
- "pcmpgtw %%mm6, %%mm7 nt" // pab > pc?
- "pxor %%mm1, %%mm1 nt"
- "pand %%mm7, %%mm3 nt"
- "pandn %%mm0, %%mm7 nt"
- "pxor %%mm1, %%mm1 nt"
- "paddw %%mm3, %%mm7 nt"
- "pxor %%mm0, %%mm0 nt"
- // step ecx to next set of 8 bytes and repeat loop til done
- "addl $8, %%ecx nt"
- "packuswb %%mm7, %%mm1 nt"
- "paddb -8(%%edi,%%ecx,), %%mm1 nt" // add predictor with Raw(x)
- "cmpl _MMXLength, %%ecx nt"
- "movq %%mm1, -8(%%edi,%%ecx,) nt" // write back updated value
- // mm1 will be used as Raw(x-bpp) next loop
- "jb paeth_4lp nt"
- : "=S" (dummy_value_S), // output regs (dummy)
- "=D" (dummy_value_D)
- : "0" (prev_row), // esi // input regs
- "1" (row) // edi
- : "%ecx" // clobber list
- #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
- , "%mm0", "%mm1", "%mm2", "%mm3"
- , "%mm4", "%mm5", "%mm6", "%mm7"
- #endif
- );
- }
- break; // end 4 bpp
- case 8: // bpp == 8
- {
- _ActiveMask.use = 0x00000000ffffffffLL;
- __asm__ __volatile__ (
- "movl _dif, %%ecx nt"
- // preload "movl row, %%edi nt"
- // preload "movl prev_row, %%esi nt"
- "pxor %%mm0, %%mm0 nt"
- // prime the pump: load the first Raw(x-bpp) data set
- "movq -8(%%edi,%%ecx,), %%mm1 nt" // only time should need to read
- // a=Raw(x-bpp) bytes
- "paeth_8lp: nt"
- // do first set of 4 bytes
- "movq -8(%%esi,%%ecx,), %%mm3 nt" // read c=Prior(x-bpp) bytes
- "punpcklbw %%mm0, %%mm1 nt" // unpack Low bytes of a
- "movq (%%esi,%%ecx,), %%mm2 nt" // load b=Prior(x)
- "punpcklbw %%mm0, %%mm2 nt" // unpack Low bytes of b
- // pav = p - a = (a + b - c) - a = b - c
- "movq %%mm2, %%mm4 nt"
- "punpcklbw %%mm0, %%mm3 nt" // unpack Low bytes of c
- // pbv = p - b = (a + b - c) - b = a - c
- "movq %%mm1, %%mm5 nt"
- "psubw %%mm3, %%mm4 nt"
- "pxor %%mm7, %%mm7 nt"
- // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
- "movq %%mm4, %%mm6 nt"
- "psubw %%mm3, %%mm5 nt"
- // pa = abs(p-a) = abs(pav)
- // pb = abs(p-b) = abs(pbv)
- // pc = abs(p-c) = abs(pcv)
- "pcmpgtw %%mm4, %%mm0 nt" // create mask pav bytes < 0
- "paddw %%mm5, %%mm6 nt"
- "pand %%mm4, %%mm0 nt" // only pav bytes < 0 in mm7
- "pcmpgtw %%mm5, %%mm7 nt" // create mask pbv bytes < 0
- "psubw %%mm0, %%mm4 nt"
- "pand %%mm5, %%mm7 nt" // only pbv bytes < 0 in mm0
- "psubw %%mm0, %%mm4 nt"
- "psubw %%mm7, %%mm5 nt"
- "pxor %%mm0, %%mm0 nt"
- "pcmpgtw %%mm6, %%mm0 nt" // create mask pcv bytes < 0
- "pand %%mm6, %%mm0 nt" // only pav bytes < 0 in mm7
- "psubw %%mm7, %%mm5 nt"
- "psubw %%mm0, %%mm6 nt"
- // test pa <= pb
- "movq %%mm4, %%mm7 nt"
- "psubw %%mm0, %%mm6 nt"
- "pcmpgtw %%mm5, %%mm7 nt" // pa > pb?
- "movq %%mm7, %%mm0 nt"
- // use mm7 mask to merge pa & pb
- "pand %%mm7, %%mm5 nt"
- // use mm0 mask copy to merge a & b
- "pand %%mm0, %%mm2 nt"
- "pandn %%mm4, %%mm7 nt"
- "pandn %%mm1, %%mm0 nt"
- "paddw %%mm5, %%mm7 nt"
- "paddw %%mm2, %%mm0 nt"
- // test ((pa <= pb)? pa:pb) <= pc
- "pcmpgtw %%mm6, %%mm7 nt" // pab > pc?
- "pxor %%mm1, %%mm1 nt"
- "pand %%mm7, %%mm3 nt"
- "pandn %%mm0, %%mm7 nt"
- "paddw %%mm3, %%mm7 nt"
- "pxor %%mm0, %%mm0 nt"
- "packuswb %%mm1, %%mm7 nt"
- "movq -8(%%esi,%%ecx,), %%mm3 nt" // read c=Prior(x-bpp) bytes
- "pand _ActiveMask, %%mm7 nt"
- "movq (%%esi,%%ecx,), %%mm2 nt" // load b=Prior(x)
- "paddb (%%edi,%%ecx,), %%mm7 nt" // add Paeth predictor with Raw(x)
- "punpckhbw %%mm0, %%mm3 nt" // unpack High bytes of c
- "movq %%mm7, (%%edi,%%ecx,) nt" // write back updated value
- "movq -8(%%edi,%%ecx,), %%mm1 nt" // read a=Raw(x-bpp) bytes
- // do second set of 4 bytes
- "punpckhbw %%mm0, %%mm2 nt" // unpack High bytes of b
- "punpckhbw %%mm0, %%mm1 nt" // unpack High bytes of a
- // pav = p - a = (a + b - c) - a = b - c
- "movq %%mm2, %%mm4 nt"
- // pbv = p - b = (a + b - c) - b = a - c
- "movq %%mm1, %%mm5 nt"
- "psubw %%mm3, %%mm4 nt"
- "pxor %%mm7, %%mm7 nt"
- // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
- "movq %%mm4, %%mm6 nt"
- "psubw %%mm3, %%mm5 nt"
- // pa = abs(p-a) = abs(pav)
- // pb = abs(p-b) = abs(pbv)
- // pc = abs(p-c) = abs(pcv)
- "pcmpgtw %%mm4, %%mm0 nt" // create mask pav bytes < 0
- "paddw %%mm5, %%mm6 nt"
- "pand %%mm4, %%mm0 nt" // only pav bytes < 0 in mm7
- "pcmpgtw %%mm5, %%mm7 nt" // create mask pbv bytes < 0
- "psubw %%mm0, %%mm4 nt"
- "pand %%mm5, %%mm7 nt" // only pbv bytes < 0 in mm0
- "psubw %%mm0, %%mm4 nt"
- "psubw %%mm7, %%mm5 nt"
- "pxor %%mm0, %%mm0 nt"
- "pcmpgtw %%mm6, %%mm0 nt" // create mask pcv bytes < 0
- "pand %%mm6, %%mm0 nt" // only pav bytes < 0 in mm7
- "psubw %%mm7, %%mm5 nt"
- "psubw %%mm0, %%mm6 nt"
- // test pa <= pb
- "movq %%mm4, %%mm7 nt"
- "psubw %%mm0, %%mm6 nt"
- "pcmpgtw %%mm5, %%mm7 nt" // pa > pb?
- "movq %%mm7, %%mm0 nt"
- // use mm7 mask to merge pa & pb
- "pand %%mm7, %%mm5 nt"
- // use mm0 mask copy to merge a & b
- "pand %%mm0, %%mm2 nt"
- "pandn %%mm4, %%mm7 nt"
- "pandn %%mm1, %%mm0 nt"
- "paddw %%mm5, %%mm7 nt"
- "paddw %%mm2, %%mm0 nt"
- // test ((pa <= pb)? pa:pb) <= pc
- "pcmpgtw %%mm6, %%mm7 nt" // pab > pc?
- "pxor %%mm1, %%mm1 nt"
- "pand %%mm7, %%mm3 nt"
- "pandn %%mm0, %%mm7 nt"
- "pxor %%mm1, %%mm1 nt"
- "paddw %%mm3, %%mm7 nt"
- "pxor %%mm0, %%mm0 nt"
- // step ecx to next set of 8 bytes and repeat loop til done
- "addl $8, %%ecx nt"
- "packuswb %%mm7, %%mm1 nt"
- "paddb -8(%%edi,%%ecx,), %%mm1 nt" // add Paeth predictor with Raw(x)
- "cmpl _MMXLength, %%ecx nt"
- "movq %%mm1, -8(%%edi,%%ecx,) nt" // write back updated value
- // mm1 will be used as Raw(x-bpp) next loop
- "jb paeth_8lp nt"
- : "=S" (dummy_value_S), // output regs (dummy)
- "=D" (dummy_value_D)
- : "0" (prev_row), // esi // input regs
- "1" (row) // edi
- : "%ecx" // clobber list
- #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
- , "%mm0", "%mm1", "%mm2", "%mm3"
- , "%mm4", "%mm5", "%mm6", "%mm7"
- #endif
- );
- }
- break; // end 8 bpp
- case 1: // bpp = 1
- case 2: // bpp = 2
- default: // bpp > 8
- {
- __asm__ __volatile__ (
- #ifdef __PIC__
- "pushl %%ebx \n\t" // save Global Offset Table index
- #endif
- "movl _dif, %%ebx \n\t"
- "cmpl _FullLength, %%ebx \n\t"
- "jnb paeth_dend \n\t"
- // preload "movl row, %%edi \n\t"
- // preload "movl prev_row, %%esi \n\t"
- // do Paeth decode for remaining bytes
- "movl %%ebx, %%edx \n\t"
- // preload "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
- "subl %%ecx, %%edx \n\t" // edx = ebx - bpp
- "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx
- "paeth_dlp: \n\t"
- "xorl %%eax, %%eax \n\t"
- // pav = p - a = (a + b - c) - a = b - c
- "movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
- "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
- "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
- "movl %%eax, _patemp \n\t" // Save pav for later use
- "xorl %%eax, %%eax \n\t"
- // pbv = p - b = (a + b - c) - b = a - c
- "movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
- "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
- "movl %%eax, %%ecx \n\t"
- // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
- "addl _patemp, %%eax \n\t" // pcv = pav + pbv
- // pc = abs(pcv)
- "testl $0x80000000, %%eax \n\t"
- "jz paeth_dpca \n\t"
- "negl %%eax \n\t" // reverse sign of neg values
- "paeth_dpca: \n\t"
- "movl %%eax, _pctemp \n\t" // save pc for later use
- // pb = abs(pbv)
- "testl $0x80000000, %%ecx \n\t"
- "jz paeth_dpba \n\t"
- "negl %%ecx \n\t" // reverse sign of neg values
- "paeth_dpba: \n\t"
- "movl %%ecx, _pbtemp \n\t" // save pb for later use
- // pa = abs(pav)
- "movl _patemp, %%eax \n\t"
- "testl $0x80000000, %%eax \n\t"
- "jz paeth_dpaa \n\t"
- "negl %%eax \n\t" // reverse sign of neg values
- "paeth_dpaa: \n\t"
- "movl %%eax, _patemp \n\t" // save pa for later use
- // test if pa <= pb
- "cmpl %%ecx, %%eax \n\t"
- "jna paeth_dabb \n\t"
- // pa > pb; now test if pb <= pc
- "cmpl _pctemp, %%ecx \n\t"
- "jna paeth_dbbc \n\t"
- // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
- "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
- "jmp paeth_dpaeth \n\t"
- "paeth_dbbc: \n\t"
- // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
- "movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
- "jmp paeth_dpaeth \n\t"
- "paeth_dabb: \n\t"
- // pa <= pb; now test if pa <= pc
- "cmpl _pctemp, %%eax \n\t"
- "jna paeth_dabc \n\t"
- // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
- "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
- "jmp paeth_dpaeth \n\t"
- "paeth_dabc: \n\t"
- // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
- "movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
- "paeth_dpaeth: \n\t"
- "incl %%ebx \n\t"
- "incl %%edx \n\t"
- // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
- "addb %%cl, -1(%%edi,%%ebx,) \n\t"
- "cmpl _FullLength, %%ebx \n\t"
- "jb paeth_dlp \n\t"
- "paeth_dend: \n\t"
- #ifdef __PIC__
- "popl %%ebx \n\t" // index to Global Offset Table
- #endif
- : "=c" (dummy_value_c), // output regs (dummy)
- "=S" (dummy_value_S),
- "=D" (dummy_value_D)
- : "0" (bpp), // ecx // input regs
- "1" (prev_row), // esi
- "2" (row) // edi
- : "%eax", "%edx" // clobber list
- #ifndef __PIC__
- , "%ebx"
- #endif
- );
- }
- return; // No need to go further with this one
- } // end switch (bpp)
- __asm__ __volatile__ (
- // MMX acceleration complete; now do clean-up
- // check if any remaining bytes left to decode
- #ifdef __PIC__
- "pushl %%ebx \n\t" // save index to Global Offset Table
- #endif
- "movl _MMXLength, %%ebx \n\t"
- "cmpl _FullLength, %%ebx \n\t"
- "jnb paeth_end \n\t"
- //pre "movl row, %%edi \n\t"
- //pre "movl prev_row, %%esi \n\t"
- // do Paeth decode for remaining bytes
- "movl %%ebx, %%edx \n\t"
- //pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
- "subl %%ecx, %%edx \n\t" // edx = ebx - bpp
- "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx below
- "paeth_lp2: \n\t"
- "xorl %%eax, %%eax \n\t"
- // pav = p - a = (a + b - c) - a = b - c
- "movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
- "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
- "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
- "movl %%eax, _patemp \n\t" // Save pav for later use
- "xorl %%eax, %%eax \n\t"
- // pbv = p - b = (a + b - c) - b = a - c
- "movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
- "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
- "movl %%eax, %%ecx \n\t"
- // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
- "addl _patemp, %%eax \n\t" // pcv = pav + pbv
- // pc = abs(pcv)
- "testl $0x80000000, %%eax \n\t"
- "jz paeth_pca2 \n\t"
- "negl %%eax \n\t" // reverse sign of neg values
- "paeth_pca2: \n\t"
- "movl %%eax, _pctemp \n\t" // save pc for later use
- // pb = abs(pbv)
- "testl $0x80000000, %%ecx \n\t"
- "jz paeth_pba2 \n\t"
- "negl %%ecx \n\t" // reverse sign of neg values
- "paeth_pba2: \n\t"
- "movl %%ecx, _pbtemp \n\t" // save pb for later use
- // pa = abs(pav)
- "movl _patemp, %%eax \n\t"
- "testl $0x80000000, %%eax \n\t"
- "jz paeth_paa2 \n\t"
- "negl %%eax \n\t" // reverse sign of neg values
- "paeth_paa2: \n\t"
- "movl %%eax, _patemp \n\t" // save pa for later use
- // test if pa <= pb
- "cmpl %%ecx, %%eax \n\t"
- "jna paeth_abb2 \n\t"
- // pa > pb; now test if pb <= pc
- "cmpl _pctemp, %%ecx \n\t"
- "jna paeth_bbc2 \n\t"
- // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
- "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
- "jmp paeth_paeth2 \n\t"
- "paeth_bbc2: \n\t"
- // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
- "movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
- "jmp paeth_paeth2 \n\t"
- "paeth_abb2: \n\t"
- // pa <= pb; now test if pa <= pc
- "cmpl _pctemp, %%eax \n\t"
- "jna paeth_abc2 \n\t"
- // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
- "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
- "jmp paeth_paeth2 \n\t"
- "paeth_abc2: \n\t"
- // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
- "movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
- "paeth_paeth2: \n\t"
- "incl %%ebx \n\t"
- "incl %%edx \n\t"
- // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
- "addb %%cl, -1(%%edi,%%ebx,) \n\t"
- "cmpl _FullLength, %%ebx \n\t"
- "jb paeth_lp2 \n\t"
- "paeth_end: \n\t"
- "EMMS \n\t" // end MMX; prep for poss. FP instrs.
- #ifdef __PIC__
- "popl %%ebx \n\t" // restore index to Global Offset Table
- #endif
- : "=c" (dummy_value_c), // output regs (dummy)
- "=S" (dummy_value_S),
- "=D" (dummy_value_D)
- : "0" (bpp), // ecx // input regs
- "1" (prev_row), // esi
- "2" (row) // edi
- : "%eax", "%edx" // clobber list (no input regs!)
- #ifndef __PIC__
- , "%ebx"
- #endif
- );
- } /* end png_read_filter_row_mmx_paeth() */
- #endif
- #ifdef PNG_THREAD_UNSAFE_OK
- //===========================================================================//
- // //
- // P N G _ R E A D _ F I L T E R _ R O W _ M M X _ S U B //
- // //
- //===========================================================================//
- // Optimized code for PNG Sub filter decoder
- static void /* PRIVATE */
- png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
- {
- int bpp;
- int dummy_value_a;
- int dummy_value_D;
- bpp = (row_info->pixel_depth + 7) >> 3; // calc number of bytes per pixel
- _FullLength = row_info->rowbytes - bpp; // number of bytes to filter
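- // Sub filtering is the recurrence Raw(x) = Sub(x) + Raw(x-bpp): each byte
- // gets the already-reconstructed byte one pixel to its left added to it,
- // and the first bpp bytes of the row are final as stored.  An illustrative
- // sketch of the whole row (never compiled; the MMX code below does the
- // same work eight bytes at a time):
- #if 0
-     {
-        png_uint_32 x;
-        for (x = 0; x < _FullLength; x++)   // _FullLength == rowbytes - bpp
-           row[bpp + x] = (png_byte)((row[bpp + x] + row[x]) & 0xff);
-     }
- #endif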
- __asm__ __volatile__ (
- //pre "movl row, %%edi nt"
- "movl %%edi, %%esi nt" // lp = row
- //pre "movl bpp, %%eax nt"
- "addl %%eax, %%edi nt" // rp = row + bpp
- //irr "xorl %%eax, %%eax nt"
- // get # of bytes to alignment
- "movl %%edi, _dif nt" // take start of row
- "addl $0xf, _dif nt" // add 7 + 8 to incr past
- // alignment boundary
- "xorl %%ecx, %%ecx nt"
- "andl $0xfffffff8, _dif nt" // mask to alignment boundary
- "subl %%edi, _dif nt" // subtract from start ==> value
- "jz sub_go nt" // ecx at alignment
- "sub_lp1: nt" // fix alignment
- "movb (%%esi,%%ecx,), %%al nt"
- "addb %%al, (%%edi,%%ecx,) nt"
- "incl %%ecx nt"
- "cmpl _dif, %%ecx nt"
- "jb sub_lp1 nt"
- "sub_go: nt"
- "movl _FullLength, %%eax nt"
- "movl %%eax, %%edx nt"
- "subl %%ecx, %%edx nt" // subtract alignment fix
- "andl $0x00000007, %%edx nt" // calc bytes over mult of 8
- "subl %%edx, %%eax nt" // drop over bytes from length
- "movl %%eax, _MMXLength nt"
- : "=a" (dummy_value_a), // 0 // output regs (dummy)
- "=D" (dummy_value_D) // 1
- : "0" (bpp), // eax // input regs
- "1" (row) // edi
- : "%esi", "%ecx", "%edx" // clobber list
- #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
- , "%mm0", "%mm1", "%mm2", "%mm3"
- , "%mm4", "%mm5", "%mm6", "%mm7"
- #endif
- );
- // now do the math for the rest of the row
- switch (bpp)
- {
- case 3:
- {
- _ActiveMask.use = 0x0000ffffff000000LL;
- _ShiftBpp.use = 24; // == 3 * 8
- _ShiftRem.use = 40; // == 64 - 24
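- // With bpp == 3, one quadword spans parts of three pixels, so the left-
- // neighbor sum is propagated inside the register: shift left one pixel
- // (24 bits), mask to the next byte group, and paddb -- twice per quadword,
- // after first adding in the tail of the previous quadword (psrlq by 40).
- // Serial model of one sub_3lp iteration (an illustrative sketch, never
- // compiled; q[] is the current quadword, prev[] the one just written --
- // note paddb adds each byte independently, with no inter-byte carry):
- #if 0
-     {
-        unsigned char q[8], prev[8];
-        int i;
-        for (i = 0; i < 8; i++)
-        {
-           unsigned char left = (i >= 3)? q[i - 3] : prev[i + 5];
-           q[i] = (unsigned char)(q[i] + left);
-        }
-     }
- #endif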
- __asm__ __volatile__ (
- // preload "movl row, %%edi nt"
- "movq _ActiveMask, %%mm7 nt" // load _ActiveMask for 2nd
- // active byte group
- "movl %%edi, %%esi nt" // lp = row
- // preload "movl bpp, %%eax nt"
- "addl %%eax, %%edi nt" // rp = row + bpp
- "movq %%mm7, %%mm6 nt"
- "movl _dif, %%edx nt"
- "psllq _ShiftBpp, %%mm6 nt" // move mask in mm6 to cover
- // 3rd active byte group
- // prime the pump: load the first Raw(x-bpp) data set
- "movq -8(%%edi,%%edx,), %%mm1 nt"
- "sub_3lp: nt" // shift data for adding first
- "psrlq _ShiftRem, %%mm1 nt" // bpp bytes (no need for mask;
- // shift clears inactive bytes)
- // add 1st active group
- "movq (%%edi,%%edx,), %%mm0 nt"
- "paddb %%mm1, %%mm0 nt"
- // add 2nd active group
- "movq %%mm0, %%mm1 nt" // mov updated Raws to mm1
- "psllq _ShiftBpp, %%mm1 nt" // shift data to pos. correctly
- "pand %%mm7, %%mm1 nt" // mask to use 2nd active group
- "paddb %%mm1, %%mm0 nt"
- // add 3rd active group
- "movq %%mm0, %%mm1 nt" // mov updated Raws to mm1
- "psllq _ShiftBpp, %%mm1 nt" // shift data to pos. correctly
- "pand %%mm6, %%mm1 nt" // mask to use 3rd active group
- "addl $8, %%edx nt"
- "paddb %%mm1, %%mm0 nt"
- "cmpl _MMXLength, %%edx nt"
- "movq %%mm0, -8(%%edi,%%edx,) nt" // write updated Raws to array
- "movq %%mm0, %%mm1 nt" // prep 1st add at top of loop
- "jb sub_3lp nt"
- : "=a" (dummy_value_a), // 0 // output regs (dummy)
- "=D" (dummy_value_D) // 1
- : "0" (bpp), // eax // input regs
- "1" (row) // edi
- : "%edx", "%esi" // clobber list
- #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
- , "%mm0", "%mm1", "%mm6", "%mm7"
- #endif
- );
- }
- break;
- case 1:
- {
- __asm__ __volatile__ (
- "movl _dif, %%edx nt"
- // preload "movl row, %%edi nt"
- "cmpl _FullLength, %%edx nt"
- "jnb sub_1end nt"
- "movl %%edi, %%esi nt" // lp = row
- "xorl %%eax, %%eax nt"
- // preload "movl bpp, %%eax nt"
- "addl %%eax, %%edi nt" // rp = row + bpp
- "sub_1lp: nt"
- "movb (%%esi,%%edx,), %%al nt"
- "addb %%al, (%%edi,%%edx,) nt"
- "incl %%edx nt"
- "cmpl _FullLength, %%edx nt"
- "jb sub_1lp nt"
- "sub_1end: nt"
- : "=a" (dummy_value_a), // 0 // output regs (dummy)
- "=D" (dummy_value_D) // 1
- : "0" (bpp), // eax // input regs
- "1" (row) // edi
- : "%edx", "%esi" // clobber list
- );
- }
- return;
- case 6:
- case 4:
- //case 7: // GRR BOGUS
- //case 5: // GRR BOGUS
- {
- _ShiftBpp.use = bpp << 3;
- _ShiftRem.use = 64 - _ShiftBpp.use;
- __asm__ __volatile__ (
- // preload "movl row, %%edi nt"
- "movl _dif, %%edx nt"
- "movl %%edi, %%esi nt" // lp = row
- // preload "movl bpp, %%eax nt"
- "addl %%eax, %%edi nt" // rp = row + bpp
- // prime the pump: load the first Raw(x-bpp) data set
- "movq -8(%%edi,%%edx,), %%mm1 nt"
- "sub_4lp: nt" // shift data for adding first
- "psrlq _ShiftRem, %%mm1 nt" // bpp bytes (no need for mask;
- // shift clears inactive bytes)
- "movq (%%edi,%%edx,), %%mm0 nt"
- "paddb %%mm1, %%mm0 nt"
- // add 2nd active group
- "movq %%mm0, %%mm1 nt" // mov updated Raws to mm1
- "psllq _ShiftBpp, %%mm1 nt" // shift data to pos. correctly
- "addl $8, %%edx nt"
- "paddb %%mm1, %%mm0 nt"
- "cmpl _MMXLength, %%edx nt"
- "movq %%mm0, -8(%%edi,%%edx,) nt"
- "movq %%mm0, %%mm1 nt" // prep 1st add at top of loop
- "jb sub_4lp nt"
- : "=a" (dummy_value_a), // 0 // output regs (dummy)
- "=D" (dummy_value_D) // 1
- : "0" (bpp), // eax // input regs
- "1" (row) // edi
- : "%edx", "%esi" // clobber list
- #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
- , "%mm0", "%mm1"
- #endif
- );
- }
- break;
- case 2:
- {
- _ActiveMask.use = 0x00000000ffff0000LL;
- _ShiftBpp.use = 16; // == 2 * 8
- _ShiftRem.use = 48; // == 64 - 16
- __asm__ __volatile__ (
- "movq _ActiveMask, %%mm7 nt" // load _ActiveMask for 2nd
- // active byte group
- "movl _dif, %%edx nt"
- "movq %%mm7, %%mm6 nt"
- // preload "movl row, %%edi nt"
- "psllq _ShiftBpp, %%mm6 nt" // move mask in mm6 to cover
- // 3rd active byte group
- "movl %%edi, %%esi nt" // lp = row
- "movq %%mm6, %%mm5 nt"
- // preload "movl bpp, %%eax nt"
- "addl %%eax, %%edi nt" // rp = row + bpp
- "psllq _ShiftBpp, %%mm5 nt" // move mask in mm5 to cover
- // 4th active byte group
- // prime the pump: load the first Raw(x-bpp) data set
- "movq -8(%%edi,%%edx,), %%mm1 nt"
- "sub_2lp: nt" // shift data for adding first
- "psrlq _ShiftRem, %%mm1 nt" // bpp bytes (no need for mask;
- // shift clears inactive bytes)
- // add 1st active group
- "movq (%%edi,%%edx,), %%mm0 nt"
- "paddb %%mm1, %%mm0 nt"
- // add 2nd active group
- "movq %%mm0, %%mm1 nt" // mov updated Raws to mm1
- "psllq _ShiftBpp, %%mm1 nt" // shift data to pos. correctly
- "pand %%mm7, %%mm1 nt" // mask to use 2nd active group
- "paddb %%mm1, %%mm0 nt"
- // add 3rd active group
- "movq %%mm0, %%mm1 nt" // mov updated Raws to mm1
- "psllq _ShiftBpp, %%mm1 nt" // shift data to pos. correctly
- "pand %%mm6, %%mm1 nt" // mask to use 3rd active group
- "paddb %%mm1, %%mm0 nt"
- // add 4th active group
- "movq %%mm0, %%mm1 nt" // mov updated Raws to mm1
- "psllq _ShiftBpp, %%mm1 nt" // shift data to pos. correctly
- "pand %%mm5, %%mm1 nt" // mask to use 4th active group
- "addl $8, %%edx nt"
- "paddb %%mm1, %%mm0 nt"
- "cmpl _MMXLength, %%edx nt"
- "movq %%mm0, -8(%%edi,%%edx,) nt" // write updated Raws to array
- "movq %%mm0, %%mm1 nt" // prep 1st add at top of loop
- "jb sub_2lp nt"
- : "=a" (dummy_value_a), // 0 // output regs (dummy)
- "=D" (dummy_value_D) // 1
- : "0" (bpp), // eax // input regs
- "1" (row) // edi
- : "%edx", "%esi" // clobber list
- #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
- , "%mm0", "%mm1", "%mm5", "%mm6", "%mm7"
- #endif
- );
- }
- break;
- case 8:
- {
- __asm__ __volatile__ (
- // preload "movl row, %%edi nt"
- "movl _dif, %%edx nt"
- "movl %%edi, %%esi nt" // lp = row
- // preload "movl bpp, %%eax nt"
- "addl %%eax, %%edi nt" // rp = row + bpp
- "movl _MMXLength, %%ecx nt"
- // prime the pump: load the first Raw(x-bpp) data set
- "movq -8(%%edi,%%edx,), %%mm7 nt"
- "andl $0x0000003f, %%ecx nt" // calc bytes over mult of 64
- "sub_8lp: nt"
- "movq (%%edi,%%edx,), %%mm0 nt" // load Sub(x) for 1st 8 bytes
- "paddb %%mm7, %%mm0 nt"
- "movq 8(%%edi,%%edx,), %%mm1 nt" // load Sub(x) for 2nd 8 bytes
- "movq %%mm0, (%%edi,%%edx,) nt" // write Raw(x) for 1st 8 bytes
- // Now mm0 will be used as Raw(x-bpp) for the 2nd group of 8 bytes.
- // This will be repeated for each group of 8 bytes with the 8th
- // group being used as the Raw(x-bpp) for the 1st group of the
- // next loop.
- "paddb %%mm0, %%mm1 nt"
- "movq 16(%%edi,%%edx,), %%mm2 nt" // load Sub(x) for 3rd 8 bytes
- "movq %%mm1, 8(%%edi,%%edx,) nt" // write Raw(x) for 2nd 8 bytes
- "paddb %%mm1, %%mm2 nt"
- "movq 24(%%edi,%%edx,), %%mm3 nt" // load Sub(x) for 4th 8 bytes
- "movq %%mm2, 16(%%edi,%%edx,) nt" // write Raw(x) for 3rd 8 bytes
- "paddb %%mm2, %%mm3 nt"
- "movq 32(%%edi,%%edx,), %%mm4 nt" // load Sub(x) for 5th 8 bytes
- "movq %%mm3, 24(%%edi,%%edx,) nt" // write Raw(x) for 4th 8 bytes
- "paddb %%mm3, %%mm4 nt"
- "movq 40(%%edi,%%edx,), %%mm5 nt" // load Sub(x) for 6th 8 bytes
- "movq %%mm4, 32(%%edi,%%edx,) nt" // write Raw(x) for 5th 8 bytes
- "paddb %%mm4, %%mm5 nt"
- "movq 48(%%edi,%%edx,), %%mm6 nt" // load Sub(x) for 7th 8 bytes
- "movq %%mm5, 40(%%edi,%%edx,) nt" // write Raw(x) for 6th 8 bytes
- "paddb %%mm5, %%mm6 nt"
- "movq 56(%%edi,%%edx,), %%mm7 nt" // load Sub(x) for 8th 8 bytes
- "movq %%mm6, 48(%%edi,%%edx,) nt" // write Raw(x) for 7th 8 bytes
- "addl $64, %%edx nt"
- "paddb %%mm6, %%mm7 nt"
- "cmpl %%ecx, %%edx nt"
- "movq %%mm7, -8(%%edi,%%edx,) nt" // write Raw(x) for 8th 8 bytes
- "jb sub_8lp nt"
- "cmpl _MMXLength, %%edx nt"
- "jnb sub_8lt8 nt"
- "sub_8lpA: nt"
- "movq (%%edi,%%edx,), %%mm0 nt"
- "addl $8, %%edx nt"
- "paddb %%mm7, %%mm0 nt"
- "cmpl _MMXLength, %%edx nt"
- "movq %%mm0, -8(%%edi,%%edx,) nt" // -8 to offset early addl edx
- "movq %%mm0, %%mm7 nt" // move calculated Raw(x) data
- // to mm1 to be new Raw(x-bpp)
- // for next loop
- "jb sub_8lpA nt"
- "sub_8lt8: nt"
- : "=a" (dummy_value_a), // 0 // output regs (dummy)
- "=D" (dummy_value_D) // 1
- : "0" (bpp), // eax // input regs
- "1" (row) // edi
- : "%ecx", "%edx", "%esi" // clobber list
- #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
- , "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"
- #endif
- );
- }
- break;
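- // The case-8 loop works because with bpp == 8 each quadword is exactly
- // one pixel, so Raw(x-bpp) for a group is simply the previous group's
- // result; the movq/paddb chain just keeps that running quadword in a
- // register across the unrolled body.  Serial equivalent (an illustrative
- // sketch, never compiled):
- #if 0
-     {
-        png_uint_32 x;
-        for (x = 8; x < row_info->rowbytes; x++)
-           row[x] = (png_byte)((row[x] + row[x - 8]) & 0xff);
-     }
- #endif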
- default: // bpp greater than 8 bytes GRR BOGUS
- {
- __asm__ __volatile__ (
- "movl _dif, %%edx nt"
- // preload "movl row, %%edi nt"
- "movl %%edi, %%esi nt" // lp = row
- // preload "movl bpp, %%eax nt"
- "addl %%eax, %%edi nt" // rp = row + bpp
- "sub_Alp: nt"
- "movq (%%edi,%%edx,), %%mm0 nt"
- "movq (%%esi,%%edx,), %%mm1 nt"
- "addl $8, %%edx nt"
- "paddb %%mm1, %%mm0 nt"
- "cmpl _MMXLength, %%edx nt"
- "movq %%mm0, -8(%%edi,%%edx,) nt" // mov does not affect flags;
- // -8 to offset addl edx
- "jb sub_Alp nt"
- : "=a" (dummy_value_a), // 0 // output regs (dummy)
- "=D" (dummy_value_D) // 1
- : "0" (bpp), // eax // input regs
- "1" (row) // edi
- : "%edx", "%esi" // clobber list
- #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
- , "%mm0", "%mm1"
- #endif
- );
- }
- break;
- } // end switch (bpp)
- __asm__ __volatile__ (
- "movl _MMXLength, %%edx nt"
- //pre "movl row, %%edi nt"
- "cmpl _FullLength, %%edx nt"
- "jnb sub_end nt"
- "movl %%edi, %%esi nt" // lp = row
- //pre "movl bpp, %%eax nt"
- "addl %%eax, %%edi nt" // rp = row + bpp
- "xorl %%eax, %%eax nt"
- "sub_lp2: nt"
- "movb (%%esi,%%edx,), %%al nt"
- "addb %%al, (%%edi,%%edx,) nt"
- "incl %%edx nt"
- "cmpl _FullLength, %%edx nt"
- "jb sub_lp2 nt"
- "sub_end: nt"
- "EMMS nt" // end MMX instructions
- : "=a" (dummy_value_a), // 0 // output regs (dummy)
- "=D" (dummy_value_D) // 1
- : "0" (bpp), // eax // input regs
- "1" (row) // edi
- : "%edx", "%esi" // clobber list
- );
- } // end of png_read_filter_row_mmx_sub()
- #endif
- //===========================================================================//
- // //
- // P N G _ R E A D _ F I L T E R _ R O W _ M M X _ U P //
- // //
- //===========================================================================//
- // Optimized code for PNG Up filter decoder
- static void /* PRIVATE */
- png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
- png_bytep prev_row)
- {
- png_uint_32 len;
- int dummy_value_d; // fix 'forbidden register 3 (dx) was spilled' error
- int dummy_value_S;
- int dummy_value_D;
- len = row_info->rowbytes; // number of bytes to filter
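- // Up filtering is Raw(x) = Up(x) + Prior(x), with no dependence on bytes
- // to the left, so the row can be processed in chunks of any width.  The
- // asm below runs four stages: a byte loop up to the first 8-byte boundary,
- // a 64-byte unrolled MMX loop, an 8-byte MMX loop for the tail, and a
- // final byte loop for whatever is left over.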
- __asm__ __volatile__ (
- //pre "movl row, %%edi nt"
- // get # of bytes to alignment
- #ifdef __PIC__
- "pushl %%ebx nt"
- #endif
- "movl %%edi, %%ecx nt"
- "xorl %%ebx, %%ebx nt"
- "addl $0x7, %%ecx nt"
- "xorl %%eax, %%eax nt"
- "andl $0xfffffff8, %%ecx nt"
- //pre "movl prev_row, %%esi nt"
- "subl %%edi, %%ecx nt"
- "jz up_go nt"
- "up_lp1: nt" // fix alignment
- "movb (%%edi,%%ebx,), %%al nt"
- "addb (%%esi,%%ebx,), %%al nt"
- "incl %%ebx nt"
- "cmpl %%ecx, %%ebx nt"
- "movb %%al, -1(%%edi,%%ebx,) nt" // mov does not affect flags; -1 to
- "jb up_lp1 nt" // offset incl ebx
- "up_go: nt"
- //pre "movl len, %%edx nt"
- "movl %%edx, %%ecx nt"
- "subl %%ebx, %%edx nt" // subtract alignment fix
- "andl $0x0000003f, %%edx nt" // calc bytes over mult of 64
- "subl %%edx, %%ecx nt" // drop over bytes from length
- // unrolled loop - use all MMX registers and interleave to reduce
- // number of branch instructions (loops) and reduce partial stalls
- "up_loop: nt"
- "movq (%%esi,%%ebx,), %%mm1 nt"
- "movq (%%edi,%%ebx,), %%mm0 nt"
- "movq 8(%%esi,%%ebx,), %%mm3 nt"
- "paddb %%mm1, %%mm0 nt"
- "movq 8(%%edi,%%ebx,), %%mm2 nt"
- "movq %%mm0, (%%edi,%%ebx,) nt"
- "paddb %%mm3, %%mm2 nt"
- "movq 16(%%esi,%%ebx,), %%mm5 nt"
- "movq %%mm2, 8(%%edi,%%ebx,) nt"
- "movq 16(%%edi,%%ebx,), %%mm4 nt"
- "movq 24(%%esi,%%ebx,), %%mm7 nt"
- "paddb %%mm5, %%mm4 nt"
- "movq 24(%%edi,%%ebx,), %%mm6 nt"
- "movq %%mm4, 16(%%edi,%%ebx,) nt"
- "paddb %%mm7, %%mm6 nt"
- "movq 32(%%esi,%%ebx,), %%mm1 nt"
- "movq %%mm6, 24(%%edi,%%ebx,) nt"
- "movq 32(%%edi,%%ebx,), %%mm0 nt"
- "movq 40(%%esi,%%ebx,), %%mm3 nt"
- "paddb %%mm1, %%mm0 nt"
- "movq 40(%%edi,%%ebx,), %%mm2 nt"
- "movq %%mm0, 32(%%edi,%%ebx,) nt"
- "paddb %%mm3, %%mm2 nt"
- "movq 48(%%esi,%%ebx,), %%mm5 nt"
- "movq %%mm2, 40(%%edi,%%ebx,) nt"
- "movq 48(%%edi,%%ebx,), %%mm4 nt"
- "movq 56(%%esi,%%ebx,), %%mm7 nt"
- "paddb %%mm5, %%mm4 nt"
- "movq 56(%%edi,%%ebx,), %%mm6 nt"
- "movq %%mm4, 48(%%edi,%%ebx,) nt"
- "addl $64, %%ebx nt"
- "paddb %%mm7, %%mm6 nt"
- "cmpl %%ecx, %%ebx nt"
- "movq %%mm6, -8(%%edi,%%ebx,) nt" // (+56)movq does not affect flags;
- "jb up_loop nt" // -8 to offset addl ebx
- "cmpl $0, %%edx nt" // test for bytes over mult of 64
- "jz up_end nt"
- "cmpl $8, %%edx nt" // test for less than 8 bytes
- "jb up_lt8 nt" // [added by lcreeve at netins.net]
- "addl %%edx, %%ecx nt"
- "andl $0x00000007, %%edx nt" // calc bytes over mult of 8
- "subl %%edx, %%ecx nt" // drop over bytes from length
- "jz up_lt8 nt"
- "up_lpA: nt" // use MMX regs to update 8 bytes sim.
- "movq (%%esi,%%ebx,), %%mm1 nt"
- "movq (%%edi,%%ebx,), %%mm0 nt"
- "addl $8, %%ebx nt"
- "paddb %%mm1, %%mm0 nt"
- "cmpl %%ecx, %%ebx nt"
- "movq %%mm0, -8(%%edi,%%ebx,) nt" // movq does not affect flags; -8 to
- "jb up_lpA nt" // offset add ebx
- "cmpl $0, %%edx nt" // test for bytes over mult of 8
- "jz up_end nt"
- "up_lt8: nt"
- "xorl %%eax, %%eax nt"
- "addl %%edx, %%ecx nt" // move over byte count into counter
- "up_lp2: nt" // use x86 regs for remaining bytes
- "movb (%%edi,%%ebx,), %%al nt"
- "addb (%%esi,%%ebx,), %%al nt"
- "incl %%ebx nt"
- "cmpl %%ecx, %%ebx nt"
- "movb %%al, -1(%%edi,%%ebx,) nt" // mov does not affect flags; -1 to
- "jb up_lp2 nt" // offset inc ebx
- "up_end: nt"
- "EMMS nt" // conversion of filtered row complete
- #ifdef __PIC__
- "popl %%ebx nt"
- #endif
- : "=d" (dummy_value_d), // 0 // output regs (dummy)
- "=S" (dummy_value_S), // 1
- "=D" (dummy_value_D) // 2
- : "0" (len), // edx // input regs
- "1" (prev_row), // esi
- "2" (row) // edi
- : "%eax", "%ecx" // clobber list (no input regs!)
- #ifndef __PIC__
- , "%ebx"
- #endif
- #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
- , "%mm0", "%mm1", "%mm2", "%mm3"
- , "%mm4", "%mm5", "%mm6", "%mm7"
- #endif
- );
- } // end of png_read_filter_row_mmx_up()
- #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
- /*===========================================================================*/
- /* */
- /* P N G _ R E A D _ F I L T E R _ R O W */
- /* */
- /*===========================================================================*/
- /* Optimized png_read_filter_row routines */
- void /* PRIVATE */
- png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
- row, png_bytep prev_row, int filter)
- {
- #ifdef PNG_DEBUG
- char filnm[10];
- #endif
- #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
- /* GRR: these are superseded by png_ptr->asm_flags: */
- #define UseMMX_sub 1 // GRR: converted 20000730
- #define UseMMX_up 1 // GRR: converted 20000729
- #define UseMMX_avg 1 // GRR: converted 20000828 (+ 16-bit bugfix 20000916)
- #define UseMMX_paeth 1 // GRR: converted 20000828
- if (_mmx_supported == 2) {
- /* this should have happened in png_init_mmx_flags() already */
- #if !defined(PNG_1_0_X)
- png_warning(png_ptr, "asm_flags may not have been initialized");
- #endif
- png_mmx_support();
- }
- #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
- #ifdef PNG_DEBUG
- png_debug(1, "in png_read_filter_row (pnggccrd.c)n");
- switch (filter)
- {
- case 0: sprintf(filnm, "none");
- break;
- case 1: sprintf(filnm, "sub-%s",
- #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
- #if !defined(PNG_1_0_X)
- (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" :
- #endif
- #endif
- "x86");
- break;
- case 2: sprintf(filnm, "up-%s",
- #ifdef PNG_ASSEMBLER_CODE_SUPPORTED
- #if !defined(PNG_1_0_X)
- (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" :
- #endif
- #endif
- "x86");
- break;
- case 3: sprintf(filnm, "avg-%s",
- #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
- #if !defined(PNG_1_0_X)
- (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" :
- #endif
- #endif
- "x86");
- break;
- case 4: sprintf(filnm, "Paeth-%s",
- #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
- #if !defined(PNG_1_0_X)
- (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX":
- #endif
- #endif
- "x86");
- break;
- default: sprintf(filnm, "unknw");
- break;
- }
- png_debug2(0, "row_number=%5ld, %5s, ", png_ptr->row_number, filnm);
- png_debug1(0, "row=0x%08lx, ", (unsigned long)row);
- png_debug2(0, "pixdepth=%2d, bytes=%d, ", (int)row_info->pixel_depth,
- (int)((row_info->pixel_depth + 7) >> 3));
- png_debug1(0,"rowbytes=%8ldn", row_info->rowbytes);
- #endif /* PNG_DEBUG */
- switch (filter)
- {
- case PNG_FILTER_VALUE_NONE:
- break;
- case PNG_FILTER_VALUE_SUB:
- #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
- #if !defined(PNG_1_0_X)
- if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
- (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
- (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
- #else
- if (_mmx_supported)
- #endif
- {
- png_read_filter_row_mmx_sub(row_info, row);
- }
- else
- #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
- {
- png_uint_32 i;
- png_uint_32 istop = row_info->rowbytes;
- png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
- png_bytep rp = row + bpp;
- png_bytep lp = row;
- for (i = bpp; i < istop; i++)
- {
- *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
- rp++;
- }
- } /* end !UseMMX_sub */
- break;
- case PNG_FILTER_VALUE_UP:
- #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
- #if !defined(PNG_1_0_X)
- if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
- (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
- (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
- #else
- if (_mmx_supported)
- #endif
- {
- png_read_filter_row_mmx_up(row_info, row, prev_row);
- }
- else
- #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
- {
- png_uint_32 i;
- png_uint_32 istop = row_info->rowbytes;
- png_bytep rp = row;
- png_bytep pp = prev_row;
- for (i = 0; i < istop; ++i)
- {
- *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
- rp++;
- }
- } /* end !UseMMX_up */
- break;
- case PNG_FILTER_VALUE_AVG:
- #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
- #if !defined(PNG_1_0_X)
- if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
- (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
- (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
- #else
- if (_mmx_supported)
- #endif
- {
- png_read_filter_row_mmx_avg(row_info, row, prev_row);
- }
- else
- #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
- {
- png_uint_32 i;
- png_bytep rp = row;
- png_bytep pp = prev_row;
- png_bytep lp = row;
- png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
- png_uint_32 istop = row_info->rowbytes - bpp;
- for (i = 0; i < bpp; i++)
- {
- *rp = (png_byte)(((int)(*rp) +
- ((int)(*pp++) >> 1)) & 0xff);
- rp++;
- }
- for (i = 0; i < istop; i++)
- {
- *rp = (png_byte)(((int)(*rp) +
- ((int)(*pp++ + *lp++) >> 1)) & 0xff);
- rp++;
- }
- } /* end !UseMMX_avg */
- break;
- case PNG_FILTER_VALUE_PAETH:
- #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
- #if !defined(PNG_1_0_X)
- if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
- (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
- (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
- #else
- if (_mmx_supported)
- #endif
- {
- png_read_filter_row_mmx_paeth(row_info, row, prev_row);
- }
- else
- #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
- {
- png_uint_32 i;
- png_bytep rp = row;
- png_bytep pp = prev_row;
- png_bytep lp = row;
- png_bytep cp = prev_row;
- png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
- png_uint_32 istop = row_info->rowbytes - bpp;
- for (i = 0; i < bpp; i++)
- {
- *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
- rp++;
- }
- for (i = 0; i < istop; i++) /* use leftover rp,pp */
- {
- int a, b, c, pa, pb, pc, p;
- a = *lp++;
- b = *pp++;
- c = *cp++;
- p = b - c;
- pc = a - c;
- #ifdef PNG_USE_ABS
- pa = abs(p);
- pb = abs(pc);
- pc = abs(p + pc);
- #else
- pa = p < 0 ? -p : p;
- pb = pc < 0 ? -pc : pc;
- pc = (p + pc) < 0 ? -(p + pc) : p + pc;
- #endif
- /*
- if (pa <= pb && pa <= pc)
- p = a;
- else if (pb <= pc)
- p = b;
- else
- p = c;
- */
- p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;
- *rp = (png_byte)(((int)(*rp) + p) & 0xff);
- rp++;
- }
- } /* end !UseMMX_paeth */
- break;
- default:
- png_warning(png_ptr, "Ignoring bad row-filter type");
- *row=0;
- break;
- }
- }
- #endif /* PNG_HAVE_ASSEMBLER_READ_FILTER_ROW */
- /*===========================================================================*/
- /* */
- /* P N G _ M M X _ S U P P O R T */
- /* */
- /*===========================================================================*/
- /* GRR NOTES: (1) the following code assumes 386 or better (pushfl/popfl)
- * (2) all instructions compile with gcc 2.7.2.3 and later
- * (3) the function is moved down here to prevent gcc from
- * inlining it in multiple places and then barfing be-
- * cause the ".NOT_SUPPORTED" label is multiply defined
- * [is there a way to signal that a *single* function should
- * not be inlined? is there a way to modify the label for
- * each inlined instance, e.g., by appending _1, _2, etc.?
- * maybe if don't use leading "." in label name? (nope...sigh)]
- */
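- /* The hand-rolled EFLAGS/CPUID probe below predates compiler support for
-  * CPU feature detection.  On GCC 4.8 or later the same test could be
-  * written portably (an alternative sketch, not used by this file):
-  *
-  *     int png_mmx_support(void)
-  *     {
-  *        return __builtin_cpu_supports("mmx")? 1 : 0;
-  *     }
-  */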
- int PNGAPI
- png_mmx_support(void)
- {
- #if defined(PNG_MMX_CODE_SUPPORTED)
- int result;
- __asm__ __volatile__ (
- "pushl %%ebx nt" // ebx gets clobbered by CPUID instruction
- "pushl %%ecx nt" // so does ecx...
- "pushl %%edx nt" // ...and edx (but ecx & edx safe on Linux)
- // ".byte 0x66 nt" // convert 16-bit pushf to 32-bit pushfd
- // "pushf nt" // 16-bit pushf
- "pushfl nt" // save Eflag to stack
- "popl %%eax nt" // get Eflag from stack into eax
- "movl %%eax, %%ecx nt" // make another copy of Eflag in ecx
- "xorl $0x200000, %%eax nt" // toggle ID bit in Eflag (i.e., bit 21)
- "pushl %%eax nt" // save modified Eflag back to stack
- // ".byte 0x66 nt" // convert 16-bit popf to 32-bit popfd
- // "popf nt" // 16-bit popf
- "popfl nt" // restore modified value to Eflag reg
- "pushfl nt" // save Eflag to stack
- "popl %%eax nt" // get Eflag from stack
- "pushl %%ecx nt" // save original Eflag to stack
- "popfl nt" // restore original Eflag
- "xorl %%ecx, %%eax nt" // compare new Eflag with original Eflag
- "jz 0f nt" // if same, CPUID instr. is not supported
- "xorl %%eax, %%eax nt" // set eax to zero
- // ".byte 0x0f, 0xa2 nt" // CPUID instruction (two-byte opcode)
- "cpuid nt" // get the CPU identification info
- "cmpl $1, %%eax nt" // make sure eax return non-zero value
- "jl 0f nt" // if eax is zero, MMX is not supported
- "xorl %%eax, %%eax nt" // set eax to zero and...
- "incl %%eax nt" // ...increment eax to 1. This pair is
- // faster than the instruction "mov eax, 1"
- "cpuid nt" // get the CPU identification info again
- "andl $0x800000, %%edx nt" // mask out all bits but MMX bit (23)
- "cmpl $0, %%edx nt" // 0 = MMX not supported
- "jz 0f nt" // non-zero = yes, MMX IS supported
- "movl $1, %%eax nt" // set return value to 1
- "jmp 1f nt" // DONE: have MMX support
- "0: nt" // .NOT_SUPPORTED: target label for jump instructions
- "movl $0, %%eax nt" // set return value to 0
- "1: nt" // .RETURN: target label for jump instructions
- "popl %%edx nt" // restore edx
- "popl %%ecx nt" // restore ecx
- "popl %%ebx nt" // restore ebx
- // "ret nt" // DONE: no MMX support
- // (fall through to standard C "ret")
- : "=a" (result) // output list
- : // any variables used on input (none)
- // no clobber list
- // , "%ebx", "%ecx", "%edx" // GRR: we handle these manually
- // , "memory" // if write to a variable gcc thought was in a reg
- // , "cc" // "condition codes" (flag bits)
- );
- _mmx_supported = result;
- #else
- _mmx_supported = 0;
- #endif /* PNG_MMX_CODE_SUPPORTED */
- return _mmx_supported;
- }
- #endif /* PNG_USE_PNGGCCRD */