pngvcrd.c
上传用户:szled88
上传日期:2015-04-09
资源大小:43957k
文件大小:144k
源码类别:

对话框与窗口

开发平台:

Visual C++

  1. /* pngvcrd.c - mixed C/assembler version of utilities to read a PNG file
  2.  *
  3.  * For Intel x86 CPU and Microsoft Visual C++ compiler
  4.  *
  5.  * Last changed in libpng 1.2.6 - August 15, 2004
  6.  * For conditions of distribution and use, see copyright notice in png.h
  7.  * Copyright (c) 1998-2004 Glenn Randers-Pehrson
  8.  * Copyright (c) 1998, Intel Corporation
  9.  *
  10.  * Contributed by Nirav Chhatrapati, Intel Corporation, 1998
  11.  * Interface to libpng contributed by Gilles Vollant, 1999
  12.  *
  13.  *
  14.  * In png_do_read_interlace() in libpng versions 1.0.3a through 1.0.4d,
  15.  * a sign error in the post-MMX cleanup code for each pixel_depth resulted
  16.  * in bad pixels at the beginning of some rows of some images, and also
  17.  * (due to out-of-range memory reads and writes) caused heap corruption
  18.  * when compiled with MSVC 6.0.  The error was fixed in version 1.0.4e.
  19.  *
  20.  * [png_read_filter_row_mmx_avg() bpp == 2 bugfix, GRR 20000916]
  21.  *
  22.  * [runtime MMX configuration, GRR 20010102]
  23.  *
  24.  */
  25. #define PNG_INTERNAL
  26. #include "png.h"
  27. #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGVCRD)
  /* Cached result of the run-time MMX probe.  The initial value 2 acts as a
   * "not yet determined" marker: png_combine_row() calls png_mmx_support()
   * while it still sees 2, and png_mmx_support() overwrites it with 0
   * (MMX not available) or 1 (MMX available).
   */
  28. static int mmx_supported=2;
  /* png_mmx_support - run-time probe for MMX on an x86 CPU (MSVC inline asm).
   *
   * Steps performed by the assembler block:
   *   1. Try to flip the ID flag (bit 21) in EFLAGS.  If the bit cannot be
   *      changed, the CPUID instruction is not available and the probe
   *      reports "no MMX".
   *   2. Execute CPUID with eax = 0 and require eax >= 1 on return, so that
   *      CPUID function 1 may be queried.
   *   3. Execute CPUID with eax = 1 and test the MMX feature flag in edx
   *      (mask 0x00800000).
   *
   * Side effect: the result is stored in the file-scope variable
   * mmx_supported, replacing its initial value of 2.
   *
   * Returns 1 if MMX is supported, 0 otherwise.
   */
  29. int PNGAPI
  30. png_mmx_support(void)
  31. {
  32.   int mmx_supported_local = 0;   /* pessimistic default: no MMX */
  33.   _asm {
  34.     push ebx          //CPUID overwrites ebx/ecx/edx, so save them first
  35.     push ecx
  36.     push edx
  37.     pushfd            //Push EFLAGS onto the stack
  38.     pop eax           //eax = current EFLAGS
  39.     mov ecx, eax      //ecx = untouched copy of EFLAGS for later compare
  40.     xor eax, 0x200000 //Toggle the ID flag, i.e. EFLAGS bit 21
  41.     push eax          //Push the modified EFLAGS value
  42.     popfd             //Try to load the modified value into EFLAGS
  43.     pushfd            //Read EFLAGS back
  44.     pop eax           //eax = EFLAGS as actually accepted by the CPU
  45.     push ecx          //Push the original EFLAGS value
  46.     popfd             //Restore the original EFLAGS
  47.     xor eax, ecx      //Non-zero only if the ID flag really changed
  48.     jz NOT_SUPPORTED  //Unchanged: the CPUID instruction is not available,
  49.                       //so skip the CPUID queries and go straight to
  50.                       //the NOT_SUPPORTED label
  51.     xor eax, eax      //eax = 0: select CPUID function 0
  52.     _asm _emit 0x0f   //CPUID instruction, emitted as raw bytes (0x0f 0xa2)
  53.     _asm _emit 0xa2
  54.     cmp eax, 1        //eax must come back as at least 1
  55.     jl NOT_SUPPORTED  //eax < 1: function 1 cannot be queried, so no MMX
  56.     xor eax, eax      //eax = 0 ...
  57.     inc eax           //... then eax = 1: select CPUID function 1.  The
  58.                       //original author notes this is faster than "mov eax, 1"
  59.     _asm _emit 0x0f   //CPUID instruction (raw bytes again)
  60.     _asm _emit 0xa2
  61.     and edx, 0x00800000  //keep only the MMX feature flag (bit 23, zero-based)
  62.     cmp edx, 0        // 0 = MMX flag clear, MMX not supported
  63.     jz  NOT_SUPPORTED // non-zero = MMX IS supported, fall through
  64.     mov  mmx_supported_local, 1  //record success
  65. NOT_SUPPORTED:
  66.     mov  eax, mmx_supported_local  //load the result into eax (the C return
                                       //statement below returns the same value)
  67.     pop edx          //restore the registers saved on entry
  68.     pop ecx
  69.     pop ebx
  70.   }
  71.   //mmx_supported_local=0; // test code: force MMX to be reported as unsupported
  72.   //printf("MMX : %u (1=MMX supported)\n",mmx_supported_local);
  73.   mmx_supported = mmx_supported_local;   /* cache result for png_combine_row() */
  74.   return mmx_supported_local;
  75. }
  76. /* Combines the row recently read in with the previous row.
  77.    This routine takes care of alpha and transparency if requested.
  78.    This routine also handles the two methods of progressive display
  79.    of interlaced images, depending on the mask value.
  80.    The mask value describes which pixels are to be combined with
  81.    the row.  The pattern always repeats every 8 pixels, so just 8
  82.    bits are needed.  A one indicates the pixel is to be combined; a
  83.    zero indicates the pixel is to be skipped.  This is in addition
  84.    to any alpha or transparency value associated with the pixel.  If
  85.    you want all pixels to be combined, pass 0xff (255) in mask.  */
  86. /* Use this routine for x86 platform - uses faster MMX routine if machine
  87.    supports MMX */
  88. void /* PRIVATE */
  89. png_combine_row(png_structp png_ptr, png_bytep row, int mask)
  90. {
  91. #ifdef PNG_USE_LOCAL_ARRAYS
  92.    const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
  93. #endif
  94.    png_debug(1,"in png_combine_row_asmn");
  95.    if (mmx_supported == 2) {
  96. #if !defined(PNG_1_0_X)
  97.        /* this should have happened in png_init_mmx_flags() already */
  98.        png_warning(png_ptr, "asm_flags may not have been initialized");
  99. #endif
  100.        png_mmx_support();
  101.    }
  102.    if (mask == 0xff)
  103.    {
  104.       png_memcpy(row, png_ptr->row_buf + 1,
  105.        (png_size_t)PNG_ROWBYTES(png_ptr->row_info.pixel_depth,
  106.        png_ptr->width));
  107.    }
  108.    /* GRR:  add "else if (mask == 0)" case?
  109.     *       or does png_combine_row() not even get called in that case? */
  110.    else
  111.    {
  112.       switch (png_ptr->row_info.pixel_depth)
  113.       {
  114.          case 1:
  115.          {
  116.             png_bytep sp;
  117.             png_bytep dp;
  118.             int s_inc, s_start, s_end;
  119.             int m;
  120.             int shift;
  121.             png_uint_32 i;
  122.             sp = png_ptr->row_buf + 1;
  123.             dp = row;
  124.             m = 0x80;
  125. #if defined(PNG_READ_PACKSWAP_SUPPORTED)
  126.             if (png_ptr->transformations & PNG_PACKSWAP)
  127.             {
  128.                 s_start = 0;
  129.                 s_end = 7;
  130.                 s_inc = 1;
  131.             }
  132.             else
  133. #endif
  134.             {
  135.                 s_start = 7;
  136.                 s_end = 0;
  137.                 s_inc = -1;
  138.             }
  139.             shift = s_start;
  140.             for (i = 0; i < png_ptr->width; i++)
  141.             {
  142.                if (m & mask)
  143.                {
  144.                   int value;
  145.                   value = (*sp >> shift) & 0x1;
  146.                   *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
  147.                   *dp |= (png_byte)(value << shift);
  148.                }
  149.                if (shift == s_end)
  150.                {
  151.                   shift = s_start;
  152.                   sp++;
  153.                   dp++;
  154.                }
  155.                else
  156.                   shift += s_inc;
  157.                if (m == 1)
  158.                   m = 0x80;
  159.                else
  160.                   m >>= 1;
  161.             }
  162.             break;
  163.          }
  164.          case 2:
  165.          {
  166.             png_bytep sp;
  167.             png_bytep dp;
  168.             int s_start, s_end, s_inc;
  169.             int m;
  170.             int shift;
  171.             png_uint_32 i;
  172.             int value;
  173.             sp = png_ptr->row_buf + 1;
  174.             dp = row;
  175.             m = 0x80;
  176. #if defined(PNG_READ_PACKSWAP_SUPPORTED)
  177.             if (png_ptr->transformations & PNG_PACKSWAP)
  178.             {
  179.                s_start = 0;
  180.                s_end = 6;
  181.                s_inc = 2;
  182.             }
  183.             else
  184. #endif
  185.             {
  186.                s_start = 6;
  187.                s_end = 0;
  188.                s_inc = -2;
  189.             }
  190.             shift = s_start;
  191.             for (i = 0; i < png_ptr->width; i++)
  192.             {
  193.                if (m & mask)
  194.                {
  195.                   value = (*sp >> shift) & 0x3;
  196.                   *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
  197.                   *dp |= (png_byte)(value << shift);
  198.                }
  199.                if (shift == s_end)
  200.                {
  201.                   shift = s_start;
  202.                   sp++;
  203.                   dp++;
  204.                }
  205.                else
  206.                   shift += s_inc;
  207.                if (m == 1)
  208.                   m = 0x80;
  209.                else
  210.                   m >>= 1;
  211.             }
  212.             break;
  213.          }
  214.          case 4:
  215.          {
  216.             png_bytep sp;
  217.             png_bytep dp;
  218.             int s_start, s_end, s_inc;
  219.             int m;
  220.             int shift;
  221.             png_uint_32 i;
  222.             int value;
  223.             sp = png_ptr->row_buf + 1;
  224.             dp = row;
  225.             m = 0x80;
  226. #if defined(PNG_READ_PACKSWAP_SUPPORTED)
  227.             if (png_ptr->transformations & PNG_PACKSWAP)
  228.             {
  229.                s_start = 0;
  230.                s_end = 4;
  231.                s_inc = 4;
  232.             }
  233.             else
  234. #endif
  235.             {
  236.                s_start = 4;
  237.                s_end = 0;
  238.                s_inc = -4;
  239.             }
  240.             shift = s_start;
  241.             for (i = 0; i < png_ptr->width; i++)
  242.             {
  243.                if (m & mask)
  244.                {
  245.                   value = (*sp >> shift) & 0xf;
  246.                   *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
  247.                   *dp |= (png_byte)(value << shift);
  248.                }
  249.                if (shift == s_end)
  250.                {
  251.                   shift = s_start;
  252.                   sp++;
  253.                   dp++;
  254.                }
  255.                else
  256.                   shift += s_inc;
  257.                if (m == 1)
  258.                   m = 0x80;
  259.                else
  260.                   m >>= 1;
  261.             }
  262.             break;
  263.          }
  264.          case 8:
  265.          {
  266.             png_bytep srcptr;
  267.             png_bytep dstptr;
  268.             png_uint_32 len;
  269.             int m;
  270.             int diff, unmask;
  271.             __int64 mask0=0x0102040810204080;
  272. #if !defined(PNG_1_0_X)
  273.             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
  274.                 /* && mmx_supported */ )
  275. #else
  276.             if (mmx_supported)
  277. #endif
  278.             {
  279.                srcptr = png_ptr->row_buf + 1;
  280.                dstptr = row;
  281.                m = 0x80;
  282.                unmask = ~mask;
  283.                len  = png_ptr->width &~7;  //reduce to multiple of 8
  284.                diff = png_ptr->width & 7;  //amount lost
  285.                _asm
  286.                {
  287.                   movd       mm7, unmask   //load bit pattern
  288.                   psubb      mm6,mm6       //zero mm6
  289.                   punpcklbw  mm7,mm7
  290.                   punpcklwd  mm7,mm7
  291.                   punpckldq  mm7,mm7       //fill register with 8 masks
  292.                   movq       mm0,mask0
  293.                   pand       mm0,mm7       //nonzero if keep byte
  294.                   pcmpeqb    mm0,mm6       //zeros->1s, v versa
  295.                   mov        ecx,len       //load length of line (pixels)
  296.                   mov        esi,srcptr    //load source
  297.                   mov        ebx,dstptr    //load dest
  298.                   cmp        ecx,0         //lcr
  299.                   je         mainloop8end
  300. mainloop8:
  301.                   movq       mm4,[esi]
  302.                   pand       mm4,mm0
  303.                   movq       mm6,mm0
  304.                   pandn      mm6,[ebx]
  305.                   por        mm4,mm6
  306.                   movq       [ebx],mm4
  307.                   add        esi,8         //inc by 8 bytes processed
  308.                   add        ebx,8
  309.                   sub        ecx,8         //dec by 8 pixels processed
  310.                   ja         mainloop8
  311. mainloop8end:
  312.                   mov        ecx,diff
  313.                   cmp        ecx,0
  314.                   jz         end8
  315.                   mov        edx,mask
  316.                   sal        edx,24        //make low byte the high byte
  317. secondloop8:
  318.                   sal        edx,1         //move high bit to CF
  319.                   jnc        skip8         //if CF = 0
  320.                   mov        al,[esi]
  321.                   mov        [ebx],al
  322. skip8:
  323.                   inc        esi
  324.                   inc        ebx
  325.                   dec        ecx
  326.                   jnz        secondloop8
  327. end8:
  328.                   emms
  329.                }
  330.             }
  331.             else /* mmx not supported - use modified C routine */
  332.             {
  333.                register unsigned int incr1, initial_val, final_val;
  334.                png_size_t pixel_bytes;
  335.                png_uint_32 i;
  336.                register int disp = png_pass_inc[png_ptr->pass];
  337.                int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
  338.                pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
  339.                srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
  340.                   pixel_bytes;
  341.                dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
  342.                initial_val = offset_table[png_ptr->pass]*pixel_bytes;
  343.                final_val = png_ptr->width*pixel_bytes;
  344.                incr1 = (disp)*pixel_bytes;
  345.                for (i = initial_val; i < final_val; i += incr1)
  346.                {
  347.                   png_memcpy(dstptr, srcptr, pixel_bytes);
  348.                   srcptr += incr1;
  349.                   dstptr += incr1;
  350.                }
  351.             } /* end of else */
  352.             break;
  353.          }       // end 8 bpp
  354.          case 16:
  355.          {
  356.             png_bytep srcptr;
  357.             png_bytep dstptr;
  358.             png_uint_32 len;
  359.             int unmask, diff;
  360.             __int64 mask1=0x0101020204040808,
  361.                     mask0=0x1010202040408080;
  362. #if !defined(PNG_1_0_X)
  363.             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
  364.                 /* && mmx_supported */ )
  365. #else
  366.             if (mmx_supported)
  367. #endif
  368.             {
  369.                srcptr = png_ptr->row_buf + 1;
  370.                dstptr = row;
  371.                unmask = ~mask;
  372.                len     = (png_ptr->width)&~7;
  373.                diff = (png_ptr->width)&7;
  374.                _asm
  375.                {
  376.                   movd       mm7, unmask       //load bit pattern
  377.                   psubb      mm6,mm6           //zero mm6
  378.                   punpcklbw  mm7,mm7
  379.                   punpcklwd  mm7,mm7
  380.                   punpckldq  mm7,mm7           //fill register with 8 masks
  381.                   movq       mm0,mask0
  382.                   movq       mm1,mask1
  383.                   pand       mm0,mm7
  384.                   pand       mm1,mm7
  385.                   pcmpeqb    mm0,mm6
  386.                   pcmpeqb    mm1,mm6
  387.                   mov        ecx,len           //load length of line
  388.                   mov        esi,srcptr        //load source
  389.                   mov        ebx,dstptr        //load dest
  390.                   cmp        ecx,0             //lcr
  391.                   jz         mainloop16end
  392. mainloop16:
  393.                   movq       mm4,[esi]
  394.                   pand       mm4,mm0
  395.                   movq       mm6,mm0
  396.                   movq       mm7,[ebx]
  397.                   pandn      mm6,mm7
  398.                   por        mm4,mm6
  399.                   movq       [ebx],mm4
  400.                   movq       mm5,[esi+8]
  401.                   pand       mm5,mm1
  402.                   movq       mm7,mm1
  403.                   movq       mm6,[ebx+8]
  404.                   pandn      mm7,mm6
  405.                   por        mm5,mm7
  406.                   movq       [ebx+8],mm5
  407.                   add        esi,16            //inc by 16 bytes processed
  408.                   add        ebx,16
  409.                   sub        ecx,8             //dec by 8 pixels processed
  410.                   ja         mainloop16
  411. mainloop16end:
  412.                   mov        ecx,diff
  413.                   cmp        ecx,0
  414.                   jz         end16
  415.                   mov        edx,mask
  416.                   sal        edx,24            //make low byte the high byte
  417. secondloop16:
  418.                   sal        edx,1             //move high bit to CF
  419.                   jnc        skip16            //if CF = 0
  420.                   mov        ax,[esi]
  421.                   mov        [ebx],ax
  422. skip16:
  423.                   add        esi,2
  424.                   add        ebx,2
  425.                   dec        ecx
  426.                   jnz        secondloop16
  427. end16:
  428.                   emms
  429.                }
  430.             }
  431.             else /* mmx not supported - use modified C routine */
  432.             {
  433.                register unsigned int incr1, initial_val, final_val;
  434.                png_size_t pixel_bytes;
  435.                png_uint_32 i;
  436.                register int disp = png_pass_inc[png_ptr->pass];
  437.                int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
  438.                pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
  439.                srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
  440.                   pixel_bytes;
  441.                dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
  442.                initial_val = offset_table[png_ptr->pass]*pixel_bytes;
  443.                final_val = png_ptr->width*pixel_bytes;
  444.                incr1 = (disp)*pixel_bytes;
  445.                for (i = initial_val; i < final_val; i += incr1)
  446.                {
  447.                   png_memcpy(dstptr, srcptr, pixel_bytes);
  448.                   srcptr += incr1;
  449.                   dstptr += incr1;
  450.                }
  451.             } /* end of else */
  452.             break;
  453.          }       // end 16 bpp
  454.          case 24:
  455.          {
  456.             png_bytep srcptr;
  457.             png_bytep dstptr;
  458.             png_uint_32 len;
  459.             int unmask, diff;
  460.             __int64 mask2=0x0101010202020404,  //24bpp
  461.                     mask1=0x0408080810101020,
  462.                     mask0=0x2020404040808080;
  463.             srcptr = png_ptr->row_buf + 1;
  464.             dstptr = row;
  465.             unmask = ~mask;
  466.             len     = (png_ptr->width)&~7;
  467.             diff = (png_ptr->width)&7;
  468. #if !defined(PNG_1_0_X)
  469.             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
  470.                 /* && mmx_supported */ )
  471. #else
  472.             if (mmx_supported)
  473. #endif
  474.             {
  475.                _asm
  476.                {
  477.                   movd       mm7, unmask       //load bit pattern
  478.                   psubb      mm6,mm6           //zero mm6
  479.                   punpcklbw  mm7,mm7
  480.                   punpcklwd  mm7,mm7
  481.                   punpckldq  mm7,mm7           //fill register with 8 masks
  482.                   movq       mm0,mask0
  483.                   movq       mm1,mask1
  484.                   movq       mm2,mask2
  485.                   pand       mm0,mm7
  486.                   pand       mm1,mm7
  487.                   pand       mm2,mm7
  488.                   pcmpeqb    mm0,mm6
  489.                   pcmpeqb    mm1,mm6
  490.                   pcmpeqb    mm2,mm6
  491.                   mov        ecx,len           //load length of line
  492.                   mov        esi,srcptr        //load source
  493.                   mov        ebx,dstptr        //load dest
  494.                   cmp        ecx,0
  495.                   jz         mainloop24end
  496. mainloop24:
  497.                   movq       mm4,[esi]
  498.                   pand       mm4,mm0
  499.                   movq       mm6,mm0
  500.                   movq       mm7,[ebx]
  501.                   pandn      mm6,mm7
  502.                   por        mm4,mm6
  503.                   movq       [ebx],mm4
  504.                   movq       mm5,[esi+8]
  505.                   pand       mm5,mm1
  506.                   movq       mm7,mm1
  507.                   movq       mm6,[ebx+8]
  508.                   pandn      mm7,mm6
  509.                   por        mm5,mm7
  510.                   movq       [ebx+8],mm5
  511.                   movq       mm6,[esi+16]
  512.                   pand       mm6,mm2
  513.                   movq       mm4,mm2
  514.                   movq       mm7,[ebx+16]
  515.                   pandn      mm4,mm7
  516.                   por        mm6,mm4
  517.                   movq       [ebx+16],mm6
  518.                   add        esi,24            //inc by 24 bytes processed
  519.                   add        ebx,24
  520.                   sub        ecx,8             //dec by 8 pixels processed
  521.                   ja         mainloop24
  522. mainloop24end:
  523.                   mov        ecx,diff
  524.                   cmp        ecx,0
  525.                   jz         end24
  526.                   mov        edx,mask
  527.                   sal        edx,24            //make low byte the high byte
  528. secondloop24:
  529.                   sal        edx,1             //move high bit to CF
  530.                   jnc        skip24            //if CF = 0
  531.                   mov        ax,[esi]
  532.                   mov        [ebx],ax
  533.                   xor        eax,eax
  534.                   mov        al,[esi+2]
  535.                   mov        [ebx+2],al
  536. skip24:
  537.                   add        esi,3
  538.                   add        ebx,3
  539.                   dec        ecx
  540.                   jnz        secondloop24
  541. end24:
  542.                   emms
  543.                }
  544.             }
  545.             else /* mmx not supported - use modified C routine */
  546.             {
  547.                register unsigned int incr1, initial_val, final_val;
  548.                png_size_t pixel_bytes;
  549.                png_uint_32 i;
  550.                register int disp = png_pass_inc[png_ptr->pass];
  551.                int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
  552.                pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
  553.                srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
  554.                   pixel_bytes;
  555.                dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
  556.                initial_val = offset_table[png_ptr->pass]*pixel_bytes;
  557.                final_val = png_ptr->width*pixel_bytes;
  558.                incr1 = (disp)*pixel_bytes;
  559.                for (i = initial_val; i < final_val; i += incr1)
  560.                {
  561.                   png_memcpy(dstptr, srcptr, pixel_bytes);
  562.                   srcptr += incr1;
  563.                   dstptr += incr1;
  564.                }
  565.             } /* end of else */
  566.             break;
  567.          }       // end 24 bpp
  568.          case 32:
  569.          {
  570.             png_bytep srcptr;
  571.             png_bytep dstptr;
  572.             png_uint_32 len;
  573.             int unmask, diff;
  574.             __int64 mask3=0x0101010102020202,  //32bpp
  575.                     mask2=0x0404040408080808,
  576.                     mask1=0x1010101020202020,
  577.                     mask0=0x4040404080808080;
  578.             srcptr = png_ptr->row_buf + 1;
  579.             dstptr = row;
  580.             unmask = ~mask;
  581.             len     = (png_ptr->width)&~7;
  582.             diff = (png_ptr->width)&7;
  583. #if !defined(PNG_1_0_X)
  584.             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
  585.                 /* && mmx_supported */ )
  586. #else
  587.             if (mmx_supported)
  588. #endif
  589.             {
  590.                _asm
  591.                {
  592.                   movd       mm7, unmask       //load bit pattern
  593.                   psubb      mm6,mm6           //zero mm6
  594.                   punpcklbw  mm7,mm7
  595.                   punpcklwd  mm7,mm7
  596.                   punpckldq  mm7,mm7           //fill register with 8 masks
  597.                   movq       mm0,mask0
  598.                   movq       mm1,mask1
  599.                   movq       mm2,mask2
  600.                   movq       mm3,mask3
  601.                   pand       mm0,mm7
  602.                   pand       mm1,mm7
  603.                   pand       mm2,mm7
  604.                   pand       mm3,mm7
  605.                   pcmpeqb    mm0,mm6
  606.                   pcmpeqb    mm1,mm6
  607.                   pcmpeqb    mm2,mm6
  608.                   pcmpeqb    mm3,mm6
  609.                   mov        ecx,len           //load length of line
  610.                   mov        esi,srcptr        //load source
  611.                   mov        ebx,dstptr        //load dest
  612.                   cmp        ecx,0             //lcr
  613.                   jz         mainloop32end
  614. mainloop32:
  615.                   movq       mm4,[esi]
  616.                   pand       mm4,mm0
  617.                   movq       mm6,mm0
  618.                   movq       mm7,[ebx]
  619.                   pandn      mm6,mm7
  620.                   por        mm4,mm6
  621.                   movq       [ebx],mm4
  622.                   movq       mm5,[esi+8]
  623.                   pand       mm5,mm1
  624.                   movq       mm7,mm1
  625.                   movq       mm6,[ebx+8]
  626.                   pandn      mm7,mm6
  627.                   por        mm5,mm7
  628.                   movq       [ebx+8],mm5
  629.                   movq       mm6,[esi+16]
  630.                   pand       mm6,mm2
  631.                   movq       mm4,mm2
  632.                   movq       mm7,[ebx+16]
  633.                   pandn      mm4,mm7
  634.                   por        mm6,mm4
  635.                   movq       [ebx+16],mm6
  636.                   movq       mm7,[esi+24]
  637.                   pand       mm7,mm3
  638.                   movq       mm5,mm3
  639.                   movq       mm4,[ebx+24]
  640.                   pandn      mm5,mm4
  641.                   por        mm7,mm5
  642.                   movq       [ebx+24],mm7
  643.                   add        esi,32            //inc by 32 bytes processed
  644.                   add        ebx,32
  645.                   sub        ecx,8             //dec by 8 pixels processed
  646.                   ja         mainloop32
  647. mainloop32end:
  648.                   mov        ecx,diff
  649.                   cmp        ecx,0
  650.                   jz         end32
  651.                   mov        edx,mask
  652.                   sal        edx,24            //make low byte the high byte
  653. secondloop32:
  654.                   sal        edx,1             //move high bit to CF
  655.                   jnc        skip32            //if CF = 0
  656.                   mov        eax,[esi]
  657.                   mov        [ebx],eax
  658. skip32:
  659.                   add        esi,4
  660.                   add        ebx,4
  661.                   dec        ecx
  662.                   jnz        secondloop32
  663. end32:
  664.                   emms
  665.                }
  666.             }
  667.             else /* mmx _not supported - Use modified C routine */
  668.             {
  669.                register unsigned int incr1, initial_val, final_val;
  670.                png_size_t pixel_bytes;
  671.                png_uint_32 i;
  672.                register int disp = png_pass_inc[png_ptr->pass];
  673.                int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
  674.                pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
  675.                srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
  676.                   pixel_bytes;
  677.                dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
  678.                initial_val = offset_table[png_ptr->pass]*pixel_bytes;
  679.                final_val = png_ptr->width*pixel_bytes;
  680.                incr1 = (disp)*pixel_bytes;
  681.                for (i = initial_val; i < final_val; i += incr1)
  682.                {
  683.                   png_memcpy(dstptr, srcptr, pixel_bytes);
  684.                   srcptr += incr1;
  685.                   dstptr += incr1;
  686.                }
  687.             } /* end of else */
  688.             break;
  689.          }       // end 32 bpp
  690.          case 48:
  691.          {
  692.             png_bytep srcptr;
  693.             png_bytep dstptr;
  694.             png_uint_32 len;
  695.             int unmask, diff;
  696.             __int64 mask5=0x0101010101010202,
  697.                     mask4=0x0202020204040404,
  698.                     mask3=0x0404080808080808,
  699.                     mask2=0x1010101010102020,
  700.                     mask1=0x2020202040404040,
  701.                     mask0=0x4040808080808080;
  702. #if !defined(PNG_1_0_X)
  703.             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
  704.                 /* && mmx_supported */ )
  705. #else
  706.             if (mmx_supported)
  707. #endif
  708.             {
  709.                srcptr = png_ptr->row_buf + 1;
  710.                dstptr = row;
  711.                unmask = ~mask;
  712.                len     = (png_ptr->width)&~7;
  713.                diff = (png_ptr->width)&7;
  714.                _asm
  715.                {
  716.                   movd       mm7, unmask       //load bit pattern
  717.                   psubb      mm6,mm6           //zero mm6
  718.                   punpcklbw  mm7,mm7
  719.                   punpcklwd  mm7,mm7
  720.                   punpckldq  mm7,mm7           //fill register with 8 masks
  721.                   movq       mm0,mask0
  722.                   movq       mm1,mask1
  723.                   movq       mm2,mask2
  724.                   movq       mm3,mask3
  725.                   movq       mm4,mask4
  726.                   movq       mm5,mask5
  727.                   pand       mm0,mm7
  728.                   pand       mm1,mm7
  729.                   pand       mm2,mm7
  730.                   pand       mm3,mm7
  731.                   pand       mm4,mm7
  732.                   pand       mm5,mm7
  733.                   pcmpeqb    mm0,mm6
  734.                   pcmpeqb    mm1,mm6
  735.                   pcmpeqb    mm2,mm6
  736.                   pcmpeqb    mm3,mm6
  737.                   pcmpeqb    mm4,mm6
  738.                   pcmpeqb    mm5,mm6
  739.                   mov        ecx,len           //load length of line
  740.                   mov        esi,srcptr        //load source
  741.                   mov        ebx,dstptr        //load dest
  742.                   cmp        ecx,0
  743.                   jz         mainloop48end
  744. mainloop48:
  745.                   movq       mm7,[esi]
  746.                   pand       mm7,mm0
  747.                   movq       mm6,mm0
  748.                   pandn      mm6,[ebx]
  749.                   por        mm7,mm6
  750.                   movq       [ebx],mm7
  751.                   movq       mm6,[esi+8]
  752.                   pand       mm6,mm1
  753.                   movq       mm7,mm1
  754.                   pandn      mm7,[ebx+8]
  755.                   por        mm6,mm7
  756.                   movq       [ebx+8],mm6
  757.                   movq       mm6,[esi+16]
  758.                   pand       mm6,mm2
  759.                   movq       mm7,mm2
  760.                   pandn      mm7,[ebx+16]
  761.                   por        mm6,mm7
  762.                   movq       [ebx+16],mm6
  763.                   movq       mm7,[esi+24]
  764.                   pand       mm7,mm3
  765.                   movq       mm6,mm3
  766.                   pandn      mm6,[ebx+24]
  767.                   por        mm7,mm6
  768.                   movq       [ebx+24],mm7
  769.                   movq       mm6,[esi+32]
  770.                   pand       mm6,mm4
  771.                   movq       mm7,mm4
  772.                   pandn      mm7,[ebx+32]
  773.                   por        mm6,mm7
  774.                   movq       [ebx+32],mm6
  775.                   movq       mm7,[esi+40]
  776.                   pand       mm7,mm5
  777.                   movq       mm6,mm5
  778.                   pandn      mm6,[ebx+40]
  779.                   por        mm7,mm6
  780.                   movq       [ebx+40],mm7
  781.                   add        esi,48            //inc by 48 bytes processed
  782.                   add        ebx,48
  783.                   sub        ecx,8             //dec by 8 pixels processed
  784.                   ja         mainloop48
  785. mainloop48end:
  786.                   mov        ecx,diff
  787.                   cmp        ecx,0
  788.                   jz         end48
  789.                   mov        edx,mask
  790.                   sal        edx,24            //make low byte the high byte
  791. secondloop48:
  792.                   sal        edx,1             //move high bit to CF
  793.                   jnc        skip48            //if CF = 0
  794.                   mov        eax,[esi]
  795.                   mov        [ebx],eax
  796. skip48:
  797.                   add        esi,4
  798.                   add        ebx,4
  799.                   dec        ecx
  800.                   jnz        secondloop48
  801. end48:
  802.                   emms
  803.                }
  804.             }
  805.             else /* mmx _not supported - Use modified C routine */
  806.             {
  807.                register unsigned int incr1, initial_val, final_val;
  808.                png_size_t pixel_bytes;
  809.                png_uint_32 i;
  810.                register int disp = png_pass_inc[png_ptr->pass];
  811.                int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
  812.                pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
  813.                srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
  814.                   pixel_bytes;
  815.                dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
  816.                initial_val = offset_table[png_ptr->pass]*pixel_bytes;
  817.                final_val = png_ptr->width*pixel_bytes;
  818.                incr1 = (disp)*pixel_bytes;
  819.                for (i = initial_val; i < final_val; i += incr1)
  820.                {
  821.                   png_memcpy(dstptr, srcptr, pixel_bytes);
  822.                   srcptr += incr1;
  823.                   dstptr += incr1;
  824.                }
  825.             } /* end of else */
  826.             break;
  827.          }       // end 48 bpp
  828.          default:
  829.          {
  830.             png_bytep sptr;
  831.             png_bytep dp;
  832.             png_size_t pixel_bytes;
  833.             int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
  834.             unsigned int i;
  835.             register int disp = png_pass_inc[png_ptr->pass];  // get the offset
  836.             register unsigned int incr1, initial_val, final_val;
  837.             pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
  838.             sptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
  839.                pixel_bytes;
  840.             dp = row + offset_table[png_ptr->pass]*pixel_bytes;
  841.             initial_val = offset_table[png_ptr->pass]*pixel_bytes;
  842.             final_val = png_ptr->width*pixel_bytes;
  843.             incr1 = (disp)*pixel_bytes;
  844.             for (i = initial_val; i < final_val; i += incr1)
  845.             {
  846.                png_memcpy(dp, sptr, pixel_bytes);
  847.                sptr += incr1;
  848.                dp += incr1;
  849.             }
  850.             break;
  851.          }
  852.       } /* end switch (png_ptr->row_info.pixel_depth) */
  853.    } /* end if (non-trivial mask) */
  854. } /* end png_combine_row() */
  855. #if defined(PNG_READ_INTERLACING_SUPPORTED)
  856. void /* PRIVATE */
  857. png_do_read_interlace(png_structp png_ptr)
  858. {
  859.    png_row_infop row_info = &(png_ptr->row_info);
  860.    png_bytep row = png_ptr->row_buf + 1;
  861.    int pass = png_ptr->pass;
  862.    png_uint_32 transformations = png_ptr->transformations;
  863. #ifdef PNG_USE_LOCAL_ARRAYS
  864.    const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
  865. #endif
  866.    png_debug(1,"in png_do_read_interlacen");
  867.    if (mmx_supported == 2) {
  868. #if !defined(PNG_1_0_X)
  869.        /* this should have happened in png_init_mmx_flags() already */
  870.        png_warning(png_ptr, "asm_flags may not have been initialized");
  871. #endif
  872.        png_mmx_support();
  873.    }
  874.    if (row != NULL && row_info != NULL)
  875.    {
  876.       png_uint_32 final_width;
  877.       final_width = row_info->width * png_pass_inc[pass];
  878.       switch (row_info->pixel_depth)
  879.       {
  880.          case 1:
  881.          {
  882.             png_bytep sp, dp;
  883.             int sshift, dshift;
  884.             int s_start, s_end, s_inc;
  885.             png_byte v;
  886.             png_uint_32 i;
  887.             int j;
  888.             sp = row + (png_size_t)((row_info->width - 1) >> 3);
  889.             dp = row + (png_size_t)((final_width - 1) >> 3);
  890. #if defined(PNG_READ_PACKSWAP_SUPPORTED)
  891.             if (transformations & PNG_PACKSWAP)
  892.             {
  893.                sshift = (int)((row_info->width + 7) & 7);
  894.                dshift = (int)((final_width + 7) & 7);
  895.                s_start = 7;
  896.                s_end = 0;
  897.                s_inc = -1;
  898.             }
  899.             else
  900. #endif
  901.             {
  902.                sshift = 7 - (int)((row_info->width + 7) & 7);
  903.                dshift = 7 - (int)((final_width + 7) & 7);
  904.                s_start = 0;
  905.                s_end = 7;
  906.                s_inc = 1;
  907.             }
  908.             for (i = row_info->width; i; i--)
  909.             {
  910.                v = (png_byte)((*sp >> sshift) & 0x1);
  911.                for (j = 0; j < png_pass_inc[pass]; j++)
  912.                {
  913.                   *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
  914.                   *dp |= (png_byte)(v << dshift);
  915.                   if (dshift == s_end)
  916.                   {
  917.                      dshift = s_start;
  918.                      dp--;
  919.                   }
  920.                   else
  921.                      dshift += s_inc;
  922.                }
  923.                if (sshift == s_end)
  924.                {
  925.                   sshift = s_start;
  926.                   sp--;
  927.                }
  928.                else
  929.                   sshift += s_inc;
  930.             }
  931.             break;
  932.          }
  933.          case 2:
  934.          {
  935.             png_bytep sp, dp;
  936.             int sshift, dshift;
  937.             int s_start, s_end, s_inc;
  938.             png_uint_32 i;
  939.             sp = row + (png_size_t)((row_info->width - 1) >> 2);
  940.             dp = row + (png_size_t)((final_width - 1) >> 2);
  941. #if defined(PNG_READ_PACKSWAP_SUPPORTED)
  942.             if (transformations & PNG_PACKSWAP)
  943.             {
  944.                sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
  945.                dshift = (png_size_t)(((final_width + 3) & 3) << 1);
  946.                s_start = 6;
  947.                s_end = 0;
  948.                s_inc = -2;
  949.             }
  950.             else
  951. #endif
  952.             {
  953.                sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
  954.                dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
  955.                s_start = 0;
  956.                s_end = 6;
  957.                s_inc = 2;
  958.             }
  959.             for (i = row_info->width; i; i--)
  960.             {
  961.                png_byte v;
  962.                int j;
  963.                v = (png_byte)((*sp >> sshift) & 0x3);
  964.                for (j = 0; j < png_pass_inc[pass]; j++)
  965.                {
  966.                   *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
  967.                   *dp |= (png_byte)(v << dshift);
  968.                   if (dshift == s_end)
  969.                   {
  970.                      dshift = s_start;
  971.                      dp--;
  972.                   }
  973.                   else
  974.                      dshift += s_inc;
  975.                }
  976.                if (sshift == s_end)
  977.                {
  978.                   sshift = s_start;
  979.                   sp--;
  980.                }
  981.                else
  982.                   sshift += s_inc;
  983.             }
  984.             break;
  985.          }
  986.          case 4:
  987.          {
  988.             png_bytep sp, dp;
  989.             int sshift, dshift;
  990.             int s_start, s_end, s_inc;
  991.             png_uint_32 i;
  992.             sp = row + (png_size_t)((row_info->width - 1) >> 1);
  993.             dp = row + (png_size_t)((final_width - 1) >> 1);
  994. #if defined(PNG_READ_PACKSWAP_SUPPORTED)
  995.             if (transformations & PNG_PACKSWAP)
  996.             {
  997.                sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
  998.                dshift = (png_size_t)(((final_width + 1) & 1) << 2);
  999.                s_start = 4;
  1000.                s_end = 0;
  1001.                s_inc = -4;
  1002.             }
  1003.             else
  1004. #endif
  1005.             {
  1006.                sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
  1007.                dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
  1008.                s_start = 0;
  1009.                s_end = 4;
  1010.                s_inc = 4;
  1011.             }
  1012.             for (i = row_info->width; i; i--)
  1013.             {
  1014.                png_byte v;
  1015.                int j;
  1016.                v = (png_byte)((*sp >> sshift) & 0xf);
  1017.                for (j = 0; j < png_pass_inc[pass]; j++)
  1018.                {
  1019.                   *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
  1020.                   *dp |= (png_byte)(v << dshift);
  1021.                   if (dshift == s_end)
  1022.                   {
  1023.                      dshift = s_start;
  1024.                      dp--;
  1025.                   }
  1026.                   else
  1027.                      dshift += s_inc;
  1028.                }
  1029.                if (sshift == s_end)
  1030.                {
  1031.                   sshift = s_start;
  1032.                   sp--;
  1033.                }
  1034.                else
  1035.                   sshift += s_inc;
  1036.             }
  1037.             break;
  1038.          }
  1039.          default:         // This is the place where the routine is modified
  1040.          {
  1041.             __int64 const4 = 0x0000000000FFFFFF;
  1042.             // __int64 const5 = 0x000000FFFFFF0000;  // unused...
  1043.             __int64 const6 = 0x00000000000000FF;
  1044.             png_bytep sptr, dp;
  1045.             png_uint_32 i;
  1046.             png_size_t pixel_bytes;
  1047.             int width = row_info->width;
  1048.             pixel_bytes = (row_info->pixel_depth >> 3);
  1049.             sptr = row + (width - 1) * pixel_bytes;
  1050.             dp = row + (final_width - 1) * pixel_bytes;
  1051.             // New code by Nirav Chhatrapati - Intel Corporation
  1052.             // sign fix by GRR
  1053.             // NOTE:  there is NO MMX code for 48-bit and 64-bit images
  1054.             // use MMX routine if machine supports it
  1055. #if !defined(PNG_1_0_X)
  1056.             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
  1057.                 /* && mmx_supported */ )
  1058. #else
  1059.             if (mmx_supported)
  1060. #endif
  1061.             {
  1062.                if (pixel_bytes == 3)
  1063.                {
  1064.                   if (((pass == 0) || (pass == 1)) && width)
  1065.                   {
  1066.                      _asm
  1067.                      {
  1068.                         mov esi, sptr
  1069.                         mov edi, dp
  1070.                         mov ecx, width
  1071.                         sub edi, 21   // (png_pass_inc[pass] - 1)*pixel_bytes
  1072. loop_pass0:
  1073.                         movd mm0, [esi]     ; X X X X X v2 v1 v0
  1074.                         pand mm0, const4    ; 0 0 0 0 0 v2 v1 v0
  1075.                         movq mm1, mm0       ; 0 0 0 0 0 v2 v1 v0
  1076.                         psllq mm0, 16       ; 0 0 0 v2 v1 v0 0 0
  1077.                         movq mm2, mm0       ; 0 0 0 v2 v1 v0 0 0
  1078.                         psllq mm0, 24       ; v2 v1 v0 0 0 0 0 0
  1079.                         psrlq mm1, 8        ; 0 0 0 0 0 0 v2 v1
  1080.                         por mm0, mm2        ; v2 v1 v0 v2 v1 v0 0 0
  1081.                         por mm0, mm1        ; v2 v1 v0 v2 v1 v0 v2 v1
  1082.                         movq mm3, mm0       ; v2 v1 v0 v2 v1 v0 v2 v1
  1083.                         psllq mm0, 16       ; v0 v2 v1 v0 v2 v1 0 0
  1084.                         movq mm4, mm3       ; v2 v1 v0 v2 v1 v0 v2 v1
  1085.                         punpckhdq mm3, mm0  ; v0 v2 v1 v0 v2 v1 v0 v2
  1086.                         movq [edi+16] , mm4
  1087.                         psrlq mm0, 32       ; 0 0 0 0 v0 v2 v1 v0
  1088.                         movq [edi+8] , mm3
  1089.                         punpckldq mm0, mm4  ; v1 v0 v2 v1 v0 v2 v1 v0
  1090.                         sub esi, 3
  1091.                         movq [edi], mm0
  1092.                         sub edi, 24
  1093.                         //sub esi, 3
  1094.                         dec ecx
  1095.                         jnz loop_pass0
  1096.                         EMMS
  1097.                      }
  1098.                   }
  1099.                   else if (((pass == 2) || (pass == 3)) && width)
  1100.                   {
  1101.                      _asm
  1102.                      {
  1103.                         mov esi, sptr
  1104.                         mov edi, dp
  1105.                         mov ecx, width
  1106.                         sub edi, 9   // (png_pass_inc[pass] - 1)*pixel_bytes
  1107. loop_pass2:
  1108.                         movd mm0, [esi]     ; X X X X X v2 v1 v0
  1109.                         pand mm0, const4    ; 0 0 0 0 0 v2 v1 v0
  1110.                         movq mm1, mm0       ; 0 0 0 0 0 v2 v1 v0
  1111.                         psllq mm0, 16       ; 0 0 0 v2 v1 v0 0 0
  1112.                         movq mm2, mm0       ; 0 0 0 v2 v1 v0 0 0
  1113.                         psllq mm0, 24       ; v2 v1 v0 0 0 0 0 0
  1114.                         psrlq mm1, 8        ; 0 0 0 0 0 0 v2 v1
  1115.                         por mm0, mm2        ; v2 v1 v0 v2 v1 v0 0 0
  1116.                         por mm0, mm1        ; v2 v1 v0 v2 v1 v0 v2 v1
  1117.                         movq [edi+4], mm0   ; move to memory
  1118.                         psrlq mm0, 16       ; 0 0 v2 v1 v0 v2 v1 v0
  1119.                         movd [edi], mm0     ; move to memory
  1120.                         sub esi, 3
  1121.                         sub edi, 12
  1122.                         dec ecx
  1123.                         jnz loop_pass2
  1124.                         EMMS
  1125.                      }
  1126.                   }
  1127.                   else if (width) /* && ((pass == 4) || (pass == 5)) */
  1128.                   {
  1129.                      int width_mmx = ((width >> 1) << 1) - 8;
  1130.                      if (width_mmx < 0)
  1131.                          width_mmx = 0;
  1132.                      width -= width_mmx;        // 8 or 9 pix, 24 or 27 bytes
  1133.                      if (width_mmx)
  1134.                      {
  1135.                         _asm
  1136.                         {
  1137.                            mov esi, sptr
  1138.                            mov edi, dp
  1139.                            mov ecx, width_mmx
  1140.                            sub esi, 3
  1141.                            sub edi, 9
  1142. loop_pass4:
  1143.                            movq mm0, [esi]     ; X X v2 v1 v0 v5 v4 v3
  1144.                            movq mm7, mm0       ; X X v2 v1 v0 v5 v4 v3
  1145.                            movq mm6, mm0       ; X X v2 v1 v0 v5 v4 v3
  1146.                            psllq mm0, 24       ; v1 v0 v5 v4 v3 0 0 0
  1147.                            pand mm7, const4    ; 0 0 0 0 0 v5 v4 v3
  1148.                            psrlq mm6, 24       ; 0 0 0 X X v2 v1 v0
  1149.                            por mm0, mm7        ; v1 v0 v5 v4 v3 v5 v4 v3
  1150.                            movq mm5, mm6       ; 0 0 0 X X v2 v1 v0
  1151.                            psllq mm6, 8        ; 0 0 X X v2 v1 v0 0
  1152.                            movq [edi], mm0     ; move quad to memory
  1153.                            psrlq mm5, 16       ; 0 0 0 0 0 X X v2
  1154.                            pand mm5, const6    ; 0 0 0 0 0 0 0 v2
  1155.                            por mm6, mm5        ; 0 0 X X v2 v1 v0 v2
  1156.                            movd [edi+8], mm6   ; move double to memory
  1157.                            sub esi, 6
  1158.                            sub edi, 12
  1159.                            sub ecx, 2
  1160.                            jnz loop_pass4
  1161.                            EMMS
  1162.                         }
  1163.                      }
  1164.                      sptr -= width_mmx*3;
  1165.                      dp -= width_mmx*6;
  1166.                      for (i = width; i; i--)
  1167.                      {
  1168.                         png_byte v[8];
  1169.                         int j;
  1170.                         png_memcpy(v, sptr, 3);
  1171.                         for (j = 0; j < png_pass_inc[pass]; j++)
  1172.                         {
  1173.                            png_memcpy(dp, v, 3);
  1174.                            dp -= 3;
  1175.                         }
  1176.                         sptr -= 3;
  1177.                      }
  1178.                   }
  1179.                } /* end of pixel_bytes == 3 */
  1180.                else if (pixel_bytes == 1)
  1181.                {
  1182.                   if (((pass == 0) || (pass == 1)) && width)
  1183.                   {
  1184.                      int width_mmx = ((width >> 2) << 2);
  1185.                      width -= width_mmx;
  1186.                      if (width_mmx)
  1187.                      {
  1188.                         _asm
  1189.                         {
  1190.                            mov esi, sptr
  1191.                            mov edi, dp
  1192.                            mov ecx, width_mmx
  1193.                            sub edi, 31
  1194.                            sub esi, 3
  1195. loop1_pass0:
  1196.                            movd mm0, [esi]     ; X X X X v0 v1 v2 v3
  1197.                            movq mm1, mm0       ; X X X X v0 v1 v2 v3
  1198.                            punpcklbw mm0, mm0  ; v0 v0 v1 v1 v2 v2 v3 v3
  1199.                            movq mm2, mm0       ; v0 v0 v1 v1 v2 v2 v3 v3
  1200.                            punpcklwd mm0, mm0  ; v2 v2 v2 v2 v3 v3 v3 v3
  1201.                            movq mm3, mm0       ; v2 v2 v2 v2 v3 v3 v3 v3
  1202.                            punpckldq mm0, mm0  ; v3 v3 v3 v3 v3 v3 v3 v3
  1203.                            punpckhdq mm3, mm3  ; v2 v2 v2 v2 v2 v2 v2 v2
  1204.                            movq [edi], mm0     ; move to memory v3
  1205.                            punpckhwd mm2, mm2  ; v0 v0 v0 v0 v1 v1 v1 v1
  1206.                            movq [edi+8], mm3   ; move to memory v2
  1207.                            movq mm4, mm2       ; v0 v0 v0 v0 v1 v1 v1 v1
  1208.                            punpckldq mm2, mm2  ; v1 v1 v1 v1 v1 v1 v1 v1
  1209.                            punpckhdq mm4, mm4  ; v0 v0 v0 v0 v0 v0 v0 v0
  1210.                            movq [edi+16], mm2  ; move to memory v1
  1211.                            movq [edi+24], mm4  ; move to memory v0
  1212.                            sub esi, 4
  1213.                            sub edi, 32
  1214.                            sub ecx, 4
  1215.                            jnz loop1_pass0
  1216.                            EMMS
  1217.                         }
  1218.                      }
  1219.                      sptr -= width_mmx;
  1220.                      dp -= width_mmx*8;
  1221.                      for (i = width; i; i--)
  1222.                      {
  1223.                         int j;
  1224.                        /* I simplified this part in version 1.0.4e
  1225.                         * here and in several other instances where
  1226.                         * pixel_bytes == 1  -- GR-P
  1227.                         *
  1228.                         * Original code:
  1229.                         *
  1230.                         * png_byte v[8];
  1231.                         * png_memcpy(v, sptr, pixel_bytes);
  1232.                         * for (j = 0; j < png_pass_inc[pass]; j++)
  1233.                         * {
  1234.                         *    png_memcpy(dp, v, pixel_bytes);
  1235.                         *    dp -= pixel_bytes;
  1236.                         * }
  1237.                         * sptr -= pixel_bytes;
  1238.                         *
  1239.                         * Replacement code is in the next three lines:
  1240.                         */
  1241.                         for (j = 0; j < png_pass_inc[pass]; j++)
  1242.                            *dp-- = *sptr;
  1243.                         sptr--;
  1244.                      }
  1245.                   }
  1246.                   else if (((pass == 2) || (pass == 3)) && width)
  1247.                   {
  1248.                      int width_mmx = ((width >> 2) << 2);
  1249.                      width -= width_mmx;
  1250.                      if (width_mmx)
  1251.                      {
  1252.                         _asm
  1253.                         {
  1254.                            mov esi, sptr
  1255.                            mov edi, dp
  1256.                            mov ecx, width_mmx
  1257.                            sub edi, 15
  1258.                            sub esi, 3
  1259. loop1_pass2:
  1260.                            movd mm0, [esi]     ; X X X X v0 v1 v2 v3
  1261.                            punpcklbw mm0, mm0  ; v0 v0 v1 v1 v2 v2 v3 v3
  1262.                            movq mm1, mm0       ; v0 v0 v1 v1 v2 v2 v3 v3
  1263.                            punpcklwd mm0, mm0  ; v2 v2 v2 v2 v3 v3 v3 v3
  1264.                            punpckhwd mm1, mm1  ; v0 v0 v0 v0 v1 v1 v1 v1
  1265.                            movq [edi], mm0     ; move to memory v2 and v3
  1266.                            sub esi, 4
  1267.                            movq [edi+8], mm1   ; move to memory v1     and v0
  1268.                            sub edi, 16
  1269.                            sub ecx, 4
  1270.                            jnz loop1_pass2
  1271.                            EMMS
  1272.                         }
  1273.                      }
  1274.                      sptr -= width_mmx;
  1275.                      dp -= width_mmx*4;
  1276.                      for (i = width; i; i--)
  1277.                      {
  1278.                         int j;
  1279.                         for (j = 0; j < png_pass_inc[pass]; j++)
  1280.                         {
  1281.                            *dp-- = *sptr;
  1282.                         }
  1283.                         sptr --;
  1284.                      }
  1285.                   }
  1286.                   else if (width) /* && ((pass == 4) || (pass == 5))) */
  1287.                   {
  1288.                      int width_mmx = ((width >> 3) << 3);
  1289.                      width -= width_mmx;
  1290.                      if (width_mmx)
  1291.                      {
  1292.                         _asm
  1293.                         {
  1294.                            mov esi, sptr
  1295.                            mov edi, dp
  1296.                            mov ecx, width_mmx
  1297.                            sub edi, 15
  1298.                            sub esi, 7
  1299. loop1_pass4:
  1300.                            movq mm0, [esi]     ; v0 v1 v2 v3 v4 v5 v6 v7
  1301.                            movq mm1, mm0       ; v0 v1 v2 v3 v4 v5 v6 v7
  1302.                            punpcklbw mm0, mm0  ; v4 v4 v5 v5 v6 v6 v7 v7
  1303.                            //movq mm1, mm0     ; v0 v0 v1 v1 v2 v2 v3 v3
  1304.                            punpckhbw mm1, mm1  ;v0 v0 v1 v1 v2 v2 v3 v3
  1305.                            movq [edi+8], mm1   ; move to memory v0 v1 v2 and v3
  1306.                            sub esi, 8
  1307.                            movq [edi], mm0     ; move to memory v4 v5 v6 and v7
  1308.                            //sub esi, 4
  1309.                            sub edi, 16
  1310.                            sub ecx, 8
  1311.                            jnz loop1_pass4
  1312.                            EMMS
  1313.                         }
  1314.                      }
  1315.                      sptr -= width_mmx;
  1316.                      dp -= width_mmx*2;
  1317.                      for (i = width; i; i--)
  1318.                      {
  1319.                         int j;
  1320.                         for (j = 0; j < png_pass_inc[pass]; j++)
  1321.                         {
  1322.                            *dp-- = *sptr;
  1323.                         }
  1324.                         sptr --;
  1325.                      }
  1326.                   }
  1327.                } /* end of pixel_bytes == 1 */
  1328.                else if (pixel_bytes == 2)
  1329.                {
  1330.                   if (((pass == 0) || (pass == 1)) && width)
  1331.                   {
  1332.                      int width_mmx = ((width >> 1) << 1);
  1333.                      width -= width_mmx;
  1334.                      if (width_mmx)
  1335.                      {
  1336.                         _asm
  1337.                         {
  1338.                            mov esi, sptr
  1339.                            mov edi, dp
  1340.                            mov ecx, width_mmx
  1341.                            sub esi, 2
  1342.                            sub edi, 30
  1343. loop2_pass0:
  1344.                            movd mm0, [esi]        ; X X X X v1 v0 v3 v2
  1345.                            punpcklwd mm0, mm0     ; v1 v0 v1 v0 v3 v2 v3 v2
  1346.                            movq mm1, mm0          ; v1 v0 v1 v0 v3 v2 v3 v2
  1347.                            punpckldq mm0, mm0     ; v3 v2 v3 v2 v3 v2 v3 v2
  1348.                            punpckhdq mm1, mm1     ; v1 v0 v1 v0 v1 v0 v1 v0
  1349.                            movq [edi], mm0
  1350.                            movq [edi + 8], mm0
  1351.                            movq [edi + 16], mm1
  1352.                            movq [edi + 24], mm1
  1353.                            sub esi, 4
  1354.                            sub edi, 32
  1355.                            sub ecx, 2
  1356.                            jnz loop2_pass0
  1357.                            EMMS
  1358.                         }
  1359.                      }
  1360.                      sptr -= (width_mmx*2 - 2);            // sign fixed
  1361.                      dp -= (width_mmx*16 - 2);            // sign fixed
  1362.                      for (i = width; i; i--)
  1363.                      {
  1364.                         png_byte v[8];
  1365.                         int j;
  1366.                         sptr -= 2;
  1367.                         png_memcpy(v, sptr, 2);
  1368.                         for (j = 0; j < png_pass_inc[pass]; j++)
  1369.                         {
  1370.                            dp -= 2;
  1371.                            png_memcpy(dp, v, 2);
  1372.                         }
  1373.                      }
  1374.                   }
  1375.                   else if (((pass == 2) || (pass == 3)) && width)
  1376.                   {
  1377.                      int width_mmx = ((width >> 1) << 1) ;
  1378.                      width -= width_mmx;
  1379.                      if (width_mmx)
  1380.                      {
  1381.                         _asm
  1382.                         {
  1383.                            mov esi, sptr
  1384.                            mov edi, dp
  1385.                            mov ecx, width_mmx
  1386.                            sub esi, 2
  1387.                            sub edi, 14
  1388. loop2_pass2:
  1389.                            movd mm0, [esi]        ; X X X X v1 v0 v3 v2
  1390.                            punpcklwd mm0, mm0     ; v1 v0 v1 v0 v3 v2 v3 v2
  1391.                            movq mm1, mm0          ; v1 v0 v1 v0 v3 v2 v3 v2
  1392.                            punpckldq mm0, mm0     ; v3 v2 v3 v2 v3 v2 v3 v2
  1393.                            punpckhdq mm1, mm1     ; v1 v0 v1 v0 v1 v0 v1 v0
  1394.                            movq [edi], mm0
  1395.                            sub esi, 4
  1396.                            movq [edi + 8], mm1
  1397.                            //sub esi, 4
  1398.                            sub edi, 16
  1399.                            sub ecx, 2
  1400.                            jnz loop2_pass2
  1401.                            EMMS
  1402.                         }
  1403.                      }
  1404.                      sptr -= (width_mmx*2 - 2);            // sign fixed
  1405.                      dp -= (width_mmx*8 - 2);            // sign fixed
  1406.                      for (i = width; i; i--)
  1407.                      {
  1408.                         png_byte v[8];
  1409.                         int j;
  1410.                         sptr -= 2;
  1411.                         png_memcpy(v, sptr, 2);
  1412.                         for (j = 0; j < png_pass_inc[pass]; j++)
  1413.                         {
  1414.                            dp -= 2;
  1415.                            png_memcpy(dp, v, 2);
  1416.                         }
  1417.                      }
  1418.                   }
  1419.                   else if (width)  // pass == 4 or 5
  1420.                   {
  1421.                      int width_mmx = ((width >> 1) << 1) ;
  1422.                      width -= width_mmx;
  1423.                      if (width_mmx)
  1424.                      {
  1425.                         _asm
  1426.                         {
  1427.                            mov esi, sptr
  1428.                            mov edi, dp
  1429.                            mov ecx, width_mmx
  1430.                            sub esi, 2
  1431.                            sub edi, 6
  1432. loop2_pass4:
  1433.                            movd mm0, [esi]        ; X X X X v1 v0 v3 v2
  1434.                            punpcklwd mm0, mm0     ; v1 v0 v1 v0 v3 v2 v3 v2
  1435.                            sub esi, 4
  1436.                            movq [edi], mm0
  1437.                            sub edi, 8
  1438.                            sub ecx, 2
  1439.                            jnz loop2_pass4
  1440.                            EMMS
  1441.                         }
  1442.                      }
  1443.                      sptr -= (width_mmx*2 - 2);            // sign fixed
  1444.                      dp -= (width_mmx*4 - 2);            // sign fixed
  1445.                      for (i = width; i; i--)
  1446.                      {
  1447.                         png_byte v[8];
  1448.                         int j;
  1449.                         sptr -= 2;
  1450.                         png_memcpy(v, sptr, 2);
  1451.                         for (j = 0; j < png_pass_inc[pass]; j++)
  1452.                         {
  1453.                            dp -= 2;
  1454.                            png_memcpy(dp, v, 2);
  1455.                         }
  1456.                      }
  1457.                   }
  1458.                } /* end of pixel_bytes == 2 */
  1459.                else if (pixel_bytes == 4)
  1460.                {
  1461.                   if (((pass == 0) || (pass == 1)) && width)
  1462.                   {
  1463.                      int width_mmx = ((width >> 1) << 1) ;
  1464.                      width -= width_mmx;
  1465.                      if (width_mmx)
  1466.                      {
  1467.                         _asm
  1468.                         {
  1469.                            mov esi, sptr
  1470.                            mov edi, dp
  1471.                            mov ecx, width_mmx
  1472.                            sub esi, 4
  1473.                            sub edi, 60
  1474. loop4_pass0:
  1475.                            movq mm0, [esi]        ; v3 v2 v1 v0 v7 v6 v5 v4
  1476.                            movq mm1, mm0          ; v3 v2 v1 v0 v7 v6 v5 v4
  1477.                            punpckldq mm0, mm0     ; v7 v6 v5 v4 v7 v6 v5 v4
  1478.                            punpckhdq mm1, mm1     ; v3 v2 v1 v0 v3 v2 v1 v0
  1479.                            movq [edi], mm0
  1480.                            movq [edi + 8], mm0
  1481.                            movq [edi + 16], mm0
  1482.                            movq [edi + 24], mm0
  1483.                            movq [edi+32], mm1
  1484.                            movq [edi + 40], mm1
  1485.                            movq [edi+ 48], mm1
  1486.                            sub esi, 8
  1487.                            movq [edi + 56], mm1
  1488.                            sub edi, 64
  1489.                            sub ecx, 2
  1490.                            jnz loop4_pass0
  1491.                            EMMS
  1492.                         }
  1493.                      }
  1494.                      sptr -= (width_mmx*4 - 4);            // sign fixed
  1495.                      dp -= (width_mmx*32 - 4);            // sign fixed
  1496.                      for (i = width; i; i--)
  1497.                      {
  1498.                         png_byte v[8];
  1499.                         int j;
  1500.                         sptr -= 4;
  1501.                         png_memcpy(v, sptr, 4);
  1502.                         for (j = 0; j < png_pass_inc[pass]; j++)
  1503.                         {
  1504.                            dp -= 4;
  1505.                            png_memcpy(dp, v, 4);
  1506.                         }
  1507.                      }
  1508.                   }
  1509.                   else if (((pass == 2) || (pass == 3)) && width)
  1510.                   {
  1511.                      int width_mmx = ((width >> 1) << 1) ;
  1512.                      width -= width_mmx;
  1513.                      if (width_mmx)
  1514.                      {
  1515.                         _asm
  1516.                         {
  1517.                            mov esi, sptr
  1518.                            mov edi, dp
  1519.                            mov ecx, width_mmx
  1520.                            sub esi, 4
  1521.                            sub edi, 28
  1522. loop4_pass2:
  1523.                            movq mm0, [esi]      ; v3 v2 v1 v0 v7 v6 v5 v4
  1524.                            movq mm1, mm0        ; v3 v2 v1 v0 v7 v6 v5 v4
  1525.                            punpckldq mm0, mm0   ; v7 v6 v5 v4 v7 v6 v5 v4
  1526.                            punpckhdq mm1, mm1   ; v3 v2 v1 v0 v3 v2 v1 v0
  1527.                            movq [edi], mm0
  1528.                            movq [edi + 8], mm0
  1529.                            movq [edi+16], mm1
  1530.                            movq [edi + 24], mm1
  1531.                            sub esi, 8
  1532.                            sub edi, 32
  1533.                            sub ecx, 2
  1534.                            jnz loop4_pass2
  1535.                            EMMS
  1536.                         }
  1537.                      }
  1538.                      sptr -= (width_mmx*4 - 4);            // sign fixed
  1539.                      dp -= (width_mmx*16 - 4);            // sign fixed
  1540.                      for (i = width; i; i--)
  1541.                      {
  1542.                         png_byte v[8];
  1543.                         int j;
  1544.                         sptr -= 4;
  1545.                         png_memcpy(v, sptr, 4);
  1546.                         for (j = 0; j < png_pass_inc[pass]; j++)
  1547.                         {
  1548.                            dp -= 4;
  1549.                            png_memcpy(dp, v, 4);
  1550.                         }
  1551.                      }
  1552.                   }
  1553.                   else if (width)  // pass == 4 or 5
  1554.                   {
  1555.                      int width_mmx = ((width >> 1) << 1) ;
  1556.                      width -= width_mmx;
  1557.                      if (width_mmx)
  1558.                      {
  1559.                         _asm
  1560.                         {
  1561.                            mov esi, sptr
  1562.                            mov edi, dp
  1563.                            mov ecx, width_mmx
  1564.                            sub esi, 4
  1565.                            sub edi, 12
  1566. loop4_pass4:
  1567.                            movq mm0, [esi]      ; v3 v2 v1 v0 v7 v6 v5 v4
  1568.                            movq mm1, mm0        ; v3 v2 v1 v0 v7 v6 v5 v4
  1569.                            punpckldq mm0, mm0   ; v7 v6 v5 v4 v7 v6 v5 v4
  1570.                            punpckhdq mm1, mm1   ; v3 v2 v1 v0 v3 v2 v1 v0
  1571.                            movq [edi], mm0
  1572.                            sub esi, 8
  1573.                            movq [edi + 8], mm1
  1574.                            sub edi, 16
  1575.                            sub ecx, 2
  1576.                            jnz loop4_pass4
  1577.                            EMMS
  1578.                         }
  1579.                      }
  1580.                      sptr -= (width_mmx*4 - 4);          // sign fixed
  1581.                      dp -= (width_mmx*8 - 4);            // sign fixed
  1582.                      for (i = width; i; i--)
  1583.                      {
  1584.                         png_byte v[8];
  1585.                         int j;
  1586.                         sptr -= 4;
  1587.                         png_memcpy(v, sptr, 4);
  1588.                         for (j = 0; j < png_pass_inc[pass]; j++)
  1589.                         {
  1590.                            dp -= 4;
  1591.                            png_memcpy(dp, v, 4);
  1592.                         }
  1593.                      }
  1594.                   }
  1595.                } /* end of pixel_bytes == 4 */
  1596.                else if (pixel_bytes == 6)
  1597.                {
  1598.                   for (i = width; i; i--)
  1599.                   {
  1600.                      png_byte v[8];
  1601.                      int j;
  1602.                      png_memcpy(v, sptr, 6);
  1603.                      for (j = 0; j < png_pass_inc[pass]; j++)
  1604.                      {
  1605.                         png_memcpy(dp, v, 6);
  1606.                         dp -= 6;
  1607.                      }
  1608.                      sptr -= 6;
  1609.                   }
  1610.                } /* end of pixel_bytes == 6 */
  1611.                else
  1612.                {
  1613.                   for (i = width; i; i--)
  1614.                   {
  1615.                      png_byte v[8];
  1616.                      int j;
  1617.                      png_memcpy(v, sptr, pixel_bytes);
  1618.                      for (j = 0; j < png_pass_inc[pass]; j++)
  1619.                      {
  1620.                         png_memcpy(dp, v, pixel_bytes);
  1621.                         dp -= pixel_bytes;
  1622.                      }
  1623.                      sptr-= pixel_bytes;
  1624.                   }
  1625.                }
  1626.             } /* end of mmx_supported */
  1627.             else /* MMX not supported:  use modified C code - takes advantage
  1628.                   * of inlining of memcpy for a constant */
  1629.             {
  1630.                if (pixel_bytes == 1)
  1631.                {
  1632.                   for (i = width; i; i--)
  1633.                   {
  1634.                      int j;
  1635.                      for (j = 0; j < png_pass_inc[pass]; j++)
  1636.                         *dp-- = *sptr;
  1637.                      sptr--;
  1638.                   }
  1639.                }
  1640.                else if (pixel_bytes == 3)
  1641.                {
  1642.                   for (i = width; i; i--)
  1643.                   {
  1644.                      png_byte v[8];
  1645.                      int j;
  1646.                      png_memcpy(v, sptr, pixel_bytes);
  1647.                      for (j = 0; j < png_pass_inc[pass]; j++)
  1648.                      {
  1649.                         png_memcpy(dp, v, pixel_bytes);
  1650.                         dp -= pixel_bytes;
  1651.                      }
  1652.                      sptr -= pixel_bytes;
  1653.                   }
  1654.                }
  1655.                else if (pixel_bytes == 2)
  1656.                {
  1657.                   for (i = width; i; i--)
  1658.                   {
  1659.                      png_byte v[8];
  1660.                      int j;
  1661.                      png_memcpy(v, sptr, pixel_bytes);
  1662.                      for (j = 0; j < png_pass_inc[pass]; j++)
  1663.                      {
  1664.                         png_memcpy(dp, v, pixel_bytes);
  1665.                         dp -= pixel_bytes;
  1666.                      }
  1667.                      sptr -= pixel_bytes;
  1668.                   }
  1669.                }
  1670.                else if (pixel_bytes == 4)
  1671.                {
  1672.                   for (i = width; i; i--)
  1673.                   {
  1674.                      png_byte v[8];
  1675.                      int j;
  1676.                      png_memcpy(v, sptr, pixel_bytes);
  1677.                      for (j = 0; j < png_pass_inc[pass]; j++)
  1678.                      {
  1679.                         png_memcpy(dp, v, pixel_bytes);
  1680.                         dp -= pixel_bytes;
  1681.                      }
  1682.                      sptr -= pixel_bytes;
  1683.                   }
  1684.                }
  1685.                else if (pixel_bytes == 6)
  1686.                {
  1687.                   for (i = width; i; i--)
  1688.                   {
  1689.                      png_byte v[8];
  1690.                      int j;
  1691.                      png_memcpy(v, sptr, pixel_bytes);
  1692.                      for (j = 0; j < png_pass_inc[pass]; j++)
  1693.                      {
  1694.                         png_memcpy(dp, v, pixel_bytes);
  1695.                         dp -= pixel_bytes;
  1696.                      }
  1697.                      sptr -= pixel_bytes;
  1698.                   }
  1699.                }
  1700.                else
  1701.                {
  1702.                   for (i = width; i; i--)
  1703.                   {
  1704.                      png_byte v[8];
  1705.                      int j;
  1706.                      png_memcpy(v, sptr, pixel_bytes);
  1707.                      for (j = 0; j < png_pass_inc[pass]; j++)
  1708.                      {
  1709.                         png_memcpy(dp, v, pixel_bytes);
  1710.                         dp -= pixel_bytes;
  1711.                      }
  1712.                      sptr -= pixel_bytes;
  1713.                   }
  1714.                }
  1715.             } /* end of MMX not supported */
  1716.             break;
  1717.          }
  1718.       } /* end switch (row_info->pixel_depth) */
  1719.       row_info->width = final_width;
  1720.       row_info->rowbytes = PNG_ROWBYTES(row_info->pixel_depth,final_width);
  1721.    }
  1722. }
  1723. #endif /* PNG_READ_INTERLACING_SUPPORTED */
  1724. // 64-bit masks and scratch values utilized in the functions below.  They are
  1725. // declared globally here to ensure alignment on 8-byte boundaries (NOTE(review): presumably via the double member - confirm).
  1726. union uAll {
  1727.    __int64 use;    // 64-bit integer view; first member, so the brace initializers below set it
  1728.    double  align;  // never given a value in this declaration; see the alignment note above
  1729. } LBCarryMask = {0x0101010101010101},  // 0x01 in each of the 8 bytes (name suggests a low-bit carry mask - confirm at use site)
  1730.   HBClearMask = {0x7f7f7f7f7f7f7f7f},  // 0x7f in each of the 8 bytes (name suggests a high-bit clear mask - confirm at use site)
  1731.   ActiveMask, ActiveMask2, ActiveMaskEnd, ShiftBpp, ShiftRem;  // no initializer: file-scope objects start out zeroed; presumably filled in by the filter routines - verify
  1732. // Optimized code for PNG Average filter decoder
  1733. void /* PRIVATE */
  1734. png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row
  1735.                             , png_bytep prev_row)
  1736. {
  1737.    int bpp;
  1738.    png_uint_32 FullLength;
  1739.    png_uint_32 MMXLength;
  1740.    //png_uint_32 len;
  1741.    int diff;
  1742.    bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
  1743.    FullLength  = row_info->rowbytes; // # of bytes to filter
  1744.    _asm {
  1745.          // Init address pointers and offset
  1746.          mov edi, row          // edi ==> Avg(x)
  1747.          xor ebx, ebx          // ebx ==> x
  1748.          mov edx, edi
  1749.          mov esi, prev_row           // esi ==> Prior(x)
  1750.          sub edx, bpp          // edx ==> Raw(x-bpp)
  1751.          xor eax, eax
  1752.          // Compute the Raw value for the first bpp bytes
  1753.          //    Raw(x) = Avg(x) + (Prior(x)/2)
  1754. davgrlp:
  1755.          mov al, [esi + ebx]   // Load al with Prior(x)
  1756.          inc ebx