pnggccrd.c
上传用户:looem2003
上传日期:2014-07-20
资源大小:13733k
文件大小:230k
源码类别:

打印编程

开发平台:

Visual C++

  1. /* pnggccrd.c - mixed C/assembler version of utilities to read a PNG file
  2.  *
  3.  * For Intel x86 CPU (Pentium-MMX or later) and GNU C compiler.
  4.  *
  5.  *     See http://www.intel.com/drg/pentiumII/appnotes/916/916.htm
  6.  *     and http://www.intel.com/drg/pentiumII/appnotes/923/923.htm
  7.  *     for Intel's performance analysis of the MMX vs. non-MMX code.
  8.  *
  9.  * Last changed in libpng 1.2.15 December 31, 2006
  10.  * For conditions of distribution and use, see copyright notice in png.h
  11.  * Copyright (c) 1998-2006 Glenn Randers-Pehrson
  12.  * Copyright (c) 1998, Intel Corporation
  13.  *
  14.  * Based on MSVC code contributed by Nirav Chhatrapati, Intel Corp., 1998.
  15.  * Interface to libpng contributed by Gilles Vollant, 1999.
  16.  * GNU C port by Greg Roelofs, 1999-2001.
  17.  *
  18.  * Lines 2350-4300 converted in place with intel2gas 1.3.1:
  19.  *
  20.  *   intel2gas -mdI pnggccrd.c.partially-msvc -o pnggccrd.c
  21.  *
  22.  * and then cleaned up by hand.  See http://hermes.terminal.at/intel2gas/ .
  23.  *
  24.  * NOTE:  A sufficiently recent version of GNU as (or as.exe under DOS/Windows)
  25.  *        is required to assemble the newer MMX instructions such as movq.
  26.  *        For djgpp, see
  27.  *
  28.  *           ftp://ftp.simtel.net/pub/simtelnet/gnu/djgpp/v2gnu/bnu281b.zip
  29.  *
  30.  *        (or a later version in the same directory).  For Linux, check your
  31.  *        distribution's web site(s) or try these links:
  32.  *
  33.  *           http://rufus.w3.org/linux/RPM/binutils.html
  34.  *           http://www.debian.org/Packages/stable/devel/binutils.html
  35.  *           ftp://ftp.slackware.com/pub/linux/slackware/slackware/slakware/d1/
  36.  *             binutils.tgz
  37.  *
  38.  *        For other platforms, see the main GNU site:
  39.  *
  40.  *           ftp://ftp.gnu.org/pub/gnu/binutils/
  41.  *
  42.  *        Version 2.5.2l.15 is definitely too old...
  43.  */
  44. /*
  45.  * TEMPORARY PORTING NOTES AND CHANGELOG (mostly by Greg Roelofs)
  46.  * =====================================
  47.  *
  48.  * 19991006:
  49.  *  - fixed sign error in post-MMX cleanup code (16- & 32-bit cases)
  50.  *
  51.  * 19991007:
  52.  *  - additional optimizations (possible or definite):
  53.  *     x [DONE] write MMX code for 64-bit case (pixel_bytes == 8) [not tested]
  54.  *     - write MMX code for 48-bit case (pixel_bytes == 6)
  55.  *     - figure out what's up with 24-bit case (pixel_bytes == 3):
  56.  *        why subtract 8 from width_mmx in the pass 4/5 case?
  57.  *        (only width_mmx case) (near line 1606)
  58.  *     x [DONE] replace pixel_bytes within each block with the true
  59.  *        constant value (or are compilers smart enough to do that?)
  60.  *     - rewrite all MMX interlacing code so it's aligned with
  61.  *        the *beginning* of the row buffer, not the end.  This
  62.  *        would not only allow one to eliminate half of the memory
  63.  *        writes for odd passes (that is, pass == odd), it may also
  64.  *        eliminate some unaligned-data-access exceptions (assuming
  65.  *        there's a penalty for not aligning 64-bit accesses on
  66.  *        64-bit boundaries).  The only catch is that the "leftover"
  67.  *        pixel(s) at the end of the row would have to be saved,
  68.  *        but there are enough unused MMX registers in every case,
  69.  *        so this is not a problem.  A further benefit is that the
  70.  *        post-MMX cleanup code (C code) in at least some of the
  71.  *        cases could be done within the assembler block.
  72.  *  x [DONE] the "v3 v2 v1 v0 v7 v6 v5 v4" comments are confusing,
  73.  *     inconsistent, and don't match the MMX Programmer's Reference
  74.  *     Manual conventions anyway.  They should be changed to
  75.  *     "b7 b6 b5 b4 b3 b2 b1 b0," where b0 indicates the byte that
  76.  *     was lowest in memory (e.g., corresponding to a left pixel)
  77.  *     and b7 is the byte that was highest (e.g., a right pixel).
  78.  *
  79.  * 19991016:
  80.  *  - Brennan's Guide notwithstanding, gcc under Linux does *not*
  81.  *     want globals prefixed by underscores when referencing them--
  82.  *     i.e., if the variable is const4, then refer to it as const4,
  83.  *     not _const4.  This seems to be a djgpp-specific requirement.
  84.  *     Also, such variables apparently *must* be declared outside
  85.  *     of functions; neither static nor automatic variables work if
  86.  *     defined within the scope of a single function, but both
  87.  *     static and truly global (multi-module) variables work fine.
  88.  *
  89.  * 19991023:
  90.  *  - fixed png_combine_row() non-MMX replication bug (odd passes only?)
  91.  *  - switched from string-concatenation-with-macros to cleaner method of
  92.  *     renaming global variables for djgpp--i.e., always use prefixes in
  93.  *     inlined assembler code (== strings) and conditionally rename the
  94.  *     variables, not the other way around.  Hence _const4, _mask8_0, etc.
  95.  *
  96.  * 19991024:
  97.  *  - fixed mmxsupport()/png_do_read_interlace() first-row bug
  98.  *     This one was severely weird:  even though mmxsupport() doesn't touch
  99.  *     ebx (where "row" pointer was stored), it nevertheless managed to zero
  100.  *     the register (even in static/non-fPIC code--see below), which in turn
  101.  *     caused png_do_read_interlace() to return prematurely on the first row of
  102.  *     interlaced images (i.e., without expanding the interlaced pixels).
  103.  *     Inspection of the generated assembly code didn't turn up any clues,
  104.  *     although it did point at a minor optimization (i.e., get rid of
  105.  *     mmx_supported_local variable and just use eax).  Possibly the CPUID
  106.  *     instruction is more destructive than it looks?  (Not yet checked.)
  107.  *  - "info gcc" was next to useless, so compared fPIC and non-fPIC assembly
  108.  *     listings...  Apparently register spillage has to do with ebx, since
  109.  *     it's used to index the global offset table.  Commenting it out of the
  110.  *     input-reg lists in png_combine_row() eliminated compiler barfage, so
  111.  *     ifdef'd with __PIC__ macro:  if defined, use a global for unmask
  112.  *
  113.  * 19991107:
  114.  *  - verified CPUID clobberage:  12-char string constant ("GenuineIntel",
  115.  *     "AuthenticAMD", etc.) placed in ebx:ecx:edx.  Still need to polish.
  116.  *
  117.  * 19991120:
  118.  *  - made "diff" variable (now "_dif") global to simplify conversion of
  119.  *     filtering routines (running out of regs, sigh).  "diff" is still used
  120.  *     in interlacing routines, however.
  121.  *  - fixed up both versions of mmxsupport() (ORIG_THAT_USED_TO_CLOBBER_EBX
  122.  *     macro determines which is used); original not yet tested.
  123.  *
  124.  * 20000213:
  125.  *  - when compiling with gcc, be sure to use  -fomit-frame-pointer
  126.  *
  127.  * 20000319:
  128.  *  - fixed a register-name typo in png_do_read_interlace(), default (MMX) case,
  129.  *     pass == 4 or 5, that caused visible corruption of interlaced images
  130.  *
  131.  * 20000623:
  132.  *  - Various problems were reported with gcc 2.95.2 in the Cygwin environment,
  133.  *     many of the form "forbidden register 0 (ax) was spilled for class AREG."
  134.  *     This is explained at http://gcc.gnu.org/fom_serv/cache/23.html, and
  135.  *     Chuck Wilson supplied a patch involving dummy output registers.  See
  136.  *     http://sourceforge.net/bugs/?func=detailbug&bug_id=108741&group_id=5624
  137.  *     for the original (anonymous) SourceForge bug report.
  138.  *
  139.  * 20000706:
  140.  *  - Chuck Wilson passed along these remaining gcc 2.95.2 errors:
  141.  *       pnggccrd.c: In function `png_combine_row':
  142.  *       pnggccrd.c:525: more than 10 operands in `asm'
  143.  *       pnggccrd.c:669: more than 10 operands in `asm'
  144.  *       pnggccrd.c:828: more than 10 operands in `asm'
  145.  *       pnggccrd.c:994: more than 10 operands in `asm'
  146.  *       pnggccrd.c:1177: more than 10 operands in `asm'
  147.  *     They are all the same problem and can be worked around by using the
  148.  *     global _unmask variable unconditionally, not just in the -fPIC case.
  149.  *     Reportedly earlier versions of gcc also have the problem with more than
  150.  *     10 operands; they just don't report it.  Much strangeness ensues, etc.
  151.  *
  152.  * 20000729:
  153.  *  - enabled png_read_filter_row_mmx_up() (shortest remaining unconverted
  154.  *     MMX routine); began converting png_read_filter_row_mmx_sub()
  155.  *  - to finish remaining sections:
  156.  *     - clean up indentation and comments
  157.  *     - preload local variables
  158.  *     - add output and input regs (order of former determines numerical
  159.  *        mapping of latter)
  160.  *     - avoid all usage of ebx (including bx, bh, bl) register [20000823]
  161.  *     - remove "$" from addressing of Shift and Mask variables [20000823]
  162.  *
  163.  * 20000731:
  164.  *  - global union vars causing segfaults in png_read_filter_row_mmx_sub()?
  165.  *
  166.  * 20000822:
  167.  *  - ARGH, stupid png_read_filter_row_mmx_sub() segfault only happens with
  168.  *     shared-library (-fPIC) version!  Code works just fine as part of static
  169.  *     library.  Damn damn damn damn damn, should have tested that sooner.
  170.  *     ebx is getting clobbered again (explicitly this time); need to save it
  171.  *     on stack or rewrite asm code to avoid using it altogether.  Blargh!
  172.  *
  173.  * 20000823:
  174.  *  - first section was trickiest; all remaining sections have ebx -> edx now.
  175.  *     (-fPIC works again.)  Also added missing underscores to various Shift*
  176.  *     and *Mask* globals and got rid of leading "$" signs.
  177.  *
  178.  * 20000826:
  179.  *  - added visual separators to help navigate microscopic printed copies
  180.  *     (http://pobox.com/~newt/code/gpr-latest.zip, mode 10); started working
  181.  *     on png_read_filter_row_mmx_avg()
  182.  *
  183.  * 20000828:
  184.  *  - finished png_read_filter_row_mmx_avg():  only Paeth left! (930 lines...)
  185.  *     What the hell, did png_read_filter_row_mmx_paeth(), too.  Comments not
  186.  *     cleaned up/shortened in either routine, but functionality is complete
  187.  *     and seems to be working fine.
  188.  *
  189.  * 20000829:
  190.  *  - ahhh, figured out last(?) bit of gcc/gas asm-fu:  if register is listed
  191.  *     as an input reg (with dummy output variables, etc.), then it *cannot*
  192.  *     also appear in the clobber list or gcc 2.95.2 will barf.  The solution
  193.  *     is simple enough...
  194.  *
  195.  * 20000914:
  196.  *  - bug in png_read_filter_row_mmx_avg():  16-bit grayscale not handled
  197.  *     correctly (but 48-bit RGB just fine)
  198.  *
  199.  * 20000916:
  200.  *  - fixed bug in png_read_filter_row_mmx_avg(), bpp == 2 case; three errors:
  201.  *     - "_ShiftBpp.use = 24;"      should have been   "_ShiftBpp.use = 16;"
  202.  *     - "_ShiftRem.use = 40;"      should have been   "_ShiftRem.use = 48;"
  203.  *     - "psllq _ShiftRem, %%mm2"   should have been   "psrlq _ShiftRem, %%mm2"
  204.  *
  205.  * 20010101:
  206.  *  - added new png_init_mmx_flags() function (here only because it needs to
  207.  *     call mmxsupport(), which should probably become global png_mmxsupport());
  208.  *     modified other MMX routines to run conditionally (png_ptr->asm_flags)
  209.  *
  210.  * 20010103:
  211.  *  - renamed mmxsupport() to png_mmx_support(), with auto-set of mmx_supported,
  212.  *     and made it public; moved png_init_mmx_flags() to png.c as internal func
  213.  *
  214.  * 20010104:
  215.  *  - removed dependency on png_read_filter_row_c() (C code already duplicated
  216.  *     within MMX version of png_read_filter_row()) so no longer necessary to
  217.  *     compile it into pngrutil.o
  218.  *
  219.  * 20010310:
  220.  *  - fixed buffer-overrun bug in png_combine_row() C code (non-MMX)
  221.  *
  222.  * 20020304:
  223.  *  - eliminated incorrect use of width_mmx in pixel_bytes == 8 case
  224.  *
  225.  * 20040724:
  226.  *   - more tinkering with clobber list at lines 4529 and 5033, to get
  227.  *     it to compile on gcc-3.4.
  228.  *
  229.  * STILL TO DO:
  230.  *     - test png_do_read_interlace() 64-bit case (pixel_bytes == 8)
  231.  *     - write MMX code for 48-bit case (pixel_bytes == 6)
  232.  *     - figure out what's up with 24-bit case (pixel_bytes == 3):
  233.  *        why subtract 8 from width_mmx in the pass 4/5 case?
  234.  *        (only width_mmx case) (near line 1606)
  235.  *     - rewrite all MMX interlacing code so it's aligned with beginning
  236.  *        of the row buffer, not the end (see 19991007 for details)
  237.  *     x pick one version of mmxsupport() and get rid of the other
  238.  *     - add error messages to any remaining bogus default cases
  239.  *     - enable pixel_depth == 8 cases in png_read_filter_row()? (test speed)
  240.  *     x add support for runtime enable/disable/query of various MMX routines
  241.  */
  242. #define PNG_INTERNAL
  243. #include "png.h"
  244. #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGGCCRD)
  245. int PNGAPI png_mmx_support(void);
       /* (forward declaration: png_combine_row() below calls png_mmx_support()
        *  while _mmx_supported still holds its initial value of 2) */
  246. #ifdef PNG_USE_LOCAL_ARRAYS
       /* Per-pass lookup tables, seven entries each, indexed by png_ptr->pass.
        * The non-MMX 8-bit path of png_combine_row() reads them as follows:
        *   png_pass_start[pass]  offset of the first byte copied in the row
        *   png_pass_inc[pass]    stride between successive copies
        *   png_pass_width[pass]  number of bytes copied at each step
        * The comments in that code cite png.c as holding the same values.
        * NOTE(review): presumably the seven entries are the interlace passes -
        * confirm against png.c. */
  247. static const int FARDATA png_pass_start[7] = {0, 4, 0, 2, 0, 1, 0};
  248. static const int FARDATA png_pass_inc[7]   = {8, 8, 4, 4, 2, 2, 1};
  249. static const int FARDATA png_pass_width[7] = {8, 4, 4, 2, 2, 1, 1};
  250. #endif
  251. #if defined(PNG_MMX_CODE_SUPPORTED)
  252. /* djgpp, Win32, Cygwin, and OS2 add their own underscores to global variables,
  253.  * so define them without: */
       /* The inline assembler strings in this file always spell these symbols with
        * a leading underscore (e.g. "_unmask", "_mask8_0").  On the toolchains
        * tested below, each C-level identifier is therefore mapped to its
        * un-prefixed name, so that the underscore added by the toolchain
        * reproduces the spelling used inside the assembler strings.
        * The condition spans two physical lines; the trailing backslash on the
        * first one is required to join them into a single #if directive. */
  254. #if defined(__DJGPP__) || defined(WIN32) || defined(__CYGWIN__) || \
  255.     defined(__OS2__)
  256. #  define _mmx_supported  mmx_supported
  257. #  define _const4         const4
  258. #  define _const6         const6
  259. #  define _mask8_0        mask8_0
  260. #  define _mask16_1       mask16_1
  261. #  define _mask16_0       mask16_0
  262. #  define _mask24_2       mask24_2
  263. #  define _mask24_1       mask24_1
  264. #  define _mask24_0       mask24_0
  265. #  define _mask32_3       mask32_3
  266. #  define _mask32_2       mask32_2
  267. #  define _mask32_1       mask32_1
  268. #  define _mask32_0       mask32_0
  269. #  define _mask48_5       mask48_5
  270. #  define _mask48_4       mask48_4
  271. #  define _mask48_3       mask48_3
  272. #  define _mask48_2       mask48_2
  273. #  define _mask48_1       mask48_1
  274. #  define _mask48_0       mask48_0
  275. #  define _LBCarryMask    LBCarryMask
  276. #  define _HBClearMask    HBClearMask
  277. #  define _ActiveMask     ActiveMask
  278. #  define _ActiveMask2    ActiveMask2
  279. #  define _ActiveMaskEnd  ActiveMaskEnd
  280. #  define _ShiftBpp       ShiftBpp
  281. #  define _ShiftRem       ShiftRem
       /* The remaining names are only defined (further down) when
        * PNG_THREAD_UNSAFE_OK is set, so they are only renamed in that case. */
  282. #ifdef PNG_THREAD_UNSAFE_OK
  283. #  define _unmask         unmask
  284. #  define _FullLength     FullLength
  285. #  define _MMXLength      MMXLength
  286. #  define _dif            dif
  287. #  define _patemp         patemp
  288. #  define _pbtemp         pbtemp
  289. #  define _pctemp         pctemp
  290. #endif
  291. #endif
  292. /* These constants are used in the inlined MMX assembly code.
  293.    Ignore gcc's "At top level: defined but not used" warnings. */
  294. /* GRR 20000706:  originally _unmask was needed only when compiling with -fPIC,
  295.  *  since that case uses the %ebx register for indexing the Global Offset Table
  296.  *  and there were no other registers available.  But gcc 2.95 and later emit
  297.  *  "more than 10 operands in `asm'" errors when %ebx is used to preload unmask
  298.  *  in the non-PIC case, so we'll just use the global unconditionally now.
  299.  */
  300. #ifdef PNG_THREAD_UNSAFE_OK
       /* png_combine_row() stores ~mask here before each MMX block; the assembler
        * code then loads it by name ("movd _unmask, %%mm7"). */
  301. static int _unmask;
  302. #endif
       /* Bit-select patterns: every byte of every _mask* constant has exactly one
        * bit set.  In the 8-bit case of png_combine_row(), _mask8_0 is ANDed with
        * _unmask replicated into all eight bytes, so each byte lane tests a
        * different single bit of the combine mask.
        * NOTE(review): presumably _maskNN_k plays the same role for NN-bit pixels,
        * with k selecting successive 8-byte groups - their uses are outside this
        * part of the file, confirm there. */
  303. static unsigned long long _mask8_0  = 0x0102040810204080LL;
  304. static unsigned long long _mask16_1 = 0x0101020204040808LL;
  305. static unsigned long long _mask16_0 = 0x1010202040408080LL;
  306. static unsigned long long _mask24_2 = 0x0101010202020404LL;
  307. static unsigned long long _mask24_1 = 0x0408080810101020LL;
  308. static unsigned long long _mask24_0 = 0x2020404040808080LL;
  309. static unsigned long long _mask32_3 = 0x0101010102020202LL;
  310. static unsigned long long _mask32_2 = 0x0404040408080808LL;
  311. static unsigned long long _mask32_1 = 0x1010101020202020LL;
  312. static unsigned long long _mask32_0 = 0x4040404080808080LL;
  313. static unsigned long long _mask48_5 = 0x0101010101010202LL;
  314. static unsigned long long _mask48_4 = 0x0202020204040404LL;
  315. static unsigned long long _mask48_3 = 0x0404080808080808LL;
  316. static unsigned long long _mask48_2 = 0x1010101010102020LL;
  317. static unsigned long long _mask48_1 = 0x2020202040404040LL;
  318. static unsigned long long _mask48_0 = 0x4040808080808080LL;
       /* _const4 has the low 24 bits set, _const6 the low 8 bits; where they are
        * applied is not visible in this part of the file. */
  319. static unsigned long long _const4   = 0x0000000000FFFFFFLL;
  320. //static unsigned long long _const5 = 0x000000FFFFFF0000LL;     // NOT USED
  321. static unsigned long long _const6   = 0x00000000000000FFLL;
  322. // These are used in the row-filter routines and should/would be local
  323. //  variables if not for gcc addressing limitations.
  324. // WARNING: Their presence probably defeats the thread safety of libpng.
       // They are therefore compiled in only when PNG_THREAD_UNSAFE_OK is defined.
  325. #ifdef PNG_THREAD_UNSAFE_OK
  326. static png_uint_32  _FullLength;
  327. static png_uint_32  _MMXLength;
  328. static int          _dif;
  329. static int          _patemp; // temp variables for Paeth routine
  330. static int          _pbtemp;
  331. static int          _pctemp;
  332. #endif
  333. void /* PRIVATE */
  334. png_squelch_warnings(void)
  335. {
       /* Takes no arguments and returns nothing.  Every statement below assigns a
        * file-scope static to itself, so no stored value changes.
        * The purpose is diagnostic only: each statement is a C-level reference to
        * a variable, which goes with the existing note beside the definitions
        * about gcc's "defined but not used" warnings (at least _mask8_0 is
        * otherwise named only inside an inline-assembler string).
        * NOTE(review): _unmask and _FullLength are not listed.  _unmask is
        * assigned from C in png_combine_row(); presumably _FullLength is also
        * referenced from C elsewhere - confirm before adding or removing entries. */
  336. #ifdef PNG_THREAD_UNSAFE_OK
       /* these statics are only defined when PNG_THREAD_UNSAFE_OK is set */
  337.    _dif = _dif;
  338.    _patemp = _patemp;
  339.    _pbtemp = _pbtemp;
  340.    _pctemp = _pctemp;
  341.    _MMXLength = _MMXLength;
  342. #endif
       /* 64-bit constants used by the MMX code */
  343.    _const4  = _const4;
  344.    _const6  = _const6;
  345.    _mask8_0  = _mask8_0;
  346.    _mask16_1 = _mask16_1;
  347.    _mask16_0 = _mask16_0;
  348.    _mask24_2 = _mask24_2;
  349.    _mask24_1 = _mask24_1;
  350.    _mask24_0 = _mask24_0;
  351.    _mask32_3 = _mask32_3;
  352.    _mask32_2 = _mask32_2;
  353.    _mask32_1 = _mask32_1;
  354.    _mask32_0 = _mask32_0;
  355.    _mask48_5 = _mask48_5;
  356.    _mask48_4 = _mask48_4;
  357.    _mask48_3 = _mask48_3;
  358.    _mask48_2 = _mask48_2;
  359.    _mask48_1 = _mask48_1;
  360.    _mask48_0 = _mask48_0;
  361. }
  362. #endif /* PNG_MMX_CODE_SUPPORTED */
  363. static int _mmx_supported = 2;
       /* 2 is the initial "not yet determined" value: png_combine_row() checks for
        * it and, when found, calls png_mmx_support().  Elsewhere the variable is
        * tested as a plain truth value (if (_mmx_supported)).
        * NOTE(review): presumably png_mmx_support() overwrites it with the probe
        * result (changelog 20010103 mentions "auto-set of mmx_supported") -
        * confirm in that function. */
  364. /*===========================================================================*/
  365. /*                                                                           */
  366. /*                       P N G _ C O M B I N E _ R O W                       */
  367. /*                                                                           */
  368. /*===========================================================================*/
  369. #if defined(PNG_HAVE_MMX_COMBINE_ROW)
  370. #define BPP2  2
  371. #define BPP3  3 /* bytes per pixel (a.k.a. pixel_bytes) */
  372. #define BPP4  4
  373. #define BPP6  6 /* (defined only to help avoid cut-and-paste errors) */
  374. #define BPP8  8
       /* Each BPPn expands to n, a pixel size in bytes.
        * NOTE(review): the 1-, 2-, 4- and 8-bit cases of png_combine_row() do not
        * use these; presumably the wider pixel-depth cases do - confirm there. */
  375. /* Combines the row recently read in with the previous row.
  376.    This routine takes care of alpha and transparency if requested.
  377.    This routine also handles the two methods of progressive display
  378.    of interlaced images, depending on the mask value.
  379.    The mask value describes which pixels are to be combined with
  380.    the row.  The pattern always repeats every 8 pixels, so just 8
  381.    bits are needed.  A one indicates the pixel is to be combined; a
  382.    zero indicates the pixel is to be skipped.  This is in addition
  383.    to any alpha or transparency value associated with the pixel.
  384.    If you want all pixels to be combined, pass 0xff (255) in mask. */
  385. /* Use this routine for the x86 platform - it uses a faster MMX routine
  386.    if the machine supports MMX. */
  387. void /* PRIVATE */
  388. png_combine_row(png_structp png_ptr, png_bytep row, int mask)
  389. {
  390.    png_debug(1, "in png_combine_row (pnggccrd.c)n");
  391. #if defined(PNG_MMX_CODE_SUPPORTED)
  392.    if (_mmx_supported == 2) {
  393. #if !defined(PNG_1_0_X)
  394.        /* this should have happened in png_init_mmx_flags() already */
  395.        png_warning(png_ptr, "asm_flags may not have been initialized");
  396. #endif
  397.        png_mmx_support();
  398.    }
  399. #endif
  400.    if (mask == 0xff)
  401.    {
  402.       png_debug(2,"mask == 0xff:  doing single png_memcpy()n");
  403.       png_memcpy(row, png_ptr->row_buf + 1,
  404.        (png_size_t)PNG_ROWBYTES(png_ptr->row_info.pixel_depth,png_ptr->width));
  405.    }
  406.    else   /* (png_combine_row() is never called with mask == 0) */
  407.    {
  408.       switch (png_ptr->row_info.pixel_depth)
  409.       {
  410.          case 1:        /* png_ptr->row_info.pixel_depth */
  411.          {
  412.             png_bytep sp;
  413.             png_bytep dp;
  414.             int s_inc, s_start, s_end;
  415.             int m;
  416.             int shift;
  417.             png_uint_32 i;
  418.             sp = png_ptr->row_buf + 1;
  419.             dp = row;
  420.             m = 0x80;
  421. #if defined(PNG_READ_PACKSWAP_SUPPORTED)
  422.             if (png_ptr->transformations & PNG_PACKSWAP)
  423.             {
  424.                 s_start = 0;
  425.                 s_end = 7;
  426.                 s_inc = 1;
  427.             }
  428.             else
  429. #endif
  430.             {
  431.                 s_start = 7;
  432.                 s_end = 0;
  433.                 s_inc = -1;
  434.             }
  435.             shift = s_start;
  436.             for (i = 0; i < png_ptr->width; i++)
  437.             {
  438.                if (m & mask)
  439.                {
  440.                   int value;
  441.                   value = (*sp >> shift) & 0x1;
  442.                   *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
  443.                   *dp |= (png_byte)(value << shift);
  444.                }
  445.                if (shift == s_end)
  446.                {
  447.                   shift = s_start;
  448.                   sp++;
  449.                   dp++;
  450.                }
  451.                else
  452.                   shift += s_inc;
  453.                if (m == 1)
  454.                   m = 0x80;
  455.                else
  456.                   m >>= 1;
  457.             }
  458.             break;
  459.          }
  460.          case 2:        /* png_ptr->row_info.pixel_depth */
  461.          {
  462.             png_bytep sp;
  463.             png_bytep dp;
  464.             int s_start, s_end, s_inc;
  465.             int m;
  466.             int shift;
  467.             png_uint_32 i;
  468.             int value;
  469.             sp = png_ptr->row_buf + 1;
  470.             dp = row;
  471.             m = 0x80;
  472. #if defined(PNG_READ_PACKSWAP_SUPPORTED)
  473.             if (png_ptr->transformations & PNG_PACKSWAP)
  474.             {
  475.                s_start = 0;
  476.                s_end = 6;
  477.                s_inc = 2;
  478.             }
  479.             else
  480. #endif
  481.             {
  482.                s_start = 6;
  483.                s_end = 0;
  484.                s_inc = -2;
  485.             }
  486.             shift = s_start;
  487.             for (i = 0; i < png_ptr->width; i++)
  488.             {
  489.                if (m & mask)
  490.                {
  491.                   value = (*sp >> shift) & 0x3;
  492.                   *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
  493.                   *dp |= (png_byte)(value << shift);
  494.                }
  495.                if (shift == s_end)
  496.                {
  497.                   shift = s_start;
  498.                   sp++;
  499.                   dp++;
  500.                }
  501.                else
  502.                   shift += s_inc;
  503.                if (m == 1)
  504.                   m = 0x80;
  505.                else
  506.                   m >>= 1;
  507.             }
  508.             break;
  509.          }
  510.          case 4:        /* png_ptr->row_info.pixel_depth */
  511.          {
  512.             png_bytep sp;
  513.             png_bytep dp;
  514.             int s_start, s_end, s_inc;
  515.             int m;
  516.             int shift;
  517.             png_uint_32 i;
  518.             int value;
  519.             sp = png_ptr->row_buf + 1;
  520.             dp = row;
  521.             m = 0x80;
  522. #if defined(PNG_READ_PACKSWAP_SUPPORTED)
  523.             if (png_ptr->transformations & PNG_PACKSWAP)
  524.             {
  525.                s_start = 0;
  526.                s_end = 4;
  527.                s_inc = 4;
  528.             }
  529.             else
  530. #endif
  531.             {
  532.                s_start = 4;
  533.                s_end = 0;
  534.                s_inc = -4;
  535.             }
  536.             shift = s_start;
  537.             for (i = 0; i < png_ptr->width; i++)
  538.             {
  539.                if (m & mask)
  540.                {
  541.                   value = (*sp >> shift) & 0xf;
  542.                   *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
  543.                   *dp |= (png_byte)(value << shift);
  544.                }
  545.                if (shift == s_end)
  546.                {
  547.                   shift = s_start;
  548.                   sp++;
  549.                   dp++;
  550.                }
  551.                else
  552.                   shift += s_inc;
  553.                if (m == 1)
  554.                   m = 0x80;
  555.                else
  556.                   m >>= 1;
  557.             }
  558.             break;
  559.          }
  560.          case 8:        /* png_ptr->row_info.pixel_depth */
  561.          {
  562.             png_bytep srcptr;
  563.             png_bytep dstptr;
  564. #if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
  565. #if !defined(PNG_1_0_X)
  566.             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
  567.                 /* && _mmx_supported */ )
  568. #else
  569.             if (_mmx_supported)
  570. #endif
  571.             {
  572.                png_uint_32 len;
  573.                int diff;
  574.                int dummy_value_a;   // fix 'forbidden register spilled' error
  575.                int dummy_value_d;
  576.                int dummy_value_c;
  577.                int dummy_value_S;
  578.                int dummy_value_D;
  579.                _unmask = ~mask;            // global variable for -fPIC version
  580.                srcptr = png_ptr->row_buf + 1;
  581.                dstptr = row;
  582.                len  = png_ptr->width &~7;  // reduce to multiple of 8
  583.                diff = (int) (png_ptr->width & 7);  // amount lost
  584.                __asm__ __volatile__ (
  585.                   "movd      _unmask, %%mm7  nt" // load bit pattern
  586.                   "psubb     %%mm6, %%mm6    nt" // zero mm6
  587.                   "punpcklbw %%mm7, %%mm7    nt"
  588.                   "punpcklwd %%mm7, %%mm7    nt"
  589.                   "punpckldq %%mm7, %%mm7    nt" // fill reg with 8 masks
  590.                   "movq      _mask8_0, %%mm0 nt"
  591.                   "pand      %%mm7, %%mm0    nt" // nonzero if keep byte
  592.                   "pcmpeqb   %%mm6, %%mm0    nt" // zeros->1s, v versa
  593. // preload        "movl      len, %%ecx      nt" // load length of line
  594. // preload        "movl      srcptr, %%esi   nt" // load source
  595. // preload        "movl      dstptr, %%edi   nt" // load dest
  596.                   "cmpl      $0, %%ecx       nt" // len == 0 ?
  597.                   "je        mainloop8end    nt"
  598.                 "mainloop8:                  nt"
  599.                   "movq      (%%esi), %%mm4  nt" // *srcptr
  600.                   "pand      %%mm0, %%mm4    nt"
  601.                   "movq      %%mm0, %%mm6    nt"
  602.                   "pandn     (%%edi), %%mm6  nt" // *dstptr
  603.                   "por       %%mm6, %%mm4    nt"
  604.                   "movq      %%mm4, (%%edi)  nt"
  605.                   "addl      $8, %%esi       nt" // inc by 8 bytes processed
  606.                   "addl      $8, %%edi       nt"
  607.                   "subl      $8, %%ecx       nt" // dec by 8 pixels processed
  608.                   "ja        mainloop8       nt"
  609.                 "mainloop8end:               nt"
  610. // preload        "movl      diff, %%ecx     nt" // (diff is in eax)
  611.                   "movl      %%eax, %%ecx    nt"
  612.                   "cmpl      $0, %%ecx       nt"
  613.                   "jz        end8            nt"
  614. // preload        "movl      mask, %%edx     nt"
  615.                   "sall      $24, %%edx      nt" // make low byte, high byte
  616.                 "secondloop8:                nt"
  617.                   "sall      %%edx           nt" // move high bit to CF
  618.                   "jnc       skip8           nt" // if CF = 0
  619.                   "movb      (%%esi), %%al   nt"
  620.                   "movb      %%al, (%%edi)   nt"
  621.                 "skip8:                      nt"
  622.                   "incl      %%esi           nt"
  623.                   "incl      %%edi           nt"
  624.                   "decl      %%ecx           nt"
  625.                   "jnz       secondloop8     nt"
  626.                 "end8:                       nt"
  627.                   "EMMS                      nt"  // DONE
  628.                   : "=a" (dummy_value_a),           // output regs (dummy)
  629.                     "=d" (dummy_value_d),
  630.                     "=c" (dummy_value_c),
  631.                     "=S" (dummy_value_S),
  632.                     "=D" (dummy_value_D)
  633.                   : "3" (srcptr),      // esi       // input regs
  634.                     "4" (dstptr),      // edi
  635.                     "0" (diff),        // eax
  636. // was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
  637.                     "2" (len),         // ecx
  638.                     "1" (mask)         // edx
  639. #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
  640.                   : "%mm0", "%mm4", "%mm6", "%mm7"  // clobber list
  641. #endif
  642.                );
  643.             }
  644.             else /* mmx _not supported - Use modified C routine */
  645. #endif /* PNG_MMX_CODE_SUPPORTED */
  646.             {
  647.                register png_uint_32 i;
  648.                png_uint_32 initial_val = png_pass_start[png_ptr->pass];
  649.                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
  650.                register int stride = png_pass_inc[png_ptr->pass];
  651.                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
  652.                register int rep_bytes = png_pass_width[png_ptr->pass];
  653.                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
  654.                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
  655.                int diff = (int) (png_ptr->width & 7); /* amount lost */
  656.                register png_uint_32 final_val = len;  /* GRR bugfix */
  657.                srcptr = png_ptr->row_buf + 1 + initial_val;
  658.                dstptr = row + initial_val;
  659.                for (i = initial_val; i < final_val; i += stride)
  660.                {
  661.                   png_memcpy(dstptr, srcptr, rep_bytes);
  662.                   srcptr += stride;
  663.                   dstptr += stride;
  664.                }
  665.                if (diff)  /* number of leftover pixels:  3 for pngtest */
  666.                {
  667.                   final_val+=diff /* *BPP1 */ ;
  668.                   for (; i < final_val; i += stride)
  669.                   {
  670.                      if (rep_bytes > (int)(final_val-i))
  671.                         rep_bytes = (int)(final_val-i);
  672.                      png_memcpy(dstptr, srcptr, rep_bytes);
  673.                      srcptr += stride;
  674.                      dstptr += stride;
  675.                   }
  676.                }
  677.             } /* end of else (_mmx_supported) */
  678.             break;
  679.          }       /* end 8 bpp */
  680.          case 16:       /* png_ptr->row_info.pixel_depth */
  681.          {
  682.             png_bytep srcptr;
  683.             png_bytep dstptr;
  684. #if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
  685. #if !defined(PNG_1_0_X)
  686.             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
  687.                 /* && _mmx_supported */ )
  688. #else
  689.             if (_mmx_supported)
  690. #endif
  691.             {
  692.                png_uint_32 len;
  693.                int diff;
  694.                int dummy_value_a;   // fix 'forbidden register spilled' error
  695.                int dummy_value_d;
  696.                int dummy_value_c;
  697.                int dummy_value_S;
  698.                int dummy_value_D;
  699.                _unmask = ~mask;            // global variable for -fPIC version
  700.                srcptr = png_ptr->row_buf + 1;
  701.                dstptr = row;
  702.                len  = png_ptr->width &~7;  // reduce to multiple of 8
  703.                diff = (int) (png_ptr->width & 7); // amount lost //
  704.                __asm__ __volatile__ (
  705.                   "movd      _unmask, %%mm7   nt" // load bit pattern
  706.                   "psubb     %%mm6, %%mm6     nt" // zero mm6
  707.                   "punpcklbw %%mm7, %%mm7     nt"
  708.                   "punpcklwd %%mm7, %%mm7     nt"
  709.                   "punpckldq %%mm7, %%mm7     nt" // fill reg with 8 masks
  710.                   "movq      _mask16_0, %%mm0 nt"
  711.                   "movq      _mask16_1, %%mm1 nt"
  712.                   "pand      %%mm7, %%mm0     nt"
  713.                   "pand      %%mm7, %%mm1     nt"
  714.                   "pcmpeqb   %%mm6, %%mm0     nt"
  715.                   "pcmpeqb   %%mm6, %%mm1     nt"
  716. // preload        "movl      len, %%ecx       nt" // load length of line
  717. // preload        "movl      srcptr, %%esi    nt" // load source
  718. // preload        "movl      dstptr, %%edi    nt" // load dest
  719.                   "cmpl      $0, %%ecx        nt"
  720.                   "jz        mainloop16end    nt"
  721.                 "mainloop16:                  nt"
  722.                   "movq      (%%esi), %%mm4   nt"
  723.                   "pand      %%mm0, %%mm4     nt"
  724.                   "movq      %%mm0, %%mm6     nt"
  725.                   "movq      (%%edi), %%mm7   nt"
  726.                   "pandn     %%mm7, %%mm6     nt"
  727.                   "por       %%mm6, %%mm4     nt"
  728.                   "movq      %%mm4, (%%edi)   nt"
  729.                   "movq      8(%%esi), %%mm5  nt"
  730.                   "pand      %%mm1, %%mm5     nt"
  731.                   "movq      %%mm1, %%mm7     nt"
  732.                   "movq      8(%%edi), %%mm6  nt"
  733.                   "pandn     %%mm6, %%mm7     nt"
  734.                   "por       %%mm7, %%mm5     nt"
  735.                   "movq      %%mm5, 8(%%edi)  nt"
  736.                   "addl      $16, %%esi       nt" // inc by 16 bytes processed
  737.                   "addl      $16, %%edi       nt"
  738.                   "subl      $8, %%ecx        nt" // dec by 8 pixels processed
  739.                   "ja        mainloop16       nt"
  740.                 "mainloop16end:               nt"
  741. // preload        "movl      diff, %%ecx      nt" // (diff is in eax)
  742.                   "movl      %%eax, %%ecx     nt"
  743.                   "cmpl      $0, %%ecx        nt"
  744.                   "jz        end16            nt"
  745. // preload        "movl      mask, %%edx      nt"
  746.                   "sall      $24, %%edx       nt" // make low byte, high byte
  747.                 "secondloop16:                nt"
  748.                   "sall      %%edx            nt" // move high bit to CF
  749.                   "jnc       skip16           nt" // if CF = 0
  750.                   "movw      (%%esi), %%ax    nt"
  751.                   "movw      %%ax, (%%edi)    nt"
  752.                 "skip16:                      nt"
  753.                   "addl      $2, %%esi        nt"
  754.                   "addl      $2, %%edi        nt"
  755.                   "decl      %%ecx            nt"
  756.                   "jnz       secondloop16     nt"
  757.                 "end16:                       nt"
  758.                   "EMMS                       nt" // DONE
  759.                   : "=a" (dummy_value_a),           // output regs (dummy)
  760.                     "=c" (dummy_value_c),
  761.                     "=d" (dummy_value_d),
  762.                     "=S" (dummy_value_S),
  763.                     "=D" (dummy_value_D)
  764.                   : "0" (diff),        // eax       // input regs
  765. // was (unmask)     " "    RESERVED    // ebx       // Global Offset Table idx
  766.                     "1" (len),         // ecx
  767.                     "2" (mask),        // edx
  768.                     "3" (srcptr),      // esi
  769.                     "4" (dstptr)       // edi
  770. #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
  771.                   : "%mm0", "%mm1", "%mm4"          // clobber list
  772.                   , "%mm5", "%mm6", "%mm7"
  773. #endif
  774.                );
  775.             }
  776.             else /* mmx _not supported - Use modified C routine */
  777. #endif /* PNG_MMX_CODE_SUPPORTED */
  778.             {
  779.                register png_uint_32 i;
  780.                png_uint_32 initial_val = BPP2 * png_pass_start[png_ptr->pass];
  781.                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
  782.                register int stride = BPP2 * png_pass_inc[png_ptr->pass];
  783.                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
  784.                register int rep_bytes = BPP2 * png_pass_width[png_ptr->pass];
  785.                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
  786.                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
  787.                int diff = (int) (png_ptr->width & 7); /* amount lost */
  788.                register png_uint_32 final_val = BPP2 * len;   /* GRR bugfix */
  789.                srcptr = png_ptr->row_buf + 1 + initial_val;
  790.                dstptr = row + initial_val;
  791.                for (i = initial_val; i < final_val; i += stride)
  792.                {
  793.                   png_memcpy(dstptr, srcptr, rep_bytes);
  794.                   srcptr += stride;
  795.                   dstptr += stride;
  796.                }
  797.                if (diff)  /* number of leftover pixels:  3 for pngtest */
  798.                {
  799.                   final_val+=diff*BPP2;
  800.                   for (; i < final_val; i += stride)
  801.                   {
  802.                      if (rep_bytes > (int)(final_val-i))
  803.                         rep_bytes = (int)(final_val-i);
  804.                      png_memcpy(dstptr, srcptr, rep_bytes);
  805.                      srcptr += stride;
  806.                      dstptr += stride;
  807.                   }
  808.                }
  809.             } /* end of else (_mmx_supported) */
  810.             break;
  811.          }       /* end 16 bpp */
  812.          case 24:       /* png_ptr->row_info.pixel_depth */
  813.          {
  814.             png_bytep srcptr;
  815.             png_bytep dstptr;
  816. #if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
  817. #if !defined(PNG_1_0_X)
  818.             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
  819.                 /* && _mmx_supported */ )
  820. #else
  821.             if (_mmx_supported)
  822. #endif
  823.             {
  824.                png_uint_32 len;
  825.                int diff;
  826.                int dummy_value_a;   // fix 'forbidden register spilled' error
  827.                int dummy_value_d;
  828.                int dummy_value_c;
  829.                int dummy_value_S;
  830.                int dummy_value_D;
  831.                _unmask = ~mask;            // global variable for -fPIC version
  832.                srcptr = png_ptr->row_buf + 1;
  833.                dstptr = row;
  834.                len  = png_ptr->width &~7;  // reduce to multiple of 8
  835.                diff = (int) (png_ptr->width & 7); // amount lost //
  836.                __asm__ __volatile__ (
  837.                   "movd      _unmask, %%mm7   nt" // load bit pattern
  838.                   "psubb     %%mm6, %%mm6     nt" // zero mm6
  839.                   "punpcklbw %%mm7, %%mm7     nt"
  840.                   "punpcklwd %%mm7, %%mm7     nt"
  841.                   "punpckldq %%mm7, %%mm7     nt" // fill reg with 8 masks
  842.                   "movq      _mask24_0, %%mm0 nt"
  843.                   "movq      _mask24_1, %%mm1 nt"
  844.                   "movq      _mask24_2, %%mm2 nt"
  845.                   "pand      %%mm7, %%mm0     nt"
  846.                   "pand      %%mm7, %%mm1     nt"
  847.                   "pand      %%mm7, %%mm2     nt"
  848.                   "pcmpeqb   %%mm6, %%mm0     nt"
  849.                   "pcmpeqb   %%mm6, %%mm1     nt"
  850.                   "pcmpeqb   %%mm6, %%mm2     nt"
  851. // preload        "movl      len, %%ecx       nt" // load length of line
  852. // preload        "movl      srcptr, %%esi    nt" // load source
  853. // preload        "movl      dstptr, %%edi    nt" // load dest
  854.                   "cmpl      $0, %%ecx        nt"
  855.                   "jz        mainloop24end    nt"
  856.                 "mainloop24:                  nt"
  857.                   "movq      (%%esi), %%mm4   nt"
  858.                   "pand      %%mm0, %%mm4     nt"
  859.                   "movq      %%mm0, %%mm6     nt"
  860.                   "movq      (%%edi), %%mm7   nt"
  861.                   "pandn     %%mm7, %%mm6     nt"
  862.                   "por       %%mm6, %%mm4     nt"
  863.                   "movq      %%mm4, (%%edi)   nt"
  864.                   "movq      8(%%esi), %%mm5  nt"
  865.                   "pand      %%mm1, %%mm5     nt"
  866.                   "movq      %%mm1, %%mm7     nt"
  867.                   "movq      8(%%edi), %%mm6  nt"
  868.                   "pandn     %%mm6, %%mm7     nt"
  869.                   "por       %%mm7, %%mm5     nt"
  870.                   "movq      %%mm5, 8(%%edi)  nt"
  871.                   "movq      16(%%esi), %%mm6 nt"
  872.                   "pand      %%mm2, %%mm6     nt"
  873.                   "movq      %%mm2, %%mm4     nt"
  874.                   "movq      16(%%edi), %%mm7 nt"
  875.                   "pandn     %%mm7, %%mm4     nt"
  876.                   "por       %%mm4, %%mm6     nt"
  877.                   "movq      %%mm6, 16(%%edi) nt"
  878.                   "addl      $24, %%esi       nt" // inc by 24 bytes processed
  879.                   "addl      $24, %%edi       nt"
  880.                   "subl      $8, %%ecx        nt" // dec by 8 pixels processed
  881.                   "ja        mainloop24       nt"
  882.                 "mainloop24end:               nt"
  883. // preload        "movl      diff, %%ecx      nt" // (diff is in eax)
  884.                   "movl      %%eax, %%ecx     nt"
  885.                   "cmpl      $0, %%ecx        nt"
  886.                   "jz        end24            nt"
  887. // preload        "movl      mask, %%edx      nt"
  888.                   "sall      $24, %%edx       nt" // make low byte, high byte
  889.                 "secondloop24:                nt"
  890.                   "sall      %%edx            nt" // move high bit to CF
  891.                   "jnc       skip24           nt" // if CF = 0
  892.                   "movw      (%%esi), %%ax    nt"
  893.                   "movw      %%ax, (%%edi)    nt"
  894.                   "xorl      %%eax, %%eax     nt"
  895.                   "movb      2(%%esi), %%al   nt"
  896.                   "movb      %%al, 2(%%edi)   nt"
  897.                 "skip24:                      nt"
  898.                   "addl      $3, %%esi        nt"
  899.                   "addl      $3, %%edi        nt"
  900.                   "decl      %%ecx            nt"
  901.                   "jnz       secondloop24     nt"
  902.                 "end24:                       nt"
  903.                   "EMMS                       nt" // DONE
  904.                   : "=a" (dummy_value_a),           // output regs (dummy)
  905.                     "=d" (dummy_value_d),
  906.                     "=c" (dummy_value_c),
  907.                     "=S" (dummy_value_S),
  908.                     "=D" (dummy_value_D)
  909.                   : "3" (srcptr),      // esi       // input regs
  910.                     "4" (dstptr),      // edi
  911.                     "0" (diff),        // eax
  912. // was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
  913.                     "2" (len),         // ecx
  914.                     "1" (mask)         // edx
  915. #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
  916.                   : "%mm0", "%mm1", "%mm2"          // clobber list
  917.                   , "%mm4", "%mm5", "%mm6", "%mm7"
  918. #endif
  919.                );
  920.             }
  921.             else /* mmx _not supported - Use modified C routine */
  922. #endif /* PNG_MMX_CODE_SUPPORTED */
  923.             {
  924.                register png_uint_32 i;
  925.                png_uint_32 initial_val = BPP3 * png_pass_start[png_ptr->pass];
  926.                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
  927.                register int stride = BPP3 * png_pass_inc[png_ptr->pass];
  928.                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
  929.                register int rep_bytes = BPP3 * png_pass_width[png_ptr->pass];
  930.                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
  931.                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
  932.                int diff = (int) (png_ptr->width & 7); /* amount lost */
  933.                register png_uint_32 final_val = BPP3 * len;   /* GRR bugfix */
  934.                srcptr = png_ptr->row_buf + 1 + initial_val;
  935.                dstptr = row + initial_val;
  936.                for (i = initial_val; i < final_val; i += stride)
  937.                {
  938.                   png_memcpy(dstptr, srcptr, rep_bytes);
  939.                   srcptr += stride;
  940.                   dstptr += stride;
  941.                }
  942.                if (diff)  /* number of leftover pixels:  3 for pngtest */
  943.                {
  944.                   final_val+=diff*BPP3;
  945.                   for (; i < final_val; i += stride)
  946.                   {
  947.                      if (rep_bytes > (int)(final_val-i))
  948.                         rep_bytes = (int)(final_val-i);
  949.                      png_memcpy(dstptr, srcptr, rep_bytes);
  950.                      srcptr += stride;
  951.                      dstptr += stride;
  952.                   }
  953.                }
  954.             } /* end of else (_mmx_supported) */
  955.             break;
  956.          }       /* end 24 bpp */
  957.          case 32:       /* png_ptr->row_info.pixel_depth */
  958.          {
  959.             png_bytep srcptr;
  960.             png_bytep dstptr;
  961. #if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
  962. #if !defined(PNG_1_0_X)
  963.             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
  964.                 /* && _mmx_supported */ )
  965. #else
  966.             if (_mmx_supported)
  967. #endif
  968.             {
  969.                png_uint_32 len;
  970.                int diff;
  971.                int dummy_value_a;   // fix 'forbidden register spilled' error
  972.                int dummy_value_d;
  973.                int dummy_value_c;
  974.                int dummy_value_S;
  975.                int dummy_value_D;
  976.                _unmask = ~mask;            // global variable for -fPIC version
  977.                srcptr = png_ptr->row_buf + 1;
  978.                dstptr = row;
  979.                len  = png_ptr->width &~7;  // reduce to multiple of 8
  980.                diff = (int) (png_ptr->width & 7); // amount lost //
  981.                __asm__ __volatile__ (
  982.                   "movd      _unmask, %%mm7   nt" // load bit pattern
  983.                   "psubb     %%mm6, %%mm6     nt" // zero mm6
  984.                   "punpcklbw %%mm7, %%mm7     nt"
  985.                   "punpcklwd %%mm7, %%mm7     nt"
  986.                   "punpckldq %%mm7, %%mm7     nt" // fill reg with 8 masks
  987.                   "movq      _mask32_0, %%mm0 nt"
  988.                   "movq      _mask32_1, %%mm1 nt"
  989.                   "movq      _mask32_2, %%mm2 nt"
  990.                   "movq      _mask32_3, %%mm3 nt"
  991.                   "pand      %%mm7, %%mm0     nt"
  992.                   "pand      %%mm7, %%mm1     nt"
  993.                   "pand      %%mm7, %%mm2     nt"
  994.                   "pand      %%mm7, %%mm3     nt"
  995.                   "pcmpeqb   %%mm6, %%mm0     nt"
  996.                   "pcmpeqb   %%mm6, %%mm1     nt"
  997.                   "pcmpeqb   %%mm6, %%mm2     nt"
  998.                   "pcmpeqb   %%mm6, %%mm3     nt"
  999. // preload        "movl      len, %%ecx       nt" // load length of line
  1000. // preload        "movl      srcptr, %%esi    nt" // load source
  1001. // preload        "movl      dstptr, %%edi    nt" // load dest
  1002.                   "cmpl      $0, %%ecx        nt" // lcr
  1003.                   "jz        mainloop32end    nt"
  1004.                 "mainloop32:                  nt"
  1005.                   "movq      (%%esi), %%mm4   nt"
  1006.                   "pand      %%mm0, %%mm4     nt"
  1007.                   "movq      %%mm0, %%mm6     nt"
  1008.                   "movq      (%%edi), %%mm7   nt"
  1009.                   "pandn     %%mm7, %%mm6     nt"
  1010.                   "por       %%mm6, %%mm4     nt"
  1011.                   "movq      %%mm4, (%%edi)   nt"
  1012.                   "movq      8(%%esi), %%mm5  nt"
  1013.                   "pand      %%mm1, %%mm5     nt"
  1014.                   "movq      %%mm1, %%mm7     nt"
  1015.                   "movq      8(%%edi), %%mm6  nt"
  1016.                   "pandn     %%mm6, %%mm7     nt"
  1017.                   "por       %%mm7, %%mm5     nt"
  1018.                   "movq      %%mm5, 8(%%edi)  nt"
  1019.                   "movq      16(%%esi), %%mm6 nt"
  1020.                   "pand      %%mm2, %%mm6     nt"
  1021.                   "movq      %%mm2, %%mm4     nt"
  1022.                   "movq      16(%%edi), %%mm7 nt"
  1023.                   "pandn     %%mm7, %%mm4     nt"
  1024.                   "por       %%mm4, %%mm6     nt"
  1025.                   "movq      %%mm6, 16(%%edi) nt"
  1026.                   "movq      24(%%esi), %%mm7 nt"
  1027.                   "pand      %%mm3, %%mm7     nt"
  1028.                   "movq      %%mm3, %%mm5     nt"
  1029.                   "movq      24(%%edi), %%mm4 nt"
  1030.                   "pandn     %%mm4, %%mm5     nt"
  1031.                   "por       %%mm5, %%mm7     nt"
  1032.                   "movq      %%mm7, 24(%%edi) nt"
  1033.                   "addl      $32, %%esi       nt" // inc by 32 bytes processed
  1034.                   "addl      $32, %%edi       nt"
  1035.                   "subl      $8, %%ecx        nt" // dec by 8 pixels processed
  1036.                   "ja        mainloop32       nt"
  1037.                 "mainloop32end:               nt"
  1038. // preload        "movl      diff, %%ecx      nt" // (diff is in eax)
  1039.                   "movl      %%eax, %%ecx     nt"
  1040.                   "cmpl      $0, %%ecx        nt"
  1041.                   "jz        end32            nt"
  1042. // preload        "movl      mask, %%edx      nt"
  1043.                   "sall      $24, %%edx       nt" // low byte => high byte
  1044.                 "secondloop32:                nt"
  1045.                   "sall      %%edx            nt" // move high bit to CF
  1046.                   "jnc       skip32           nt" // if CF = 0
  1047.                   "movl      (%%esi), %%eax   nt"
  1048.                   "movl      %%eax, (%%edi)   nt"
  1049.                 "skip32:                      nt"
  1050.                   "addl      $4, %%esi        nt"
  1051.                   "addl      $4, %%edi        nt"
  1052.                   "decl      %%ecx            nt"
  1053.                   "jnz       secondloop32     nt"
  1054.                 "end32:                       nt"
  1055.                   "EMMS                       nt" // DONE
  1056.                   : "=a" (dummy_value_a),           // output regs (dummy)
  1057.                     "=d" (dummy_value_d),
  1058.                     "=c" (dummy_value_c),
  1059.                     "=S" (dummy_value_S),
  1060.                     "=D" (dummy_value_D)
  1061.                   : "3" (srcptr),      // esi       // input regs
  1062.                     "4" (dstptr),      // edi
  1063.                     "0" (diff),        // eax
  1064. // was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
  1065.                     "2" (len),         // ecx
  1066.                     "1" (mask)         // edx
  1067. #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
  1068.                   : "%mm0", "%mm1", "%mm2", "%mm3"  // clobber list
  1069.                   , "%mm4", "%mm5", "%mm6", "%mm7"
  1070. #endif
  1071.                );
  1072.             }
  1073.             else /* mmx _not supported - Use modified C routine */
  1074. #endif /* PNG_MMX_CODE_SUPPORTED */
  1075.             {
  1076.                register png_uint_32 i;
  1077.                png_uint_32 initial_val = BPP4 * png_pass_start[png_ptr->pass];
  1078.                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
  1079.                register int stride = BPP4 * png_pass_inc[png_ptr->pass];
  1080.                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
  1081.                register int rep_bytes = BPP4 * png_pass_width[png_ptr->pass];
  1082.                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
  1083.                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
  1084.                int diff = (int) (png_ptr->width & 7); /* amount lost */
  1085.                register png_uint_32 final_val = BPP4 * len;   /* GRR bugfix */
  1086.                srcptr = png_ptr->row_buf + 1 + initial_val;
  1087.                dstptr = row + initial_val;
  1088.                for (i = initial_val; i < final_val; i += stride)
  1089.                {
  1090.                   png_memcpy(dstptr, srcptr, rep_bytes);
  1091.                   srcptr += stride;
  1092.                   dstptr += stride;
  1093.                }
  1094.                if (diff)  /* number of leftover pixels:  3 for pngtest */
  1095.                {
  1096.                   final_val+=diff*BPP4;
  1097.                   for (; i < final_val; i += stride)
  1098.                   {
  1099.                      if (rep_bytes > (int)(final_val-i))
  1100.                         rep_bytes = (int)(final_val-i);
  1101.                      png_memcpy(dstptr, srcptr, rep_bytes);
  1102.                      srcptr += stride;
  1103.                      dstptr += stride;
  1104.                   }
  1105.                }
  1106.             } /* end of else (_mmx_supported) */
  1107.             break;
  1108.          }       /* end 32 bpp */
  1109.          case 48:       /* png_ptr->row_info.pixel_depth */
  1110.          {
  1111.             png_bytep srcptr;
  1112.             png_bytep dstptr;
  1113. #if defined(PNG_MMX_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
  1114. #if !defined(PNG_1_0_X)
  1115.             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
  1116.                 /* && _mmx_supported */ )
  1117. #else
  1118.             if (_mmx_supported)
  1119. #endif
  1120.             {
  1121.                png_uint_32 len;
  1122.                int diff;
  1123.                int dummy_value_a;   // fix 'forbidden register spilled' error
  1124.                int dummy_value_d;
  1125.                int dummy_value_c;
  1126.                int dummy_value_S;
  1127.                int dummy_value_D;
  1128.                _unmask = ~mask;            // global variable for -fPIC version
  1129.                srcptr = png_ptr->row_buf + 1;
  1130.                dstptr = row;
  1131.                len  = png_ptr->width &~7;  // reduce to multiple of 8
  1132.                diff = (int) (png_ptr->width & 7); // amount lost //
  1133.                __asm__ __volatile__ (
  1134.                   "movd      _unmask, %%mm7   nt" // load bit pattern
  1135.                   "psubb     %%mm6, %%mm6     nt" // zero mm6
  1136.                   "punpcklbw %%mm7, %%mm7     nt"
  1137.                   "punpcklwd %%mm7, %%mm7     nt"
  1138.                   "punpckldq %%mm7, %%mm7     nt" // fill reg with 8 masks
  1139.                   "movq      _mask48_0, %%mm0 nt"
  1140.                   "movq      _mask48_1, %%mm1 nt"
  1141.                   "movq      _mask48_2, %%mm2 nt"
  1142.                   "movq      _mask48_3, %%mm3 nt"
  1143.                   "movq      _mask48_4, %%mm4 nt"
  1144.                   "movq      _mask48_5, %%mm5 nt"
  1145.                   "pand      %%mm7, %%mm0     nt"
  1146.                   "pand      %%mm7, %%mm1     nt"
  1147.                   "pand      %%mm7, %%mm2     nt"
  1148.                   "pand      %%mm7, %%mm3     nt"
  1149.                   "pand      %%mm7, %%mm4     nt"
  1150.                   "pand      %%mm7, %%mm5     nt"
  1151.                   "pcmpeqb   %%mm6, %%mm0     nt"
  1152.                   "pcmpeqb   %%mm6, %%mm1     nt"
  1153.                   "pcmpeqb   %%mm6, %%mm2     nt"
  1154.                   "pcmpeqb   %%mm6, %%mm3     nt"
  1155.                   "pcmpeqb   %%mm6, %%mm4     nt"
  1156.                   "pcmpeqb   %%mm6, %%mm5     nt"
  1157. // preload        "movl      len, %%ecx       nt" // load length of line
  1158. // preload        "movl      srcptr, %%esi    nt" // load source
  1159. // preload        "movl      dstptr, %%edi    nt" // load dest
  1160.                   "cmpl      $0, %%ecx        nt"
  1161.                   "jz        mainloop48end    nt"
  1162.                 "mainloop48:                  nt"
  1163.                   "movq      (%%esi), %%mm7   nt"
  1164.                   "pand      %%mm0, %%mm7     nt"
  1165.                   "movq      %%mm0, %%mm6     nt"
  1166.                   "pandn     (%%edi), %%mm6   nt"
  1167.                   "por       %%mm6, %%mm7     nt"
  1168.                   "movq      %%mm7, (%%edi)   nt"
  1169.                   "movq      8(%%esi), %%mm6  nt"
  1170.                   "pand      %%mm1, %%mm6     nt"
  1171.                   "movq      %%mm1, %%mm7     nt"
  1172.                   "pandn     8(%%edi), %%mm7  nt"
  1173.                   "por       %%mm7, %%mm6     nt"
  1174.                   "movq      %%mm6, 8(%%edi)  nt"
  1175.                   "movq      16(%%esi), %%mm6 nt"
  1176.                   "pand      %%mm2, %%mm6     nt"
  1177.                   "movq      %%mm2, %%mm7     nt"
  1178.                   "pandn     16(%%edi), %%mm7 nt"
  1179.                   "por       %%mm7, %%mm6     nt"
  1180.                   "movq      %%mm6, 16(%%edi) nt"
  1181.                   "movq      24(%%esi), %%mm7 nt"
  1182.                   "pand      %%mm3, %%mm7     nt"
  1183.                   "movq      %%mm3, %%mm6     nt"
  1184.                   "pandn     24(%%edi), %%mm6 nt"
  1185.                   "por       %%mm6, %%mm7     nt"
  1186.                   "movq      %%mm7, 24(%%edi) nt"
  1187.                   "movq      32(%%esi), %%mm6 nt"
  1188.                   "pand      %%mm4, %%mm6     nt"
  1189.                   "movq      %%mm4, %%mm7     nt"
  1190.                   "pandn     32(%%edi), %%mm7 nt"
  1191.                   "por       %%mm7, %%mm6     nt"
  1192.                   "movq      %%mm6, 32(%%edi) nt"
  1193.                   "movq      40(%%esi), %%mm7 nt"
  1194.                   "pand      %%mm5, %%mm7     nt"
  1195.                   "movq      %%mm5, %%mm6     nt"
  1196.                   "pandn     40(%%edi), %%mm6 nt"
  1197.                   "por       %%mm6, %%mm7     nt"
  1198.                   "movq      %%mm7, 40(%%edi) nt"
  1199.                   "addl      $48, %%esi       nt" // inc by 48 bytes processed
  1200.                   "addl      $48, %%edi       nt"
  1201.                   "subl      $8, %%ecx        nt" // dec by 8 pixels processed
  1202.                   "ja        mainloop48       nt"
  1203.                 "mainloop48end:               nt"
  1204. // preload        "movl      diff, %%ecx      nt" // (diff is in eax)
  1205.                   "movl      %%eax, %%ecx     nt"
  1206.                   "cmpl      $0, %%ecx        nt"
  1207.                   "jz        end48            nt"
  1208. // preload        "movl      mask, %%edx      nt"
  1209.                   "sall      $24, %%edx       nt" // make low byte, high byte
  1210.                 "secondloop48:                nt"
  1211.                   "sall      %%edx            nt" // move high bit to CF
  1212.                   "jnc       skip48           nt" // if CF = 0
  1213.                   "movl      (%%esi), %%eax   nt"
  1214.                   "movl      %%eax, (%%edi)   nt"
  1215.                 "skip48:                      nt"
  1216.                   "addl      $4, %%esi        nt"
  1217.                   "addl      $4, %%edi        nt"
  1218.                   "decl      %%ecx            nt"
  1219.                   "jnz       secondloop48     nt"
  1220.                 "end48:                       nt"
  1221.                   "EMMS                       nt" // DONE
  1222.                   : "=a" (dummy_value_a),           // output regs (dummy)
  1223.                     "=d" (dummy_value_d),
  1224.                     "=c" (dummy_value_c),
  1225.                     "=S" (dummy_value_S),
  1226.                     "=D" (dummy_value_D)
  1227.                   : "3" (srcptr),      // esi       // input regs
  1228.                     "4" (dstptr),      // edi
  1229.                     "0" (diff),        // eax
  1230. // was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
  1231.                     "2" (len),         // ecx
  1232.                     "1" (mask)         // edx
  1233. #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
  1234.                   : "%mm0", "%mm1", "%mm2", "%mm3"  // clobber list
  1235.                   , "%mm4", "%mm5", "%mm6", "%mm7"
  1236. #endif
  1237.                );
  1238.             }
  1239.             else /* mmx _not supported - Use modified C routine */
  1240. #endif /* PNG_MMX_CODE_SUPPORTED */
  1241.             {
  1242.                register png_uint_32 i;
  1243.                png_uint_32 initial_val = BPP6 * png_pass_start[png_ptr->pass];
  1244.                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
  1245.                register int stride = BPP6 * png_pass_inc[png_ptr->pass];
  1246.                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
  1247.                register int rep_bytes = BPP6 * png_pass_width[png_ptr->pass];
  1248.                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
  1249.                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
  1250.                int diff = (int) (png_ptr->width & 7); /* amount lost */
  1251.                register png_uint_32 final_val = BPP6 * len;   /* GRR bugfix */
  1252.                srcptr = png_ptr->row_buf + 1 + initial_val;
  1253.                dstptr = row + initial_val;
  1254.                for (i = initial_val; i < final_val; i += stride)
  1255.                {
  1256.                   png_memcpy(dstptr, srcptr, rep_bytes);
  1257.                   srcptr += stride;
  1258.                   dstptr += stride;
  1259.                }
  1260.                if (diff)  /* number of leftover pixels:  3 for pngtest */
  1261.                {
  1262.                   final_val+=diff*BPP6;
  1263.                   for (; i < final_val; i += stride)
  1264.                   {
  1265.                      if (rep_bytes > (int)(final_val-i))
  1266.                         rep_bytes = (int)(final_val-i);
  1267.                      png_memcpy(dstptr, srcptr, rep_bytes);
  1268.                      srcptr += stride;
  1269.                      dstptr += stride;
  1270.                   }
  1271.                }
  1272.             } /* end of else (_mmx_supported) */
  1273.             break;
  1274.          }       /* end 48 bpp */
  1275.          case 64:       /* png_ptr->row_info.pixel_depth */
  1276.          {
  1277.             png_bytep srcptr;
  1278.             png_bytep dstptr;
  1279.             register png_uint_32 i;
  1280.             png_uint_32 initial_val = BPP8 * png_pass_start[png_ptr->pass];
  1281.               /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
  1282.             register int stride = BPP8 * png_pass_inc[png_ptr->pass];
  1283.               /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
  1284.             register int rep_bytes = BPP8 * png_pass_width[png_ptr->pass];
  1285.               /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
  1286.             png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
  1287.             int diff = (int) (png_ptr->width & 7); /* amount lost */
  1288.             register png_uint_32 final_val = BPP8 * len;   /* GRR bugfix */
  1289.             srcptr = png_ptr->row_buf + 1 + initial_val;
  1290.             dstptr = row + initial_val;
  1291.             for (i = initial_val; i < final_val; i += stride)
  1292.             {
  1293.                png_memcpy(dstptr, srcptr, rep_bytes);
  1294.                srcptr += stride;
  1295.                dstptr += stride;
  1296.             }
  1297.             if (diff)  /* number of leftover pixels:  3 for pngtest */
  1298.             {
  1299.                final_val+=diff*BPP8;
  1300.                for (; i < final_val; i += stride)
  1301.                {
  1302.                   if (rep_bytes > (int)(final_val-i))
  1303.                      rep_bytes = (int)(final_val-i);
  1304.                   png_memcpy(dstptr, srcptr, rep_bytes);
  1305.                   srcptr += stride;
  1306.                   dstptr += stride;
  1307.                }
  1308.             }
  1309.             break;
  1310.          }       /* end 64 bpp */
  1311.          default: /* png_ptr->row_info.pixel_depth != 1,2,4,8,16,24,32,48,64 */
  1312.          {
  1313.             /* this should never happen */
  1314.             png_warning(png_ptr, "Invalid row_info.pixel_depth in pnggccrd");
  1315.             break;
  1316.          }
  1317.       } /* end switch (png_ptr->row_info.pixel_depth) */
  1318.    } /* end if (non-trivial mask) */
  1319. } /* end png_combine_row() */
  1320. #endif /* PNG_HAVE_MMX_COMBINE_ROW */
  1321. /*===========================================================================*/
  1322. /*                                                                           */
  1323. /*                 P N G _ D O _ R E A D _ I N T E R L A C E                 */
  1324. /*                                                                           */
  1325. /*===========================================================================*/
  1326. #if defined(PNG_READ_INTERLACING_SUPPORTED)
  1327. #if defined(PNG_HAVE_MMX_READ_INTERLACE)
  1328. /* png_do_read_interlace() is called after any 16-bit to 8-bit conversion
  1329.  * has taken place.  [GRR: what other steps come before and/or after?]
  1330.  */
  1331. void /* PRIVATE */
  1332. png_do_read_interlace(png_structp png_ptr)
  1333. {
  1334.    png_row_infop row_info = &(png_ptr->row_info);
  1335.    png_bytep row = png_ptr->row_buf + 1;
  1336.    int pass = png_ptr->pass;
  1337. #if defined(PNG_READ_PACKSWAP_SUPPORTED)
  1338.    png_uint_32 transformations = png_ptr->transformations;
  1339. #endif
  1340.    png_debug(1, "in png_do_read_interlace (pnggccrd.c)n");
  1341. #if defined(PNG_MMX_CODE_SUPPORTED)
  1342.    if (_mmx_supported == 2) {
  1343. #if !defined(PNG_1_0_X)
  1344.        /* this should have happened in png_init_mmx_flags() already */
  1345.        png_warning(png_ptr, "asm_flags may not have been initialized");
  1346. #endif
  1347.        png_mmx_support();
  1348.    }
  1349. #endif
  1350.    if (row != NULL && row_info != NULL)
  1351.    {
  1352.       png_uint_32 final_width;
  1353.       final_width = row_info->width * png_pass_inc[pass];
  1354.       switch (row_info->pixel_depth)
  1355.       {
  1356.          case 1:
  1357.          {
  1358.             png_bytep sp, dp;
  1359.             int sshift, dshift;
  1360.             int s_start, s_end, s_inc;
  1361.             png_byte v;
  1362.             png_uint_32 i;
  1363.             int j;
  1364.             sp = row + (png_size_t)((row_info->width - 1) >> 3);
  1365.             dp = row + (png_size_t)((final_width - 1) >> 3);
  1366. #if defined(PNG_READ_PACKSWAP_SUPPORTED)
  1367.             if (transformations & PNG_PACKSWAP)
  1368.             {
  1369.                sshift = (int)((row_info->width + 7) & 7);
  1370.                dshift = (int)((final_width + 7) & 7);
  1371.                s_start = 7;
  1372.                s_end = 0;
  1373.                s_inc = -1;
  1374.             }
  1375.             else
  1376. #endif
  1377.             {
  1378.                sshift = 7 - (int)((row_info->width + 7) & 7);
  1379.                dshift = 7 - (int)((final_width + 7) & 7);
  1380.                s_start = 0;
  1381.                s_end = 7;
  1382.                s_inc = 1;
  1383.             }
  1384.             for (i = row_info->width; i; i--)
  1385.             {
  1386.                v = (png_byte)((*sp >> sshift) & 0x1);
  1387.                for (j = 0; j < png_pass_inc[pass]; j++)
  1388.                {
  1389.                   *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
  1390.                   *dp |= (png_byte)(v << dshift);
  1391.                   if (dshift == s_end)
  1392.                   {
  1393.                      dshift = s_start;
  1394.                      dp--;
  1395.                   }
  1396.                   else
  1397.                      dshift += s_inc;
  1398.                }
  1399.                if (sshift == s_end)
  1400.                {
  1401.                   sshift = s_start;
  1402.                   sp--;
  1403.                }
  1404.                else
  1405.                   sshift += s_inc;
  1406.             }
  1407.             break;
  1408.          }
  1409.          case 2:
  1410.          {
  1411.             png_bytep sp, dp;
  1412.             int sshift, dshift;
  1413.             int s_start, s_end, s_inc;
  1414.             png_uint_32 i;
  1415.             sp = row + (png_size_t)((row_info->width - 1) >> 2);
  1416.             dp = row + (png_size_t)((final_width - 1) >> 2);
  1417. #if defined(PNG_READ_PACKSWAP_SUPPORTED)
  1418.             if (transformations & PNG_PACKSWAP)
  1419.             {
  1420.                sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
  1421.                dshift = (png_size_t)(((final_width + 3) & 3) << 1);
  1422.                s_start = 6;
  1423.                s_end = 0;
  1424.                s_inc = -2;
  1425.             }
  1426.             else
  1427. #endif
  1428.             {
  1429.                sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
  1430.                dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
  1431.                s_start = 0;
  1432.                s_end = 6;
  1433.                s_inc = 2;
  1434.             }
  1435.             for (i = row_info->width; i; i--)
  1436.             {
  1437.                png_byte v;
  1438.                int j;
  1439.                v = (png_byte)((*sp >> sshift) & 0x3);
  1440.                for (j = 0; j < png_pass_inc[pass]; j++)
  1441.                {
  1442.                   *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
  1443.                   *dp |= (png_byte)(v << dshift);
  1444.                   if (dshift == s_end)
  1445.                   {
  1446.                      dshift = s_start;
  1447.                      dp--;
  1448.                   }
  1449.                   else
  1450.                      dshift += s_inc;
  1451.                }
  1452.                if (sshift == s_end)
  1453.                {
  1454.                   sshift = s_start;
  1455.                   sp--;
  1456.                }
  1457.                else
  1458.                   sshift += s_inc;
  1459.             }
  1460.             break;
  1461.          }
  1462.          case 4:
  1463.          {
  1464.             png_bytep sp, dp;
  1465.             int sshift, dshift;
  1466.             int s_start, s_end, s_inc;
  1467.             png_uint_32 i;
  1468.             sp = row + (png_size_t)((row_info->width - 1) >> 1);
  1469.             dp = row + (png_size_t)((final_width - 1) >> 1);
  1470. #if defined(PNG_READ_PACKSWAP_SUPPORTED)
  1471.             if (transformations & PNG_PACKSWAP)
  1472.             {
  1473.                sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
  1474.                dshift = (png_size_t)(((final_width + 1) & 1) << 2);
  1475.                s_start = 4;
  1476.                s_end = 0;
  1477.                s_inc = -4;
  1478.             }
  1479.             else
  1480. #endif
  1481.             {
  1482.                sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
  1483.                dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
  1484.                s_start = 0;
  1485.                s_end = 4;
  1486.                s_inc = 4;
  1487.             }
  1488.             for (i = row_info->width; i; i--)
  1489.             {
  1490.                png_byte v;
  1491.                int j;
  1492.                v = (png_byte)((*sp >> sshift) & 0xf);
  1493.                for (j = 0; j < png_pass_inc[pass]; j++)
  1494.                {
  1495.                   *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
  1496.                   *dp |= (png_byte)(v << dshift);
  1497.                   if (dshift == s_end)
  1498.                   {
  1499.                      dshift = s_start;
  1500.                      dp--;
  1501.                   }
  1502.                   else
  1503.                      dshift += s_inc;
  1504.                }
  1505.                if (sshift == s_end)
  1506.                {
  1507.                   sshift = s_start;
  1508.                   sp--;
  1509.                }
  1510.                else
  1511.                   sshift += s_inc;
  1512.             }
  1513.             break;
  1514.          }
  1515.        /*====================================================================*/
  1516.          default: /* 8-bit or larger (this is where the routine is modified) */
  1517.          {
  1518. #if 0
  1519. //          static unsigned long long _const4 = 0x0000000000FFFFFFLL;  no good
  1520. //          static unsigned long long const4 = 0x0000000000FFFFFFLL;   no good
  1521. //          unsigned long long _const4 = 0x0000000000FFFFFFLL;         no good
  1522. //          unsigned long long const4 = 0x0000000000FFFFFFLL;          no good
  1523. #endif
  1524.             png_bytep sptr, dp;
  1525.             png_uint_32 i;
  1526.             png_size_t pixel_bytes;
  1527.             int width = (int)row_info->width;
  1528.             pixel_bytes = (row_info->pixel_depth >> 3);
  1529.             /* point sptr at the last pixel in the pre-expanded row: */
  1530.             sptr = row + (width - 1) * pixel_bytes;
  1531.             /* point dp at the last pixel position in the expanded row: */
  1532.             dp = row + (final_width - 1) * pixel_bytes;
  1533.             /* New code by Nirav Chhatrapati - Intel Corporation */
  1534. #if defined(PNG_MMX_CODE_SUPPORTED)
  1535. #if !defined(PNG_1_0_X)
  1536.             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
  1537.                 /* && _mmx_supported */ )
  1538. #else
  1539.             if (_mmx_supported)
  1540. #endif
  1541.             {
  1542.                //--------------------------------------------------------------
  1543.                if (pixel_bytes == 3)
  1544.                {
  1545.                   if (((pass == 0) || (pass == 1)) && width)
  1546.                   {
  1547.                      int dummy_value_c;   // fix 'forbidden register spilled'
  1548.                      int dummy_value_S;
  1549.                      int dummy_value_D;
  1550.                      int dummy_value_a;
  1551.                      __asm__ __volatile__ (
  1552.                         "subl $21, %%edi         nt"
  1553.                                      // (png_pass_inc[pass] - 1)*pixel_bytes
  1554.                      ".loop3_pass0:              nt"
  1555.                         "movd (%%esi), %%mm0     nt" // x x x x x 2 1 0
  1556.                         "pand (%3), %%mm0        nt" // z z z z z 2 1 0
  1557.                         "movq %%mm0, %%mm1       nt" // z z z z z 2 1 0
  1558.                         "psllq $16, %%mm0        nt" // z z z 2 1 0 z z
  1559.                         "movq %%mm0, %%mm2       nt" // z z z 2 1 0 z z
  1560.                         "psllq $24, %%mm0        nt" // 2 1 0 z z z z z
  1561.                         "psrlq $8, %%mm1         nt" // z z z z z z 2 1
  1562.                         "por %%mm2, %%mm0        nt" // 2 1 0 2 1 0 z z
  1563.                         "por %%mm1, %%mm0        nt" // 2 1 0 2 1 0 2 1
  1564.                         "movq %%mm0, %%mm3       nt" // 2 1 0 2 1 0 2 1
  1565.                         "psllq $16, %%mm0        nt" // 0 2 1 0 2 1 z z
  1566.                         "movq %%mm3, %%mm4       nt" // 2 1 0 2 1 0 2 1
  1567.                         "punpckhdq %%mm0, %%mm3  nt" // 0 2 1 0 2 1 0 2
  1568.                         "movq %%mm4, 16(%%edi)   nt"
  1569.                         "psrlq $32, %%mm0        nt" // z z z z 0 2 1 0
  1570.                         "movq %%mm3, 8(%%edi)    nt"
  1571.                         "punpckldq %%mm4, %%mm0  nt" // 1 0 2 1 0 2 1 0
  1572.                         "subl $3, %%esi          nt"
  1573.                         "movq %%mm0, (%%edi)     nt"
  1574.                         "subl $24, %%edi         nt"
  1575.                         "decl %%ecx              nt"
  1576.                         "jnz .loop3_pass0        nt"
  1577.                         "EMMS                    nt" // DONE
  1578.                         : "=c" (dummy_value_c),        // output regs (dummy)
  1579.                           "=S" (dummy_value_S),
  1580.                           "=D" (dummy_value_D),
  1581.                           "=a" (dummy_value_a)
  1582.                         : "1" (sptr),      // esi      // input regs
  1583.                           "2" (dp),        // edi
  1584.                           "0" (width),     // ecx
  1585.                           "3" (&_const4)  // %1(?)  (0x0000000000FFFFFFLL)
  1586. #if 0  /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
  1587.                         : "%mm0", "%mm1", "%mm2"       // clobber list
  1588.                         , "%mm3", "%mm4"
  1589. #endif
  1590.                      );
  1591.                   }
  1592.                   else if (((pass == 2) || (pass == 3)) && width)
  1593.                   {
  1594.                      int dummy_value_c;   // fix 'forbidden register spilled'
  1595.                      int dummy_value_S;
  1596.                      int dummy_value_D;
  1597.                      int dummy_value_a;
  1598.                      __asm__ __volatile__ (
  1599.                         "subl $9, %%edi          nt"
  1600.                                      // (png_pass_inc[pass] - 1)*pixel_bytes
  1601.                      ".loop3_pass2:              nt"
  1602.                         "movd (%%esi), %%mm0     nt" // x x x x x 2 1 0
  1603.                         "pand (%3), %%mm0     nt" // z z z z z 2 1 0
  1604.                         "movq %%mm0, %%mm1       nt" // z z z z z 2 1 0
  1605.                         "psllq $16, %%mm0        nt" // z z z 2 1 0 z z
  1606.                         "movq %%mm0, %%mm2       nt" // z z z 2 1 0 z z
  1607.                         "psllq $24, %%mm0        nt" // 2 1 0 z z z z z
  1608.                         "psrlq $8, %%mm1         nt" // z z z z z z 2 1
  1609.                         "por %%mm2, %%mm0        nt" // 2 1 0 2 1 0 z z
  1610.                         "por %%mm1, %%mm0        nt" // 2 1 0 2 1 0 2 1
  1611.                         "movq %%mm0, 4(%%edi)    nt"
  1612.                         "psrlq $16, %%mm0        nt" // z z 2 1 0 2 1 0
  1613.                         "subl $3, %%esi          nt"
  1614.                         "movd %%mm0, (%%edi)     nt"
  1615.                         "subl $12, %%edi         nt"
  1616.                         "decl %%ecx              nt"
  1617.                         "jnz .loop3_pass2        nt"
  1618.                         "EMMS                    nt" // DONE
  1619.                         : "=c" (dummy_value_c),        // output regs (dummy)
  1620.                           "=S" (dummy_value_S),
  1621.                           "=D" (dummy_value_D),
  1622.                           "=a" (dummy_value_a)
  1623.                         : "1" (sptr),      // esi      // input regs
  1624.                           "2" (dp),        // edi
  1625.                           "0" (width),     // ecx
  1626.                           "3" (&_const4)  // (0x0000000000FFFFFFLL)
  1627. #if 0  /* %mm0, ..., %mm2 not supported by gcc 2.7.2.3 or egcs 1.1 */
  1628.                         : "%mm0", "%mm1", "%mm2"       // clobber list
  1629. #endif
  1630.                      );
  1631.                   }
  1632.                   else if (width) /* && ((pass == 4) || (pass == 5)) */
  1633.                   {
  1634.                      int width_mmx = ((width >> 1) << 1) - 8;   // GRR:  huh?
  1635.                      if (width_mmx < 0)
  1636.                          width_mmx = 0;
  1637.                      width -= width_mmx;        // 8 or 9 pix, 24 or 27 bytes
  1638.                      if (width_mmx)
  1639.                      {
  1640.                         // png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
  1641.                         // sptr points at last pixel in pre-expanded row
  1642.                         // dp points at last pixel position in expanded row
  1643.                         int dummy_value_c;  // fix 'forbidden register spilled'
  1644.                         int dummy_value_S;
  1645.                         int dummy_value_D;
  1646.                         int dummy_value_a;
  1647.                         int dummy_value_d;
  1648.                         __asm__ __volatile__ (
  1649.                            "subl $3, %%esi          nt"
  1650.                            "subl $9, %%edi          nt"
  1651.                                         // (png_pass_inc[pass] + 1)*pixel_bytes
  1652.                         ".loop3_pass4:              nt"
  1653.                            "movq (%%esi), %%mm0     nt" // x x 5 4 3 2 1 0
  1654.                            "movq %%mm0, %%mm1       nt" // x x 5 4 3 2 1 0
  1655.                            "movq %%mm0, %%mm2       nt" // x x 5 4 3 2 1 0
  1656.                            "psllq $24, %%mm0        nt" // 4 3 2 1 0 z z z
  1657.                            "pand (%3), %%mm1          nt" // z z z z z 2 1 0
  1658.                            "psrlq $24, %%mm2        nt" // z z z x x 5 4 3
  1659.                            "por %%mm1, %%mm0        nt" // 4 3 2 1 0 2 1 0
  1660.                            "movq %%mm2, %%mm3       nt" // z z z x x 5 4 3
  1661.                            "psllq $8, %%mm2         nt" // z z x x 5 4 3 z
  1662.                            "movq %%mm0, (%%edi)     nt"
  1663.                            "psrlq $16, %%mm3        nt" // z z z z z x x 5
  1664.                            "pand (%4), %%mm3     nt" // z z z z z z z 5
  1665.                            "por %%mm3, %%mm2        nt" // z z x x 5 4 3 5
  1666.                            "subl $6, %%esi          nt"
  1667.                            "movd %%mm2, 8(%%edi)    nt"
  1668.                            "subl $12, %%edi         nt"
  1669.                            "subl $2, %%ecx          nt"
  1670.                            "jnz .loop3_pass4        nt"
  1671.                            "EMMS                    nt" // DONE
  1672.                            : "=c" (dummy_value_c),        // output regs (dummy)
  1673.                              "=S" (dummy_value_S),
  1674.                              "=D" (dummy_value_D),
  1675.                              "=a" (dummy_value_a),
  1676.                              "=d" (dummy_value_d)
  1677.                            : "1" (sptr),      // esi      // input regs
  1678.                              "2" (dp),        // edi
  1679.                              "0" (width_mmx), // ecx
  1680.                              "3" (&_const4), // 0x0000000000FFFFFFLL
  1681.                              "4" (&_const6)  // 0x00000000000000FFLL
  1682. #if 0  /* %mm0, ..., %mm3 not supported by gcc 2.7.2.3 or egcs 1.1 */
  1683.                            : "%mm0", "%mm1"               // clobber list
  1684.                            , "%mm2", "%mm3"
  1685. #endif
  1686.                         );
  1687.                      }
  1688.                      sptr -= width_mmx*3;
  1689.                      dp -= width_mmx*6;
  1690.                      for (i = width; i; i--)
  1691.                      {
  1692.                         png_byte v[8];
  1693.                         int j;
  1694.                         png_memcpy(v, sptr, 3);
  1695.                         for (j = 0; j < png_pass_inc[pass]; j++)
  1696.                         {
  1697.                            png_memcpy(dp, v, 3);
  1698.                            dp -= 3;
  1699.                         }
  1700.                         sptr -= 3;
  1701.                      }
  1702.                   }
  1703.                } /* end of pixel_bytes == 3 */
  1704.                //--------------------------------------------------------------
  1705.                else if (pixel_bytes == 1)
  1706.                {
  1707.                   if (((pass == 0) || (pass == 1)) && width)
  1708.                   {
  1709.                      int width_mmx = ((width >> 2) << 2);
  1710.                      width -= width_mmx;        // 0-3 pixels => 0-3 bytes
  1711.                      if (width_mmx)
  1712.                      {
  1713.                         int dummy_value_c;  // fix 'forbidden register spilled'
  1714.                         int dummy_value_S;
  1715.                         int dummy_value_D;
  1716.                         __asm__ __volatile__ (
  1717.                            "subl $3, %%esi          nt"
  1718.                            "subl $31, %%edi         nt"
  1719.                         ".loop1_pass0:              nt"
  1720.                            "movd (%%esi), %%mm0     nt" // x x x x 3 2 1 0
  1721.                            "movq %%mm0, %%mm1       nt" // x x x x 3 2 1 0
  1722.                            "punpcklbw %%mm0, %%mm0  nt" // 3 3 2 2 1 1 0 0
  1723.                            "movq %%mm0, %%mm2       nt" // 3 3 2 2 1 1 0 0
  1724.                            "punpcklwd %%mm0, %%mm0  nt" // 1 1 1 1 0 0 0 0
  1725.                            "movq %%mm0, %%mm3       nt" // 1 1 1 1 0 0 0 0
  1726.                            "punpckldq %%mm0, %%mm0  nt" // 0 0 0 0 0 0 0 0
  1727.                            "punpckhdq %%mm3, %%mm3  nt" // 1 1 1 1 1 1 1 1
  1728.                            "movq %%mm0, (%%edi)     nt"
  1729.                            "punpckhwd %%mm2, %%mm2  nt" // 3 3 3 3 2 2 2 2
  1730.                            "movq %%mm3, 8(%%edi)    nt"
  1731.                            "movq %%mm2, %%mm4       nt" // 3 3 3 3 2 2 2 2
  1732.                            "punpckldq %%mm2, %%mm2  nt" // 2 2 2 2 2 2 2 2
  1733.                            "punpckhdq %%mm4, %%mm4  nt" // 3 3 3 3 3 3 3 3
  1734.                            "movq %%mm2, 16(%%edi)   nt"
  1735.                            "subl $4, %%esi          nt"
  1736.                            "movq %%mm4, 24(%%edi)   nt"
  1737.                            "subl $32, %%edi         nt"
  1738.                            "subl $4, %%ecx          nt"
  1739.                            "jnz .loop1_pass0        nt"
  1740.                            "EMMS                    nt" // DONE
  1741.                            : "=c" (dummy_value_c),        // output regs (dummy)
  1742.                              "=S" (dummy_value_S),
  1743.                              "=D" (dummy_value_D)
  1744.                            : "1" (sptr),      // esi      // input regs
  1745.                              "2" (dp),        // edi
  1746.                              "0" (width_mmx)  // ecx
  1747. #if 0  /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
  1748.                            : "%mm0", "%mm1", "%mm2"       // clobber list
  1749.                            , "%mm3", "%mm4"
  1750. #endif
  1751.                         );
  1752.                      }
  1753.                      sptr -= width_mmx;
  1754.                      dp -= width_mmx*8;
  1755.                      for (i = width; i; i--)
  1756.                      {
  1757.                         int j;
  1758.                        /* I simplified this part in version 1.0.4e
  1759.                         * here and in several other instances where
  1760.                         * pixel_bytes == 1  -- GR-P
  1761.                         *
  1762.                         * Original code:
  1763.                         *
  1764.                         * png_byte v[8];
  1765.                         * png_memcpy(v, sptr, pixel_bytes);
  1766.                         * for (j = 0; j < png_pass_inc[pass]; j++)
  1767.                         * {
  1768.                         *    png_memcpy(dp, v, pixel_bytes);
  1769.                         *    dp -= pixel_bytes;
  1770.                         * }
  1771.                         * sptr -= pixel_bytes;
  1772.                         *
  1773.                         * Replacement code is in the next three lines:
  1774.                         */
  1775.                         for (j = 0; j < png_pass_inc[pass]; j++)
  1776.                         {
  1777.                            *dp-- = *sptr;
  1778.                         }
  1779.                         --sptr;
  1780.                      }
  1781.                   }
  1782.                   else if (((pass == 2) || (pass == 3)) && width)
  1783.                   {
  1784.                      int width_mmx = ((width >> 2) << 2);
  1785.                      width -= width_mmx;        // 0-3 pixels => 0-3 bytes
  1786.                      if (width_mmx)
  1787.                      {
  1788.                         int dummy_value_c;  // fix 'forbidden register spilled'
  1789.                         int dummy_value_S;
  1790.                         int dummy_value_D;
  1791.                         __asm__ __volatile__ (
  1792.                            "subl $3, %%esi          nt"
  1793.                            "subl $15, %%edi         nt"
  1794.                         ".loop1_pass2:              nt"
  1795.                            "movd (%%esi), %%mm0     nt" // x x x x 3 2 1 0
  1796.                            "punpcklbw %%mm0, %%mm0  nt" // 3 3 2 2 1 1 0 0
  1797.                            "movq %%mm0, %%mm1       nt" // 3 3 2 2 1 1 0 0
  1798.                            "punpcklwd %%mm0, %%mm0  nt" // 1 1 1 1 0 0 0 0
  1799.                            "punpckhwd %%mm1, %%mm1  nt" // 3 3 3 3 2 2 2 2
  1800.                            "movq %%mm0, (%%edi)     nt"
  1801.                            "subl $4, %%esi          nt"
  1802.                            "movq %%mm1, 8(%%edi)    nt"
  1803.                            "subl $16, %%edi         nt"
  1804.                            "subl $4, %%ecx          nt"
  1805.                            "jnz .loop1_pass2        nt"
  1806.                            "EMMS                    nt" // DONE
  1807.                            : "=c" (dummy_value_c),        // output regs (dummy)
  1808.                              "=S" (dummy_value_S),
  1809.                              "=D" (dummy_value_D)
  1810.                            : "1" (sptr),      // esi      // input regs
  1811.                              "2" (dp),        // edi
  1812.                              "0" (width_mmx)  // ecx
  1813. #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
  1814.                            : "%mm0", "%mm1"               // clobber list
  1815. #endif
  1816.                         );
  1817.                      }
  1818.                      sptr -= width_mmx;
  1819.                      dp -= width_mmx*4;
  1820.                      for (i = width; i; i--)
  1821.                      {
  1822.                         int j;
  1823.                         for (j = 0; j < png_pass_inc[pass]; j++)
  1824.                         {
  1825.                            *dp-- = *sptr;
  1826.                         }
  1827.                         --sptr;
  1828.                      }
  1829.                   }
  1830.                   else if (width)  /* && ((pass == 4) || (pass == 5)) */
  1831.                   {
  1832.                      int width_mmx = ((width >> 3) << 3);
  1833.                      width -= width_mmx;        // 0-7 pixels => 0-7 bytes
  1834.                      if (width_mmx)
  1835.                      {
  1836.                         int dummy_value_c;  // fix 'forbidden register spilled'
  1837.                         int dummy_value_S;
  1838.                         int dummy_value_D;
  1839.                         __asm__ __volatile__ (
  1840.                            "subl $7, %%esi          nt"
  1841.                            "subl $15, %%edi         nt"
  1842.                         ".loop1_pass4:              nt"
  1843.                            "movq (%%esi), %%mm0     nt" // 7 6 5 4 3 2 1 0
  1844.                            "movq %%mm0, %%mm1       nt" // 7 6 5 4 3 2 1 0
  1845.                            "punpcklbw %%mm0, %%mm0  nt" // 3 3 2 2 1 1 0 0
  1846.                            "punpckhbw %%mm1, %%mm1  nt" // 7 7 6 6 5 5 4 4
  1847.                            "movq %%mm1, 8(%%edi)    nt"
  1848.                            "subl $8, %%esi          nt"
  1849.                            "movq %%mm0, (%%edi)     nt"
  1850.                            "subl $16, %%edi         nt"
  1851.                            "subl $8, %%ecx          nt"
  1852.                            "jnz .loop1_pass4        nt"
  1853.                            "EMMS                    nt" // DONE
  1854.                            : "=c" (dummy_value_c),        // output regs (dummy)
  1855.                              "=S" (dummy_value_S),
  1856.                              "=D" (dummy_value_D)
  1857.                            : "1" (sptr),      // esi      // input regs
  1858.                              "2" (dp),        // edi
  1859.                              "0" (width_mmx)  // ecx
  1860. #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
  1861.                            : "%mm0", "%mm1"               // clobber list
  1862. #endif
  1863.                         );
  1864.                      }
  1865.                      sptr -= width_mmx;
  1866.                      dp -= width_mmx*2;
  1867.                      for (i = width; i; i--)
  1868.                      {
  1869.                         int j;
  1870.                         for (j = 0; j < png_pass_inc[pass]; j++)
  1871.                         {
  1872.                            *dp-- = *sptr;
  1873.                         }
  1874.                         --sptr;
  1875.                      }
  1876.                   }
  1877.                } /* end of pixel_bytes == 1 */
  1878.                //--------------------------------------------------------------
  1879.                else if (pixel_bytes == 2)
  1880.                {
  1881.                   if (((pass == 0) || (pass == 1)) && width)
  1882.                   {
  1883.                      int width_mmx = ((width >> 1) << 1);
  1884.                      width -= width_mmx;        // 0,1 pixels => 0,2 bytes
  1885.                      if (width_mmx)
  1886.                      {
  1887.                         int dummy_value_c;  // fix 'forbidden register spilled'
  1888.                         int dummy_value_S;
  1889.                         int dummy_value_D;
  1890.                         __asm__ __volatile__ (
  1891.                            "subl $2, %%esi          nt"
  1892.                            "subl $30, %%edi         nt"
  1893.                         ".loop2_pass0:              nt"
  1894.                            "movd (%%esi), %%mm0     nt" // x x x x 3 2 1 0
  1895.                            "punpcklwd %%mm0, %%mm0  nt" // 3 2 3 2 1 0 1 0
  1896.                            "movq %%mm0, %%mm1       nt" // 3 2 3 2 1 0 1 0
  1897.                            "punpckldq %%mm0, %%mm0  nt" // 1 0 1 0 1 0 1 0
  1898.                            "punpckhdq %%mm1, %%mm1  nt" // 3 2 3 2 3 2 3 2
  1899.                            "movq %%mm0, (%%edi)     nt"
  1900.                            "movq %%mm0, 8(%%edi)    nt"
  1901.                            "movq %%mm1, 16(%%edi)   nt"
  1902.                            "subl $4, %%esi          nt"
  1903.                            "movq %%mm1, 24(%%edi)   nt"
  1904.                            "subl $32, %%edi         nt"
  1905.                            "subl $2, %%ecx          nt"
  1906.                            "jnz .loop2_pass0        nt"
  1907.                            "EMMS                    nt" // DONE
  1908.                            : "=c" (dummy_value_c),        // output regs (dummy)
  1909.                              "=S" (dummy_value_S),
  1910.                              "=D" (dummy_value_D)
  1911.                            : "1" (sptr),      // esi      // input regs
  1912.                              "2" (dp),        // edi
  1913.                              "0" (width_mmx)  // ecx
  1914. #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
  1915.                            : "%mm0", "%mm1"               // clobber list
  1916. #endif
  1917.                         );
  1918.                      }
  1919.                      sptr -= (width_mmx*2 - 2); // sign fixed
  1920.                      dp -= (width_mmx*16 - 2);  // sign fixed
  1921.                      for (i = width; i; i--)
  1922.                      {
  1923.                         png_byte v[8];
  1924.                         int j;
  1925.                         sptr -= 2;
  1926.                         png_memcpy(v, sptr, 2);
  1927.                         for (j = 0; j < png_pass_inc[pass]; j++)
  1928.                         {
  1929.                            dp -= 2;
  1930.                            png_memcpy(dp, v, 2);
  1931.                         }
  1932.                      }
  1933.                   }
  1934.                   else if (((pass == 2) || (pass == 3)) && width)
  1935.                   {
  1936.                      int width_mmx = ((width >> 1) << 1) ;
  1937.                      width -= width_mmx;        // 0,1 pixels => 0,2 bytes
  1938.                      if (width_mmx)
  1939.                      {
  1940.                         int dummy_value_c;  // fix 'forbidden register spilled'
  1941.                         int dummy_value_S;
  1942.                         int dummy_value_D;
  1943.                         __asm__ __volatile__ (
  1944.                            "subl $2, %%esi          nt"
  1945.                            "subl $14, %%edi         nt"
  1946.                         ".loop2_pass2:              nt"
  1947.                            "movd (%%esi), %%mm0     nt" // x x x x 3 2 1 0
  1948.                            "punpcklwd %%mm0, %%mm0  nt" // 3 2 3 2 1 0 1 0
  1949.                            "movq %%mm0, %%mm1       nt" // 3 2 3 2 1 0 1 0
  1950.                            "punpckldq %%mm0, %%mm0  nt" // 1 0 1 0 1 0 1 0
  1951.                            "punpckhdq %%mm1, %%mm1  nt" // 3 2 3 2 3 2 3 2
  1952.                            "movq %%mm0, (%%edi)     nt"
  1953.                            "subl $4, %%esi          nt"
  1954.                            "movq %%mm1, 8(%%edi)    nt"
  1955.                            "subl $16, %%edi         nt"
  1956.                            "subl $2, %%ecx          nt"
  1957.                            "jnz .loop2_pass2        nt"
  1958.                            "EMMS                    nt" // DONE
  1959.                            : "=c" (dummy_value_c),        // output regs (dummy)
  1960.                              "=S" (dummy_value_S),
  1961.                              "=D" (dummy_value_D)
  1962.                            : "1" (sptr),      // esi      // input regs
  1963.                              "2" (dp),        // edi
  1964.                              "0" (width_mmx)  // ecx
  1965. #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
  1966.                            : "%mm0", "%mm1"               // clobber list
  1967. #endif
  1968.                         );
  1969.                      }
  1970.                      sptr -= (width_mmx*2 - 2); // sign fixed
  1971.                      dp -= (width_mmx*8 - 2);   // sign fixed
  1972.                      for (i = width; i; i--)
  1973.                      {
  1974.                         png_byte v[8];
  1975.                         int j;
  1976.                         sptr -= 2;
  1977.                         png_memcpy(v, sptr, 2);
  1978.                         for (j = 0; j < png_pass_inc[pass]; j++)
  1979.                         {
  1980.                            dp -= 2;
  1981.                            png_memcpy(dp, v, 2);
  1982.                         }
  1983.                      }
  1984.                   }
  1985.                   else if (width)  // pass == 4 or 5
  1986.                   {
  1987.                      int width_mmx = ((width >> 1) << 1) ;
  1988.                      width -= width_mmx;        // 0,1 pixels => 0,2 bytes
  1989.                      if (width_mmx)
  1990.                      {
  1991.                         int dummy_value_c;  // fix 'forbidden register spilled'
  1992.                         int dummy_value_S;
  1993.                         int dummy_value_D;
  1994.                         __asm__ __volatile__ (
  1995.                            "subl $2, %%esi          nt"
  1996.                            "subl $6, %%edi          nt"
  1997.                         ".loop2_pass4:              nt"
  1998.                            "movd (%%esi), %%mm0     nt" // x x x x 3 2 1 0
  1999.                            "punpcklwd %%mm0, %%mm0  nt" // 3 2 3 2 1 0 1 0
  2000.                            "subl $4, %%esi          nt"
  2001.                            "movq %%mm0, (%%edi)     nt"
  2002.                            "subl $8, %%edi          nt"
  2003.                            "subl $2, %%ecx          nt"
  2004.                            "jnz .loop2_pass4        nt"
  2005.                            "EMMS                    nt" // DONE
  2006.                            : "=c" (dummy_value_c),        // output regs (dummy)
  2007.                              "=S" (dummy_value_S),
  2008.                              "=D" (dummy_value_D)
  2009.                            : "1" (sptr),      // esi      // input regs
  2010.                              "2" (dp),        // edi
  2011.                              "0" (width_mmx)  // ecx
  2012. #if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
  2013.                            : "%mm0"                       // clobber list
  2014. #endif
  2015.                         );
  2016.                      }
  2017.                      sptr -= (width_mmx*2 - 2); // sign fixed
  2018.                      dp -= (width_mmx*4 - 2);   // sign fixed
  2019.                      for (i = width; i; i--)
  2020.                      {
  2021.                         png_byte v[8];
  2022.                         int j;
  2023.                         sptr -= 2;
  2024.                         png_memcpy(v, sptr, 2);
  2025.                         for (j = 0; j < png_pass_inc[pass]; j++)
  2026.                         {
  2027.                            dp -= 2;
  2028.                            png_memcpy(dp, v, 2);
  2029.                         }
  2030.                      }
  2031.                   }
  2032.                } /* end of pixel_bytes == 2 */
  2033.                //--------------------------------------------------------------
  2034.                else if (pixel_bytes == 4)
  2035.                {
  2036.                   if (((pass == 0) || (pass == 1)) && width)
  2037.                   {
  2038.                      int width_mmx = ((width >> 1) << 1);
  2039.                      width -= width_mmx;        // 0,1 pixels => 0,4 bytes
  2040.                      if (width_mmx)
  2041.                      {
  2042.                         int dummy_value_c;  // fix 'forbidden register spilled'
  2043.                         int dummy_value_S;
  2044.                         int dummy_value_D;
  2045.                         __asm__ __volatile__ (
  2046.                            "subl $4, %%esi          nt"
  2047.                            "subl $60, %%edi         nt"
  2048.                         ".loop4_pass0:              nt"
  2049.                            "movq (%%esi), %%mm0     nt" // 7 6 5 4 3 2 1 0
  2050.                            "movq %%mm0, %%mm1       nt" // 7 6 5 4 3 2 1 0
  2051.                            "punpckldq %%mm0, %%mm0  nt" // 3 2 1 0 3 2 1 0
  2052.                            "punpckhdq %%mm1, %%mm1  nt" // 7 6 5 4 7 6 5 4
  2053.                            "movq %%mm0, (%%edi)     nt"
  2054.                            "movq %%mm0, 8(%%edi)    nt"
  2055.                            "movq %%mm0, 16(%%edi)   nt"
  2056.                            "movq %%mm0, 24(%%edi)   nt"
  2057.                            "movq %%mm1, 32(%%edi)   nt"
  2058.                            "movq %%mm1, 40(%%edi)   nt"
  2059.                            "movq %%mm1, 48(%%edi)   nt"
  2060.                            "subl $8, %%esi          nt"
  2061.                            "movq %%mm1, 56(%%edi)   nt"
  2062.                            "subl $64, %%edi         nt"
  2063.                            "subl $2, %%ecx          nt"
  2064.                            "jnz .loop4_pass0        nt"
  2065.                            "EMMS                    nt" // DONE
  2066.                            : "=c" (dummy_value_c),        // output regs (dummy)
  2067.                              "=S" (dummy_value_S),
  2068.                              "=D" (dummy_value_D)
  2069.                            : "1" (sptr),      // esi      // input regs
  2070.                              "2" (dp),        // edi
  2071.                              "0" (width_mmx)  // ecx
  2072. #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
  2073.                            : "%mm0", "%mm1"               // clobber list
  2074. #endif
  2075.                         );
  2076.                      }
  2077.                      sptr -= (width_mmx*4 - 4); // sign fixed
  2078.                      dp -= (width_mmx*32 - 4);  // sign fixed
  2079.                      for (i = width; i; i--)
  2080.                      {
  2081.                         png_byte v[8];
  2082.                         int j;
  2083.                         sptr -= 4;
  2084.                         png_memcpy(v, sptr, 4);
  2085.                         for (j = 0; j < png_pass_inc[pass]; j++)
  2086.                         {
  2087.                            dp -= 4;
  2088.                            png_memcpy(dp, v, 4);
  2089.                         }
  2090.                      }
  2091.                   }
  2092.                   else if (((pass == 2) || (pass == 3)) && width)
  2093.                   {
  2094.                      int width_mmx = ((width >> 1) << 1);
  2095.                      width -= width_mmx;        // 0,1 pixels => 0,4 bytes
  2096.                      if (width_mmx)
  2097.                      {
  2098.                         int dummy_value_c;  // fix 'forbidden register spilled'
  2099.                         int dummy_value_S;
  2100.                         int dummy_value_D;
  2101.                         __asm__ __volatile__ (
  2102.                            "subl $4, %%esi          nt"
  2103.                            "subl $28, %%edi         nt"
  2104.                         ".loop4_pass2:              nt"
  2105.                            "movq (%%esi), %%mm0     nt" // 7 6 5 4 3 2 1 0
  2106.                            "movq %%mm0, %%mm1       nt" // 7 6 5 4 3 2 1 0
  2107.                            "punpckldq %%mm0, %%mm0  nt" // 3 2 1 0 3 2 1 0
  2108.                            "punpckhdq %%mm1, %%mm1  nt" // 7 6 5 4 7 6 5 4
  2109.                            "movq %%mm0, (%%edi)     nt"
  2110.                            "movq %%mm0, 8(%%edi)    nt"
  2111.                            "movq %%mm1, 16(%%edi)   nt"
  2112.                            "movq %%mm1, 24(%%edi)   nt"
  2113.                            "subl $8, %%esi          nt"
  2114.                            "subl $32, %%edi         nt"
  2115.                            "subl $2, %%ecx          nt"
  2116.                            "jnz .loop4_pass2        nt"
  2117.                            "EMMS                    nt" // DONE
  2118.                            : "=c" (dummy_value_c),        // output regs (dummy)
  2119.                              "=S" (dummy_value_S),
  2120.                              "=D" (dummy_value_D)
  2121.                            : "1" (sptr),      // esi      // input regs
  2122.                              "2" (dp),        // edi
  2123.                              "0" (width_mmx)  // ecx
  2124. #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
  2125.                            : "%mm0", "%mm1"               // clobber list
  2126. #endif
  2127.                         );
  2128.                      }
  2129.                      sptr -= (width_mmx*4 - 4); // sign fixed
  2130.                      dp -= (width_mmx*16 - 4);  // sign fixed
  2131.                      for (i = width; i; i--)
  2132.                      {
  2133.                         png_byte v[8];
  2134.                         int j;
  2135.                         sptr -= 4;
  2136.                         png_memcpy(v, sptr, 4);
  2137.                         for (j = 0; j < png_pass_inc[pass]; j++)
  2138.                         {
  2139.                            dp -= 4;
  2140.                            png_memcpy(dp, v, 4);
  2141.                         }
  2142.                      }
  2143.                   }
  2144.                   else if (width)  // pass == 4 or 5
  2145.                   {
  2146.                      int width_mmx = ((width >> 1) << 1) ;
  2147.                      width -= width_mmx;        // 0,1 pixels => 0,4 bytes
  2148.                      if (width_mmx)
  2149.                      {
  2150.                         int dummy_value_c;  // fix 'forbidden register spilled'
  2151.                         int dummy_value_S;
  2152.                         int dummy_value_D;
  2153.                         __asm__ __volatile__ (
  2154.                            "subl $4, %%esi          nt"
  2155.                            "subl $12, %%edi         nt"
  2156.                         ".loop4_pass4:              nt"
  2157.                            "movq (%%esi), %%mm0     nt" // 7 6 5 4 3 2 1 0
  2158.                            "movq %%mm0, %%mm1       nt" // 7 6 5 4 3 2 1 0
  2159.                            "punpckldq %%mm0, %%mm0  nt" // 3 2 1 0 3 2 1 0
  2160.                            "punpckhdq %%mm1, %%mm1  nt" // 7 6 5 4 7 6 5 4
  2161.                            "movq %%mm0, (%%edi)     nt"
  2162.                            "subl $8, %%esi          nt"
  2163.                            "movq %%mm1, 8(%%edi)    nt"
  2164.                            "subl $16, %%edi         nt"
  2165.                            "subl $2, %%ecx          nt"
  2166.                            "jnz .loop4_pass4        nt"
  2167.                            "EMMS                    nt" // DONE
  2168.                            : "=c" (dummy_value_c),        // output regs (dummy)
  2169.                              "=S" (dummy_value_S),
  2170.                              "=D" (dummy_value_D)
  2171.                            : "1" (sptr),      // esi      // input regs
  2172.                              "2" (dp),        // edi
  2173.                              "0" (width_mmx)  // ecx
  2174. #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
  2175.                            : "%mm0", "%mm1"               // clobber list
  2176. #endif
  2177.                         );
  2178.                      }
  2179.                      sptr -= (width_mmx*4 - 4); // sign fixed
  2180.                      dp -= (width_mmx*8 - 4);   // sign fixed
  2181.                      for (i = width; i; i--)
  2182.                      {
  2183.                         png_byte v[8];
  2184.                         int j;
  2185.                         sptr -= 4;
  2186.                         png_memcpy(v, sptr, 4);
  2187.                         for (j = 0; j < png_pass_inc[pass]; j++)
  2188.                         {
  2189.                            dp -= 4;
  2190.                            png_memcpy(dp, v, 4);
  2191.                         }
  2192.                      }
  2193.                   }
  2194.                } /* end of pixel_bytes == 4 */
  2195.                //--------------------------------------------------------------
  2196.                else if (pixel_bytes == 8)
  2197.                {
  2198. // GRR TEST:  should work, but needs testing (special 64-bit version of rpng2?)
  2199.                   // GRR NOTE:  no need to combine passes here!
  2200.                   if (((pass == 0) || (pass == 1)) && width)
  2201.                   {
  2202.                      int dummy_value_c;  // fix 'forbidden register spilled'
  2203.                      int dummy_value_S;
  2204.                      int dummy_value_D;
  2205.                      // source is 8-byte RRGGBBAA
  2206.                      // dest is 64-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA ...
  2207.                      __asm__ __volatile__ (
  2208.                         "subl $56, %%edi         nt" // start of last block
  2209.                      ".loop8_pass0:              nt"
  2210.                         "movq (%%esi), %%mm0     nt" // 7 6 5 4 3 2 1 0
  2211.                         "movq %%mm0, (%%edi)     nt"
  2212.                         "movq %%mm0, 8(%%edi)    nt"
  2213.                         "movq %%mm0, 16(%%edi)   nt"
  2214.                         "movq %%mm0, 24(%%edi)   nt"
  2215.                         "movq %%mm0, 32(%%edi)   nt"
  2216.                         "movq %%mm0, 40(%%edi)   nt"
  2217.                         "movq %%mm0, 48(%%edi)   nt"
  2218.                         "subl $8, %%esi          nt"
  2219.                         "movq %%mm0, 56(%%edi)   nt"
  2220.                         "subl $64, %%edi         nt"
  2221.                         "decl %%ecx              nt"
  2222.                         "jnz .loop8_pass0        nt"
  2223.                         "EMMS                    nt" // DONE
  2224.                         : "=c" (dummy_value_c),        // output regs (dummy)
  2225.                           "=S" (dummy_value_S),
  2226.                           "=D" (dummy_value_D)
  2227.                         : "1" (sptr),      // esi      // input regs
  2228.                           "2" (dp),        // edi
  2229.                           "0" (width)      // ecx
  2230. #if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
  2231.                         : "%mm0"                       // clobber list
  2232. #endif
  2233.                      );
  2234.                   }
  2235.                   else if (((pass == 2) || (pass == 3)) && width)
  2236.                   {
  2237.                      // source is 8-byte RRGGBBAA
  2238.                      // dest is 32-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA
  2239.                      // (recall that expansion is _in place_:  sptr and dp
  2240.                      //  both point at locations within same row buffer)
  2241.                      {
  2242.                         int dummy_value_c;  // fix 'forbidden register spilled'
  2243.                         int dummy_value_S;
  2244.                         int dummy_value_D;
  2245.                         __asm__ __volatile__ (
  2246.                            "subl $24, %%edi         nt" // start of last block
  2247.                         ".loop8_pass2:              nt"
  2248.                            "movq (%%esi), %%mm0     nt" // 7 6 5 4 3 2 1 0
  2249.                            "movq %%mm0, (%%edi)     nt"
  2250.                            "movq %%mm0, 8(%%edi)    nt"
  2251.                            "movq %%mm0, 16(%%edi)   nt"
  2252.                            "subl $8, %%esi          nt"
  2253.                            "movq %%mm0, 24(%%edi)   nt"
  2254.                            "subl $32, %%edi         nt"
  2255.                            "decl %%ecx              nt"
  2256.                            "jnz .loop8_pass2        nt"
  2257.                            "EMMS                    nt" // DONE
  2258.                            : "=c" (dummy_value_c),        // output regs (dummy)
  2259.                              "=S" (dummy_value_S),
  2260.                              "=D" (dummy_value_D)
  2261.                            : "1" (sptr),      // esi      // input regs
  2262.                              "2" (dp),        // edi
  2263.                              "0" (width)      // ecx
  2264. #if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
  2265.                            : "%mm0"                       // clobber list
  2266. #endif
  2267.                         );
  2268.                      }
  2269.                   }
  2270.                   else if (width)  // pass == 4 or 5
  2271.                   {
  2272.                      // source is 8-byte RRGGBBAA
  2273.                      // dest is 16-byte RRGGBBAA RRGGBBAA
  2274.                      {
  2275.                         int dummy_value_c;  // fix 'forbidden register spilled'
  2276.                         int dummy_value_S;
  2277.                         int dummy_value_D;
  2278.                         __asm__ __volatile__ (
  2279.                            "subl $8, %%edi          nt" // start of last block
  2280.                         ".loop8_pass4:              nt"
  2281.                            "movq (%%esi), %%mm0     nt" // 7 6 5 4 3 2 1 0
  2282.                            "movq %%mm0, (%%edi)     nt"
  2283.                            "subl $8, %%esi          nt"
  2284.                            "movq %%mm0, 8(%%edi)    nt"
  2285.                            "subl $16, %%edi         nt"
  2286.                            "decl %%ecx              nt"
  2287.                            "jnz .loop8_pass4        nt"
  2288.                            "EMMS                    nt" // DONE
  2289.                            : "=c" (dummy_value_c),        // output regs (dummy)
  2290.                              "=S" (dummy_value_S),
  2291.                              "=D" (dummy_value_D)
  2292.                            : "1" (sptr),      // esi      // input regs
  2293.                              "2" (dp),        // edi
  2294.                              "0" (width)      // ecx
  2295. #if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
  2296.                            : "%mm0"                       // clobber list
  2297. #endif
  2298.                         );
  2299.                      }
  2300.                   }
  2301.                } /* end of pixel_bytes == 8 */
  2302.                //--------------------------------------------------------------
  2303.                else if (pixel_bytes == 6)
  2304.                {
  2305.                   for (i = width; i; i--)
  2306.                   {
  2307.                      png_byte v[8];
  2308.                      int j;
  2309.                      png_memcpy(v, sptr, 6);
  2310.                      for (j = 0; j < png_pass_inc[pass]; j++)
  2311.                      {
  2312.                         png_memcpy(dp, v, 6);
  2313.                         dp -= 6;
  2314.                      }
  2315.                      sptr -= 6;
  2316.                   }
  2317.                } /* end of pixel_bytes == 6 */
  2318.                //--------------------------------------------------------------
  2319.                else
  2320.                {
  2321.                   for (i = width; i; i--)
  2322.                   {
  2323.                      png_byte v[8];
  2324.                      int j;
  2325.                      png_memcpy(v, sptr, pixel_bytes);
  2326.                      for (j = 0; j < png_pass_inc[pass]; j++)
  2327.                      {
  2328.                         png_memcpy(dp, v, pixel_bytes);
  2329.                         dp -= pixel_bytes;
  2330.                      }
  2331.                      sptr-= pixel_bytes;
  2332.                   }
  2333.                }
  2334.             } // end of _mmx_supported ========================================
  2335.             else /* MMX not supported:  use modified C code - takes advantage
  2336.                   *   of inlining of png_memcpy for a constant */
  2337.                  /* GRR 19991007:  does it?  or should pixel_bytes in each
  2338.                   *   block be replaced with immediate value (e.g., 1)? */
  2339.                  /* GRR 19991017:  replaced with constants in each case */
  2340. #endif /* PNG_MMX_CODE_SUPPORTED */
  2341.             {
  2342.                if (pixel_bytes == 1)
  2343.                {
  2344.                   for (i = width; i; i--)
  2345.                   {
  2346.                      int j;
  2347.                      for (j = 0; j < png_pass_inc[pass]; j++)
  2348.                      {
  2349.                         *dp-- = *sptr;
  2350.                      }
  2351.                      --sptr;
  2352.                   }
  2353.                }
  2354.                else if (pixel_bytes == 3)
  2355.                {
  2356.                   for (i = width; i; i--)
  2357.                   {
  2358.                      png_byte v[8];
  2359.                      int j;
  2360.                      png_memcpy(v, sptr, 3);
  2361.                      for (j = 0; j < png_pass_inc[pass]; j++)
  2362.                      {
  2363.                         png_memcpy(dp, v, 3);
  2364.                         dp -= 3;
  2365.                      }
  2366.                      sptr -= 3;
  2367.                   }
  2368.                }
  2369.                else if (pixel_bytes == 2)
  2370.                {
  2371.                   for (i = width; i; i--)
  2372.                   {
  2373.                      png_byte v[8];
  2374.                      int j;
  2375.                      png_memcpy(v, sptr, 2);
  2376.                      for (j = 0; j < png_pass_inc[pass]; j++)
  2377.                      {
  2378.                         png_memcpy(dp, v, 2);
  2379.                         dp -= 2;
  2380.                      }
  2381.                      sptr -= 2;
  2382.                   }
  2383.                }
  2384.                else if (pixel_bytes == 4)
  2385.                {
  2386.                   for (i = width; i; i--)
  2387.                   {
  2388.                      png_byte v[8];
  2389.                      int j;
  2390.                      png_memcpy(v, sptr, 4);
  2391.                      for (j = 0; j < png_pass_inc[pass]; j++)
  2392.                      {
  2393. #ifdef PNG_DEBUG
  2394.                         if (dp < row || dp+3 > row+png_ptr->row_buf_size)
  2395.                         {
  2396.                            printf("dp out of bounds: row=%d, dp=%d, rp=%dn",
  2397.                              row, dp, row+png_ptr->row_buf_size);
  2398.                            printf("row_buf=%dn",png_ptr->row_buf_size);
  2399.                         }
  2400. #endif
  2401.                         png_memcpy(dp, v, 4);
  2402.                         dp -= 4;
  2403.                      }
  2404.                      sptr -= 4;
  2405.                   }
  2406.                }
  2407.                else if (pixel_bytes == 6)
  2408.                {
  2409.                   for (i = width; i; i--)
  2410.                   {
  2411.                      png_byte v[8];
  2412.                      int j;
  2413.                      png_memcpy(v, sptr, 6);
  2414.                      for (j = 0; j < png_pass_inc[pass]; j++)
  2415.                      {
  2416.                         png_memcpy(dp, v, 6);
  2417.                         dp -= 6;
  2418.                      }
  2419.                      sptr -= 6;
  2420.                   }