DSP编程

开发平台：
C/C++

ycbcr422pl_to_rgb565_h.asm：源码内容
							;
;  Copyright 2003 by Texas Instruments Incorporated.
;  All rights reserved. Property of Texas Instruments Incorporated.
;  Restricted rights to use, duplicate or disclose this code are
;  granted through contract.
;  
;
; "@(#) DDK 1.10.00.21 06-26-03 (ddk-b10)"
* ========================================================================= *
*                                                                           *
*   USAGE                                                                   *
*       This function is C callable, and is called according to this        *
*       C prototype.  (The header was written for ycbcr422pl_to_rgb565,     *
*       but the symbol this file exports is _yc2rgb16.)                     *
*                                                                           *
*       void yc2rgb16                                                       *
*       (                                                                   *
*           const short         coeff[5],  -- Matrix coefficients.          *
*           const unsigned char *y_data,   -- Luminance data  (Y')          *
*           const unsigned char *cb_data,  -- Blue color-diff (B'-Y')       *
*           const unsigned char *cr_data,  -- Red color-diff  (R'-Y')       *
*           unsigned short      *rgb_data, -- RGB 5:6:5 packed pixel out.   *
*           unsigned            num_pixels -- # of luma pixels to process.  *
*       )                                                                   *
*                                                                           *
*       The 'coeff[]' array contains the color-space-conversion matrix      *
*       coefficients.  The 'y_data', 'cb_data' and 'cr_data' pointers       *
*       point to the separate input image planes.  The 'rgb_data' pointer   *
*       points to the output image buffer, and must be word aligned.        *
*                                                                           *
*       The kernel is designed to process arbitrary amounts of 4:2:2        *
*       image data, although 4:2:0 image data may be processed as well.     *
*       For 4:2:2 input data, the 'y_data', 'cb_data' and 'cr_data'         *
*       arrays may hold an arbitrary amount of image data.  For 4:2:0       *
*       input data, only a single scan-line (or portion thereof) may be     *
*       processed at a time.                                                *
*                                                                           *
*       The coefficients in the coeff array must be in signed Q13 form.     *
*       These coefficients correspond to the following matrix equation:     *
*                                                                           *
*           [ Y' -  16 ]   [ coeff[0] 0.0000   coeff[1] ]     [ R']         *
*           [ Cb - 128 ] * [ coeff[0] coeff[2] coeff[3] ]  =  [ G']         *
*           [ Cr - 128 ]   [ coeff[0] coeff[4] 0.0000   ]     [ B']         *
*                                                                           *
*   DESCRIPTION                                                             *
*       This function runs for 46 + (num_pixels * 3) cycles, including      *
*       6 cycles of function-call overhead.  Interrupts are masked for      *
*       37 + (num_pixels * 3) cycles.  Code size is 512 bytes.              *
*                                                                           *
*       This kernel performs Y'CbCr to RGB conversion.  From the Color      *
*       FAQ, http://home.inforamp.net/~poynton/ColorFAQ.html :              *
*                                                                           *
*           Various scale factors are applied to (B'-Y') and (R'-Y')        *
*           for different applications.  The Y'PbPr scale factors are       *
*           optimized for component analog video.  The Y'CbCr scaling       *
*           is appropriate for component digital video, JPEG and MPEG.      *
*           Kodak's PhotoYCC(tm) uses scale factors optimized for the       *
*           gamut of film colors.  Y'UV scaling is appropriate as an        *
*           intermediate step in the formation of composite NTSC or PAL     *
*           video signals, but is not appropriate when the components       *
*           are kept separate.  Y'UV nomenclature is now used rather        *
*           loosely, and it sometimes denotes any scaling of (B'-Y')        *
*           and (R'-Y').  Y'IQ coding is obsolete.                          *
*                                                                           *
*       This code can perform various flavors of Y'CbCr to RGB conversion   *
*       as long as the offsets on Y, Cb, and Cr are -16, -128, and -128,    *
*       respectively, and the coefficients match the pattern shown.         *
*                                                                           *
*       The kernel implements the following matrix form, which involves 5   *
*       unique coefficients:                                                *
*                                                                           *
*           [ Y' -  16 ]   [ coeff[0] 0.0000   coeff[1] ]     [ R']         *
*           [ Cb - 128 ] * [ coeff[0] coeff[2] coeff[3] ]  =  [ G']         *
*           [ Cr - 128 ]   [ coeff[0] coeff[4] 0.0000   ]     [ B']         *
*                                                                           *
*                                                                           *
*       Below are some common coefficient sets, along with the matrix       *
*       equation that they correspond to.   Coefficients are in signed      *
*       Q13 notation, which gives a suitable balance between precision      *
*       and range.                                                          *
*                                                                           *
*       1.  Y'CbCr -> RGB conversion with RGB levels that correspond to     *
*           the 219-level range of Y'.  Expected ranges are [16..235] for   *
*           Y' and [16..240] for Cb and Cr.                                 *
*                                                                           *
*           coeff[] = { 0x2000, 0x2BDD, -0x0AC5, -0x1658, 0x3770 };         *
*                                                                           *
*           [ Y' -  16 ]   [ 1.0000    0.0000    1.3707 ]     [ R']         *
*           [ Cb - 128 ] * [ 1.0000   -0.3365   -0.6982 ]  =  [ G']         *
*           [ Cr - 128 ]   [ 1.0000    1.7324    0.0000 ]     [ B']         *
*                                                                           *
*       2.  Y'CbCr -> RGB conversion with the 219-level range of Y'         *
*           expanded to fill the full RGB dynamic range.  (The matrix has   *
*           been scaled by 255/219.)  Expected ranges are [16..235] for Y'  *
*           and [16..240] for Cb and Cr.                                    *
*                                                                           *
*           coeff[] = { 0x2543, 0x3313, -0x0C8A, -0x1A04, 0x408D };         *
*                                                                           *
*           [ Y' -  16 ]   [ 1.1644    0.0000    1.5960 ]     [ R']         *
*           [ Cb - 128 ] * [ 1.1644   -0.3918   -0.8130 ]  =  [ G']         *
*           [ Cr - 128 ]   [ 1.1644    2.0172    0.0000 ]     [ B']         *
*                                                                           *
*       Other scalings of the color differences (B'-Y') and (R'-Y')         *
*       (sometimes incorrectly referred to as U and V) are supported, as    *
*       long as the color differences are unsigned values centered around   *
*       128 rather than signed values centered around 0, as noted above.    *
*                                                                           *
*       In addition to performing plain color-space conversion, color       *
*       saturation can be adjusted by scaling coeff[1] through coeff[4].    *
*       Similarly, brightness can be adjusted by scaling coeff[0].          *
*       General hue adjustment can not be performed, however, due to the    *
*       two zeros hard-coded in the matrix.                                 *
*                                                                           *
*   TECHNIQUES                                                              *
*       Pixel replication is performed implicitly on chroma data to         *
*       reduce the total number of multiplies required.  The chroma         *
*       portion of the matrix is calculated once for each Cb, Cr pair,      *
*       and the result is added to both Y' samples.                         *
*                                                                           *
*       Luma is biased downwards to produce R, G, and B values that are     *
*       signed quantities centered around zero, rather than unsigned qtys.  *
*       This allows us to use SSHL to perform saturation, followed by a     *
*       quick XOR to correct the sign bits in the final packed pixels.      *
*       The required downward bias is 128 shifted left by the Q-point, 13.  *
*                                                                           *
*       To save two instructions, I transformed "(y0-16)*luma - (128<<13)"  *
*       to the slightly more cryptic "y0*luma - (16*luma + (128<<13))".     *
*       This gives me the non-obvious but effective y_bias value            *
*       -((128 << 13) + 16*luma).  The transformation allows me to fit in   *
*       a 6 cycle loop.                                                     *
*                                                                           *
*       Twin pointers are used for the stack and coeff[] arrays for speed.  *
*                                                                           *
*       Because the loop accesses four different arrays at three different  *
*       strides, no memory accesses are allowed to parallelize in the       *
*       loop.  No bank conflicts occur, as a result.                        *
*                                                                           *
*       Creatively constructed multiplies are used to avoid a bottleneck    *
*       on shifts in the loop.  In particular, the 5-bit mask 0xF8000000    *
*       doubles as a right-shift constant that happens to negate while      *
*       shifting.  This negation is reversed by merging the bits with a     *
*       SUB instead of an ADD or OR.                                        *
*                                                                           *
*       Prolog and epilog collapsing have been performed, with only a       *
*       partial stage of prolog and epilog left uncollapsed.  The partial   *
*       stages are interscheduled with the rest of the code for speed.      *
*                                                                           *
*       The stack pointer is saved in IRP to allow all 32 registers to      *
*       be used in the loop.  This enabled prolog collapsing by freeing     *
*       up a predicate register.  The prolog collapse counter is            *
*       implemented as a MPY which shifts a constant left by 3 bits each    *
*       iteration.  The counter is initialized from one of the other        *
*       constant registers, thereby reducing the S-unit bottleneck in the   *
*       setup code.                                                         *
*                                                                           *
*       Instructions have been scheduled to minimize fetch-packet padding   *
*       NOPs.  Only 3 padding NOPs and 1 explicit NOP remain.               *
*                                                                           *
*   ASSUMPTIONS                                                             *
*       An even number of luma samples needs to be processed.               *
*                                                                           *
*       The output image must be word aligned.                              *
*                                                                           *
*   NOTES                                                                   *
*       No bank conflicts occur.                                            *
*                                                                           *
*       Codesize is 512 bytes.                                              *
*                                                                           *
*       On average, one bank per cycle is accessed on a C6201 in the loop,  *
*       with 1 cycle of 6 accessing no banks, and 1 cycle accessing two.    *
*                                                                           *
*       The kernel requires 14 words of stack space.                        *
*                                                                           *
*   SOURCE                                                                  *
*       Poynton, Charles et al.  "The Color FAQ,"  1999.                    *
*           http://home.inforamp.net/~poynton/ColorFAQ.html                 *
*                                                                           *
* ------------------------------------------------------------------------- *
*             Copyright (c) 1999 Texas Instruments, Incorporated.           *
*                            All Rights Reserved.                           *
* ========================================================================= *
                .sect ".data:copyright_h"
_Copyright:     .string "Copyright (C) 1999 Texas Instruments Incorporated. "
                .string "All Rights Reserved.",0
                .sect ".text:hand"
                .global _yc2rgb16
; ------------------------------------------------------------------------- ;
;  _yc2rgb16 -- planar Y'CbCr to packed RGB 5:6:5 conversion.
;
;  NOTE: the file header documents the prototype under the name
;  ycbcr422pl_to_rgb565, but the only symbol exported here is _yc2rgb16.
;
;  Arguments on entry (see the ARGUMENTS register map below):
;      A4 = coeff[5]     B4 = y_data      A6 = cb_data
;      B6 = cr_data      A8 = rgb_data    B8 = num_pixels
;
;  Behaviour visible in this routine:
;    - num_pixels is rounded down to an even count (AND with -2).
;    - A10..A15, B10..B14, B3, CSR and IRP are saved in a 14-word stack
;      frame and restored before returning.
;    - GIE is cleared for the duration of the loop.  B15 is reused as the
;      Cb pointer while the real stack pointer is parked in IRP.
;    - Two pixels (one 32-bit word) are written per stored iteration.
;
;  NOTE(review): the last STW in the epilog is not predicated, so a call
;  with num_pixels < 2 still appears to write one word to rgb_data, and
;  the collapsed pipeline reads a few samples past the end of each input
;  plane.  Confirm callers tolerate both.
; ------------------------------------------------------------------------- ;
_yc2rgb16:
; =============== SYMBOLIC REGISTER ASSIGNMENTS: ARGUMENTS ================ ;
        .asg            A4,         A_coef      ; Coefficients table
        .asg            B4,         B_y_data    ; Pointer to luma
        .asg            A6,         A_cb_data   ; Pointer to B-Y
        .asg            B6,         B_cr_data   ; Pointer to R-Y
        .asg            A8,         A_rgb_data  ; Pointer to RGB output
        .asg            B8,         B_num_pix   ; # of pixels to process
; ================= SYMBOLIC REGISTER ASSIGNMENTS: SETUP ================== ;
        .asg            B15,        B_SP        ; Stack pointer, B datapath
        .asg            A3,         A_SP        ; Stack pointer, A datapath
        .asg            B0,         B_csr       ; CSR's value
        .asg            B1,         B_noGIE     ; CSR w/ GIE bit cleared
        .asg            B2,         B_irp       ; IRP's value
        .asg            A0,         A_csr       ; Copy of CSR's value
        .asg            B3,         B_ret       ; Return address
        .asg            B7,         B_coef      ; Twin coefficients ptr.
        .asg            A13,        A_rcr       ; Cr's contribution to Red
        .asg            B14,        B_bcb       ; Cb's contribution to Blu
        .asg            A5,         A_gcr_      ; Cr's contribution to Grn
        .asg            A5,         A_gcr       ; A_gcr_ << 16
        .asg            B5,         B_gcb_      ; Cb's contribution to Grn
        .asg            B5,         B_gcb       ; B_gcb_ << 16
        .asg            A1,         A_lneg      ; luma coeff[0] < 0
; ================= SYMBOLIC REGISTER ASSIGNMENTS: KERNEL ================= ;
; Several symbols below deliberately share one physical register; their
; live ranges are separated by the hand-built schedule.
        .asg            B0,         B_p         ; Prolog collapse counter
        .asg            A2,         A_i         ; Loop trip counter
        .asg            A10,        A_y_ptr     ; Luma data pointer
        .asg            B15,        B_cb_ptr    ; B-Y data pointer (reuses SP reg)
        .asg            B6,         B_cr_ptr    ; R-Y data pointer
        .asg            B11,        B_rgb_ptr   ; RGB output data pointer
        .asg            B12,        B_k32_k128  ; Constant 0x00200080
        .asg            A11,        A_k32_k128  ; Constant 0x00200080
        .asg            A12,        A_one_lum   ; Constant 1 packed w/coeff[0]
        .asg            A13,        A_gcr_rcr   ; coeff[3], coeff[1] packed
        .asg            B14,        B_gcb_bcb   ; coeff[2], coeff[4] packed
        .asg            B10,        B_y_bias    ; (128<<13) + 16*coeff[0]; SUBed
        .asg            B13,        B_ms5       ; Mask:  upper 5 bits
        .asg            A14,        A_ms6       ; Mask:  upper 6 bits
        .asg            A15,        A_sflip     ; Sign-flip const 0x84108410
        .asg            A0,         A_y0        ; y0 value from y_data[]
        .asg            B4,         B_y1        ; y1 value from y_data[]
        .asg            B1,         B_cb_       ; cb value prior to level shift
        .asg            A3,         A_cr_       ; cr value prior to level shift
        .asg            B3,         B_cb        ; level-shifted cb value.
        .asg            A4,         A_cr        ; level-shifted cr value
        .asg            B5,         B_y1t_      ; scaled y1, before level shift
        .asg            A3,         A_y0t_      ; scaled y0, before level shift
        .asg            B9,         B_y1t       ; scaled, level-shifted y1
        .asg            A5,         A_y0t       ; scaled, level-shifted y0
        .asg            B3,         B_bt        ; Scaled blue color-diff
        .asg            B1,         B_gt_       ; Scaled green color-diff (a)
        .asg            A8,         A_gt_       ; Scaled green color-diff (b)
        .asg            A6,         A_gt        ; Scaled green color-diff
        .asg            A9,         A_rt        ; Scaled red color-diff
        .asg            B1,         B_r1        ; Pixel 1 red  (16Q16)
        .asg            B3,         B_g1        ; Pixel 1 grn  (17Q15)
        .asg            B4,         B_b1        ; Pixel 1 blu  (16Q16)
        .asg            A3,         A_r0        ; Pixel 0 red  (16Q16)
        .asg            A5,         A_g0        ; Pixel 0 grn  (17Q15)
        .asg            A0,         A_b0        ; Pixel 0 blu  (16Q16)
        .asg            B5,         B_r1s       ; Saturated pixel 1 red (5Q27)
        .asg            B4,         B_g1s       ; Saturated pixel 1 grn (6Q26)
        .asg            B5,         B_b1s       ; Saturated pixel 1 blu (5Q27)
        .asg            A1,         A_r0s       ; Saturated pixel 0 red (5Q27)
        .asg            A4,         A_g0s       ; Saturated pixel 0 grn (6Q26)
        .asg            A4,         A_b0s       ; Saturated pixel 0 blu (5Q27)
        .asg            B8,         B_r1t       ; Truncated pixel 1 red
        .asg            B7,         B_g1t       ; Truncated pixel 1 grn
        .asg            B2,         B_b1t       ; Truncated pixel 1 blu
        .asg            A7,         A_r0t       ; Truncated pixel 0 red
        .asg            A4,         A_g0t       ; Truncated pixel 0 grn
        .asg            A5,         A_b0t       ; Truncated pixel 0 blu
        .asg            B2,         B_g1f       ; Pixel 1 grn in final position
        .asg            B1,         B_b1f       ; Pixel 1 blu in final position
        .asg            B8,         B_r_b1      ; Pixel 1 red, blue merged
        .asg            B4,         B_rgb1      ; Pixel 1 red, grn, blu merged
        .asg            A3,         A_g0f       ; Pixel 0 grn in final position
        .asg            A6,         A_b0f       ; Pixel 0 blu in final position
        .asg            A7,         A_r_b0      ; Pixel 0 red, blue merged
        .asg            A9,         A_rgb0_     ; Pixel 0 red, grn, blu merged
        .asg            A6,         A_rgb0      ; Pixel 0 in low half word
        .asg            B5,         B_rgb_      ; Combined pixels pre-sign-fix
        .asg            B7,         B_rgb       ; Combined pixels w/ sign-fix
; ========================================================================= ;
        ; Stack frame.  14 words:  A10..A15, B10..B14, B3, CSR, IRP
        ; Setup: build the frame, disable interrupts, load the coefficients
        ; and build the loop constants.  Register saves are interleaved with
        ; the constant setup; the ";-" marks fall every eight instructions.
;-
        STW     .D2T1   A15,        *B_SP--[14]         ; Save A15, get stack
||      MVC     .S2     CSR,        B_csr               ; Capture CSR's state
||      MV      .L2X    A_coef,     B_coef              ; Twin coef pointer
||      MVK     .S1     0xFFFF8410, A_sflip             ; Sign-flip cst, low
        MV      .S1X    B_SP,       A_SP                ; Twin Stack Pointer
||      AND     .L2     B_csr,      -2,         B_noGIE ; Clear GIE
||      LDHU    .D1T2   *A_coef[2], B_gcb_              ; gcb = coeff[2]
||      LDHU    .D2T1   *B_coef[3], A_gcr_              ; gcr = coeff[3]
;-
        STW     .D1T1   A14,        *+A_SP[12]          ; Save A14
||      STW     .D2T2   B14,        *+B_SP[11]          ; Save B14
||      MVC     .S2     B_noGIE,    CSR                 ; Disable interrupts
||      ZERO    .L1     A_ms6                           ; Mask 6, low
; ===== Interrupts masked here =====
        STW     .D1T1   A13,        *+A_SP[10]          ; Save A13
||      STW     .D2T2   B13,        *+B_SP[ 9]          ; Save B13
||      MVC     .S2     IRP,        B_irp               ; Capture IRP's state
||      ZERO    .L2     B_ms5                           ; Mask 5, low
;-
        STW     .D1T1   A12,        *+A_SP[ 8]          ; Save A12
||      STW     .D2T2   B12,        *+B_SP[ 7]          ; Save B12
||      MVC     .S2     B_SP,       IRP                 ; Save SP in IRP
||      MVKLH   .S1     0xFC00,     A_ms6               ; Mask 6, high
        LDH     .D1T1   *A_coef[0], A_one_lum           ; lum = coeff[0]
||      MV      .L1X    B_csr,      A_csr               ; Partitioning MV
        STW     .D1T1   A11,        *+A_SP[ 6]          ; Save A11
||      STW     .D2T2   B11,        *+B_SP[ 5]          ; Save B11
;-
        LDHU    .D2T1   *B_coef[1], A_rcr               ; rcr = coeff[1]
||      LDHU    .D1T2   *A_coef[4], B_bcb               ; bcb = coeff[4]
        STW     .D1T1   A10,        *+A_SP[ 4]          ; Save A10
||      STW     .D2T2   B10,        *+B_SP[ 3]          ; Save B10
||      MV      .L1X    B_y_data,   A_y_ptr             ; Partitioning MV
        STW     .D1T1   A_csr,      *+A_SP[ 2]          ; Save CSR
||      STW     .D2T2   B_ret,      *+B_SP[ 1]          ; Save return address
||      MVK     .S2     128,        B_k32_k128          ; Constant: 128
;-
; =========================== PIPE LOOP PROLOG ============================ ;
; Only the first-iteration loads remain here; the rest of the prolog is
; collapsed into the kernel and gated by B_p.  From this point B15 holds the
; Cb pointer, not the stack pointer.
        LDBU    .D2T1   *B_cr_ptr++,            A_cr_   ;[ 1,1] cr = *cr_ptr++
||      AND     .L1X    B_num_pix,  -2,         A_i     ; Make num_pix even
||      MV      .L2X    A_cb_data,  B_cb_ptr            ; Partitioning MV
||      MVKLH   .S1     1,          A_one_lum           ; Constant: 1
||      MVKLH   .S2     32,         B_k32_k128          ; Constant: 32
||      MPY     .M2     B_k32_k128, 1,          B_p     ; Prolog collapse count
||      MPYH    .M1     A_one_lum,  A_one_lum,  A_lneg  ; lneg = coeff[0] < 0
;-
        LDBU    .D1T1   *A_y_ptr++[2],          A_y0    ;[ 2,1] y0 = *y_ptr, +=2
||      SHL     .S2X    A_one_lum,  4,          B_y_bias; ((128<<13)+16*luma)
||      MVKH    .S1     0x84108410, A_sflip             ; Sign-flip cst, high
        LDBU    .D2T2   *B_cb_ptr++,            B_cb_   ;[ 3,1] cb = *cb_ptr++
||      ADD     .D1     A_i,        2,          A_i     ; Adjust for para iter
||      SHL     .S1     A_lneg,     20,         A_lneg  ; Handle luma < 0
||      MV      .L1X    B_k32_k128, A_k32_k128          ; Twin constant reg.
||      MV      .L2X    A_rgb_data, B_rgb_ptr           ; Partitioning MV
;-
        LDBU    .D1T2   *-A_y_ptr[1],           B_y1    ;[ 4,1] y1 = y_ptr[-1]
||      SHL     .S1     A_gcr_,     16,         A_gcr   ; Put gcr in high half
||      SHL     .S2     B_gcb_,     16,         B_gcb   ; Put gcb in high half
||      SUB     .L2X    B_y_bias,   A_lneg,     B_y_bias; Sign bit, coeff[0]<0
        STW     .D1T2   B_irp,      *+A_SP[13]          ; Save IRP
||      ADD     .L1     A_gcr,      A_rcr,      A_gcr_rcr ; Merge gcr, rcr
||      ADD     .L2     B_gcb,      B_bcb,      B_gcb_bcb ; Merge gcb, bcb
||      MVKLH   .S2     0xF800,     B_ms5               ; Mask 5, high
;-
; =========================== PIPE LOOP KERNEL ============================ ;
; One pass through conv_loop is six execute packets and carries work for
; five overlapped pixel pairs.  The [c,i] tag in each comment appears to
; give the schedule cycle c and the in-flight iteration i.
;
; The branch sits in the first packet; the remaining five packets run
; before it takes effect.  A_i is decremented by 2 in the last packet.
;
; B_p is the prolog-collapse counter.  It starts at 128 and its low
; halfword is multiplied by 8 in every pass (128, 1024, 8192, 65536, 0).
; The STW in the last packet is predicated on !B_p, so nothing is written
; to rgb_data until B_p has reached zero.
conv_loop:
  [ A_i]B       .S1     conv_loop                       ;[24,1] while (i)
||      ADD     .L2X    B_rgb1,     A_rgb0,     B_rgb_  ;[24,1] merge pix 0, 1
||      MPYHUS  .M1X    A_g0t,      B_ms5,      A_g0f   ;[18,2] >> 5 and negate
||      SSHL    .S2     B_g1,       11,         B_g1s   ;[18,2] g1s = sat(g1)
||      ADD     .D1     A_y0t,      A_rt,       A_r0    ;[12,3] r0  = y0t + rt
||      SUB     .D2     B_y1t_,     B_y_bias,   B_y1t   ;[12,3] y1t-= y_bias
||      SUB     .L1     A_cr_,      A_k32_k128, A_cr    ;[ 6,4] cr -= 128
||      MPYUS   .M2     B_p,        8,          B_p     ; prolog collapse count
;-
        ADD     .D1     A_r0t,      A_b0f,      A_r_b0  ;[19,2] Merge r0, b0
||      MPYHU   .M2     B_b1t,      B_k32_k128, B_b1f   ;[19,2] >> 11
||      AND     .S2X    B_g1s,      A_ms6,      B_g1t   ;[19,2] g1t = g1s & ms6
||      SSHL    .S1     A_r0,       11,         A_r0s   ;[13,3] r0s = sat(r0)
||      ADD     .L2     B_y1t,      B_bt,       B_b1    ;[13,3] b1  = y1t + bt
||      ADD     .L1X    B_gt_,      A_gt_,      A_gt    ;[13,3]gt=gcr*cr+gcb*cb
||      MPYLH   .M1     A_cr,       A_gcr_rcr,  A_gt_   ;[ 7,4] gcr * cr
||      LDBU    .D2T1   *B_cr_ptr++,            A_cr_   ;[ 1,5] cr  = *cr_ptr++
;-
        XOR     .L2X    B_rgb_,     A_sflip,    B_rgb   ;[26,1] Fix sign bits
||      MPYHUS  .M2     B_g1t,      B_ms5,      B_g1f   ;[20,2] >> 5 and negate
||      SSHL    .S2     B_b1,       11,         B_b1s   ;[14,3] b1s = sat(b1)
||      ADD     .L1X    A_y0t,      B_bt,       A_b0    ;[14,3] b0  = y0t + bt
||      ADD     .S1     A_y0t,      A_gt,       A_g0    ;[14,3] g0  = y0t + gt
||      MPY     .M1     A_y0,       A_one_lum,  A_y0t_  ;[ 8,4] y0t = y0 * luma
||      SUB     .D2     B_cb_,      B_k32_k128, B_cb    ;[ 8,4] cb -= 128
||      LDBU    .D1T1   *A_y_ptr++[2],          A_y0    ;[ 2,5] y0 = *y_ptr, +=2
;-
        SUB     .D1     A_r_b0,     A_g0f,      A_rgb0_ ;[21,2] merge r0,g0,b0
||      ADD     .L2     B_r1t,      B_b1f,      B_r_b1  ;[21,2] merge r1, b1
||      AND     .L1X    A_r0s,      B_ms5,      A_r0t   ;[15,3] r0t = r0s & ms5
||      SSHL    .S1     A_b0,       11,         A_b0s   ;[15,3] b0s = sat(b0)
||      ADD     .S2X    B_y1t,      A_rt,       B_r1    ;[15,3] r1  = y1t + rt
||      MPY     .M1     A_cr,       A_gcr_rcr,  A_rt    ;[ 9,4] rt  = rcr * cr
||      MPYLH   .M2     B_cb,       B_gcb_bcb,  B_gt_   ;[ 9,4] gcb * cb
||      LDBU    .D2T2   *B_cb_ptr++,            B_cb_   ;[ 3,5] cb  = *cb_ptr++
;-
        MPYHU   .M1     A_rgb0_,    A_one_lum,  A_rgb0  ;[22,2] rgb0 in lo half
||      SUB     .D2     B_r_b1,     B_g1f,      B_rgb1  ;[22,2] merge r1,g1,b1
||      AND     .L1X    A_b0s,      B_ms5,      A_b0t   ;[16,3] b0t = b0s & ms5
||      AND     .L2     B_b1s,      B_ms5,      B_b1t   ;[16,3] b1t = b1s & ms5
||      SSHL    .S1     A_g0,       11,         A_g0s   ;[16,3] g0s = sat(g0)
||      SSHL    .S2     B_r1,       11,         B_r1s   ;[16,3] r1s = sat(r1)
||      MPY     .M2X    B_y1,       A_one_lum,  B_y1t_  ;[10,4] y1t = y1 * luma
||      LDBU    .D1T2   *-A_y_ptr[1],           B_y1    ;[ 4,5] y1 = y_ptr[-1]
;-
  [!B_p]STW     .D2T2   B_rgb,      *B_rgb_ptr++        ;[29,1] *rgb_ptr++=rgb
||      SUB     .D1     A_i,        2,          A_i     ;[23,2] i -= 2
||      MPYHU   .M1     A_b0t,      A_k32_k128, A_b0f   ;[17,3] >> 11
||      AND     .L1     A_g0s,      A_ms6,      A_g0t   ;[17,3] g0t = g0s & ms6
||      AND     .L2     B_r1s,      B_ms5,      B_r1t   ;[17,3] r1t = r1s & ms5
||      ADD     .S2X    B_y1t,      A_gt,       B_g1    ;[17,3] g1  = y1t + gt
||      MPY     .M2     B_cb,       B_gcb_bcb,  B_bt    ;[11,4] bt  = bcb * cb
||      SUB     .S1X    A_y0t_,     B_y_bias,   A_y0t   ;[11,4] y0t-= y_bias
; =========================== PIPE LOOP EPILOG ============================ ;
; Only the tail of one iteration (merge, sign-fix, store) is finished here,
; interleaved with the register restores.  Work already started for later
; iterations is simply discarded.
; ================ SYMBOLIC REGISTER ASSIGNMENTS: CLEANUP ================= ;
        .asg            B15,        B_SP                ; Stack ptr, B side
        .asg            A3,         A_SP                ; Stack ptr, A side
        .asg            A0,         A_csr               ; CSR value
        .asg            B0,         B_irp               ; IRP value
        .asg            B3,         B_ret               ; Return address
; ========================================================================= ;
;-
        MVC     .S2     IRP,        B_SP                ; Restore stack ptr
||      ADD     .L2X    B_rgb1,     A_rgb0,     B_rgb_  ;[24,5] merge pix 0, 1
        MV      .L1X    B_SP,       A_SP                ; Twin Stack Pointer
||      LDW     .D2T2   *+B_SP[13], B_irp               ; Get IRP's value
        LDW     .D1T2   *+A_SP[ 1], B_ret               ; Get return address
||      LDW     .D2T1   *+B_SP[ 2], A_csr               ; Get CSR's value
        LDW     .D1T2   *+A_SP[ 3], B10                 ; Restore B10
||      LDW     .D2T1   *+B_SP[ 4], A10                 ; Restore A10
;-
        LDW     .D1T2   *+A_SP[11], B14                 ; Restore B14
||      LDW     .D2T1   *+B_SP[12], A14                 ; Restore A14
||      XOR     .L2X    B_rgb_,     A_sflip,    B_rgb   ;[26,5] fix sign bits
        LDW     .D1T2   *+A_SP[ 7], B12                 ; Restore B12
||      LDW     .D2T1   *+B_SP[ 8], A12                 ; Restore A12
        LDW     .D1T2   *+A_SP[ 9], B13                 ; Restore B13
||      LDW     .D2T1   *+B_SP[10], A13                 ; Restore A13
||      MVC     .S2     B_irp,      IRP                 ; Restore IRP
;-
        LDW     .D1T2   *+A_SP[ 5], B11                 ; Restore B11
||      LDW     .D2T1   *+B_SP[ 6], A11                 ; Restore A11
||      B       .S2     B_ret                           ; Return to caller
        ; The instructions below run before the return branch takes effect.
        MVC     .S2X    A_csr,      CSR                 ; Restore CSR
||      LDW     .D2T1   *++B_SP[14],A15                 ; Restore A15
; ===== Interruptibility state (GIE) restored here =====
        ; NOTE(review): this final store is unconditional (no [!B_p] guard).
        STW     .D2T2   B_rgb,      *B_rgb_ptr          ;[29,5] *rgb_ptr++=rgb
        NOP             3
; ===== Branch occurs =====
; ===== Interrupts may occur here =====
* ========================================================================= *
*   End of file:  ycbcr422pl_to_rgb565_h.asm                                *
* ------------------------------------------------------------------------- *
*             Copyright (c) 1999 Texas Instruments, Incorporated.           *
*                            All Rights Reserved.                           *
* ========================================================================= *