/*
 * Windows CE
 *
 * Development platform:
 * C/C++
 *
 * dsputil_iwmmxt_rnd.h: source contents
 */
							void DEF(put, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
{
    int stride = line_size;
    __asm__ __volatile__ (
        "and r12, %[pixels], #7 nt"
        "bic %[pixels], %[pixels], #7 nt"
        "tmcr wcgr1, r12 nt"
        "add r4, %[pixels], %[line_size] nt"
        "add r5, %[block], %[line_size] nt"
        "mov %[line_size], %[line_size], lsl #1 nt"
        "1: nt"
        "wldrd wr0, [%[pixels]] nt"
        "subs %[h], %[h], #2 nt"
        "wldrd wr1, [%[pixels], #8] nt"
        "add %[pixels], %[pixels], %[line_size] nt"
        "wldrd wr3, [r4] nt"
        "pld [%[pixels]] nt"
        "pld [%[pixels], #32] nt"
        "wldrd wr4, [r4, #8] nt"
        "add r4, r4, %[line_size] nt"
        "walignr1 wr8, wr0, wr1 nt"
        "pld [r4] nt"
        "pld [r4, #32] nt"
        "walignr1 wr10, wr3, wr4 nt"
        "wstrd wr8, [%[block]] nt"
        "add %[block], %[block], %[line_size] nt"
        "wstrd wr10, [r5] nt"
        "add r5, r5, %[line_size] nt"
        "bne 1b nt"
        : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
        :
        : "memory", "r4", "r5", "r12");
}
void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
{
    int stride = line_size;
    __asm__ __volatile__ (
        "and r12, %[pixels], #7 nt"
        "bic %[pixels], %[pixels], #7 nt"
        "tmcr wcgr1, r12 nt"
        "add r4, %[pixels], %[line_size] nt"
        "add r5, %[block], %[line_size] nt"
        "mov %[line_size], %[line_size], lsl #1 nt"
        "1: nt"
        "wldrd wr0, [%[pixels]] nt"
        "subs %[h], %[h], #2 nt"
        "wldrd wr1, [%[pixels], #8] nt"
        "add %[pixels], %[pixels], %[line_size] nt"
        "wldrd wr3, [r4] nt"
        "pld [%[pixels]] nt"
        "pld [%[pixels], #32] nt"
        "wldrd wr4, [r4, #8] nt"
        "add r4, r4, %[line_size] nt"
        "walignr1 wr8, wr0, wr1 nt"
        "wldrd wr0, [%[block]] nt"
        "wldrd wr2, [r5] nt"
        "pld [r4] nt"
        "pld [r4, #32] nt"
        "walignr1 wr10, wr3, wr4 nt"
        WAVG2B" wr8, wr8, wr0 nt"
        WAVG2B" wr10, wr10, wr2 nt"
        "wstrd wr8, [%[block]] nt"
        "add %[block], %[block], %[line_size] nt"
        "wstrd wr10, [r5] nt"
        "pld [%[block]] nt"
        "pld [%[block], #32] nt"
        "add r5, r5, %[line_size] nt"
        "pld [r5] nt"
        "pld [r5, #32] nt"
        "bne 1b nt"
        : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
        :
        : "memory", "r4", "r5", "r12");
}
void DEF(put, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
{
    int stride = line_size;
    __asm__ __volatile__ (
        "and r12, %[pixels], #7 nt"
        "bic %[pixels], %[pixels], #7 nt"
        "tmcr wcgr1, r12 nt"
        "add r4, %[pixels], %[line_size] nt"
        "add r5, %[block], %[line_size] nt"
        "mov %[line_size], %[line_size], lsl #1 nt"
        "1: nt"
        "wldrd wr0, [%[pixels]] nt"
        "wldrd wr1, [%[pixels], #8] nt"
        "subs %[h], %[h], #2 nt"
        "wldrd wr2, [%[pixels], #16] nt"
        "add %[pixels], %[pixels], %[line_size] nt"
        "wldrd wr3, [r4] nt"
        "pld [%[pixels]] nt"
        "pld [%[pixels], #32] nt"
        "walignr1 wr8, wr0, wr1 nt"
        "wldrd wr4, [r4, #8] nt"
        "walignr1 wr9, wr1, wr2 nt"
        "wldrd wr5, [r4, #16] nt"
        "add r4, r4, %[line_size] nt"
        "pld [r4] nt"
        "pld [r4, #32] nt"
        "walignr1 wr10, wr3, wr4 nt"
        "wstrd wr8, [%[block]] nt"
        "walignr1 wr11, wr4, wr5 nt"
        "wstrd wr9, [%[block], #8] nt"
        "add %[block], %[block], %[line_size] nt"
        "wstrd wr10, [r5] nt"
        "wstrd wr11, [r5, #8] nt"
        "add r5, r5, %[line_size] nt"
        "bne 1b nt"
        : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
        :
        : "memory", "r4", "r5", "r12");
}
void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
{
    int stride = line_size;
    __asm__ __volatile__ (
        "pld [%[pixels]]                nt"
        "pld [%[pixels], #32]           nt"
        "pld [%[block]]                 nt"
        "pld [%[block], #32]            nt"
        "and r12, %[pixels], #7         nt"
        "bic %[pixels], %[pixels], #7   nt"
        "tmcr wcgr1, r12                nt"
        "add r4, %[pixels], %[line_size]nt"
        "add r5, %[block], %[line_size] nt"
        "mov %[line_size], %[line_size], lsl #1 nt"
        "1:                             nt"
        "wldrd wr0, [%[pixels]]         nt"
        "wldrd wr1, [%[pixels], #8]     nt"
        "subs %[h], %[h], #2            nt"
        "wldrd wr2, [%[pixels], #16]    nt"
        "add %[pixels], %[pixels], %[line_size] nt"
        "wldrd wr3, [r4]                nt"
        "pld [%[pixels]]                nt"
        "pld [%[pixels], #32]           nt"
        "walignr1 wr8, wr0, wr1         nt"
        "wldrd wr4, [r4, #8]            nt"
        "walignr1 wr9, wr1, wr2         nt"
        "wldrd wr5, [r4, #16]           nt"
        "add r4, r4, %[line_size]       nt"
        "wldrd wr0, [%[block]]          nt"
        "pld [r4]                       nt"
        "wldrd wr1, [%[block], #8]      nt"
        "pld [r4, #32]                  nt"
        "wldrd wr2, [r5]                nt"
        "walignr1 wr10, wr3, wr4        nt"
        "wldrd wr3, [r5, #8]            nt"
        WAVG2B" wr8, wr8, wr0           nt"
        WAVG2B" wr9, wr9, wr1           nt"
        WAVG2B" wr10, wr10, wr2         nt"
        "wstrd wr8, [%[block]]          nt"
        "walignr1 wr11, wr4, wr5        nt"
        WAVG2B" wr11, wr11, wr3         nt"
        "wstrd wr9, [%[block], #8]      nt"
        "add %[block], %[block], %[line_size] nt"
        "wstrd wr10, [r5]               nt"
        "pld [%[block]]                 nt"
        "pld [%[block], #32]            nt"
        "wstrd wr11, [r5, #8]           nt"
        "add r5, r5, %[line_size]       nt"
        "pld [r5]                       nt"
        "pld [r5, #32]                  nt"
        "bne 1b nt"
        : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
        :
        : "memory", "r4", "r5", "r12");
}
void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
{
    int stride = line_size;
    // [wr0 wr1 wr2 wr3] for previous line
    // [wr4 wr5 wr6 wr7] for current line
    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
    __asm__ __volatile__(
        "pld [%[pixels]]                nt"
        "pld [%[pixels], #32]           nt"
        "and r12, %[pixels], #7         nt"
        "bic %[pixels], %[pixels], #7   nt"
        "tmcr wcgr1, r12                nt"
        "add r12, r12, #1               nt"
        "add r4, %[pixels], %[line_size]nt"
        "tmcr wcgr2, r12                nt"
        "add r5, %[block], %[line_size] nt"
        "mov %[line_size], %[line_size], lsl #1 nt"
        "1:                             nt"
        "wldrd wr10, [%[pixels]]        nt"
        "cmp r12, #8                    nt"
        "wldrd wr11, [%[pixels], #8]    nt"
        "add %[pixels], %[pixels], %[line_size] nt"
        "wldrd wr13, [r4]               nt"
        "pld [%[pixels]]                nt"
        "wldrd wr14, [r4, #8]           nt"
        "pld [%[pixels], #32]           nt"
        "add r4, r4, %[line_size]       nt"
        "walignr1 wr0, wr10, wr11       nt"
        "pld [r4]                       nt"
        "pld [r4, #32]                  nt"
        "walignr1 wr2, wr13, wr14       nt"
        "wmoveq wr4, wr11               nt"
        "wmoveq wr6, wr14               nt"
        "walignr2ne wr4, wr10, wr11     nt"
        "walignr2ne wr6, wr13, wr14     nt"
        WAVG2B" wr0, wr0, wr4           nt"
        WAVG2B" wr2, wr2, wr6           nt"
        "wstrd wr0, [%[block]]          nt"
        "subs %[h], %[h], #2            nt"
        "wstrd wr2, [r5]                nt"
        "add %[block], %[block], %[line_size]   nt"
        "add r5, r5, %[line_size]       nt"
        "bne 1b                         nt"
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
        :
        : "r4", "r5", "r12", "memory");
}
void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
{
    int stride = line_size;
    // [wr0 wr1 wr2 wr3] for previous line
    // [wr4 wr5 wr6 wr7] for current line
    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
    __asm__ __volatile__(
        "pld [%[pixels]]                nt"
        "pld [%[pixels], #32]           nt"
        "and r12, %[pixels], #7         nt"
        "bic %[pixels], %[pixels], #7   nt"
        "tmcr wcgr1, r12                nt"
        "add r12, r12, #1               nt"
        "add r4, %[pixels], %[line_size]nt"
        "tmcr wcgr2, r12                nt"
        "add r5, %[block], %[line_size] nt"
        "mov %[line_size], %[line_size], lsl #1 nt"
        "1:                             nt"
        "wldrd wr10, [%[pixels]]        nt"
        "cmp r12, #8                    nt"
        "wldrd wr11, [%[pixels], #8]    nt"
        "wldrd wr12, [%[pixels], #16]   nt"
        "add %[pixels], %[pixels], %[line_size] nt"
        "wldrd wr13, [r4]               nt"
        "pld [%[pixels]]                nt"
        "wldrd wr14, [r4, #8]           nt"
        "pld [%[pixels], #32]           nt"
        "wldrd wr15, [r4, #16]          nt"
        "add r4, r4, %[line_size]       nt"
        "walignr1 wr0, wr10, wr11       nt"
        "pld [r4]                       nt"
        "pld [r4, #32]                  nt"
        "walignr1 wr1, wr11, wr12       nt"
        "walignr1 wr2, wr13, wr14       nt"
        "walignr1 wr3, wr14, wr15       nt"
        "wmoveq wr4, wr11               nt"
        "wmoveq wr5, wr12               nt"
        "wmoveq wr6, wr14               nt"
        "wmoveq wr7, wr15               nt"
        "walignr2ne wr4, wr10, wr11     nt"
        "walignr2ne wr5, wr11, wr12     nt"
        "walignr2ne wr6, wr13, wr14     nt"
        "walignr2ne wr7, wr14, wr15     nt"
        WAVG2B" wr0, wr0, wr4           nt"
        WAVG2B" wr1, wr1, wr5           nt"
        "wstrd wr0, [%[block]]          nt"
        WAVG2B" wr2, wr2, wr6           nt"
        "wstrd wr1, [%[block], #8]      nt"
        WAVG2B" wr3, wr3, wr7           nt"
        "add %[block], %[block], %[line_size]   nt"
        "wstrd wr2, [r5]                nt"
        "subs %[h], %[h], #2            nt"
        "wstrd wr3, [r5, #8]            nt"
        "add r5, r5, %[line_size]       nt"
        "bne 1b                         nt"
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
        :
        : "r4", "r5", "r12", "memory");
}
void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
{
    int stride = line_size;
    // [wr0 wr1 wr2 wr3] for previous line
    // [wr4 wr5 wr6 wr7] for current line
    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
    __asm__ __volatile__(
        "pld [%[pixels]]                nt"
        "pld [%[pixels], #32]           nt"
        "pld [%[block]]                 nt"
        "pld [%[block], #32]            nt"
        "and r12, %[pixels], #7         nt"
        "bic %[pixels], %[pixels], #7   nt"
        "tmcr wcgr1, r12                nt"
        "add r12, r12, #1               nt"
        "add r4, %[pixels], %[line_size]nt"
        "tmcr wcgr2, r12                nt"
        "add r5, %[block], %[line_size] nt"
        "mov %[line_size], %[line_size], lsl #1 nt"
        "pld [r5]                       nt"
        "pld [r5, #32]                  nt"
        "1:                             nt"
        "wldrd wr10, [%[pixels]]        nt"
        "cmp r12, #8                    nt"
        "wldrd wr11, [%[pixels], #8]    nt"
        "add %[pixels], %[pixels], %[line_size] nt"
        "wldrd wr13, [r4]               nt"
        "pld [%[pixels]]                nt"
        "wldrd wr14, [r4, #8]           nt"
        "pld [%[pixels], #32]           nt"
        "add r4, r4, %[line_size]       nt"
        "walignr1 wr0, wr10, wr11       nt"
        "pld [r4]                       nt"
        "pld [r4, #32]                  nt"
        "walignr1 wr2, wr13, wr14       nt"
        "wmoveq wr4, wr11               nt"
        "wmoveq wr6, wr14               nt"
        "walignr2ne wr4, wr10, wr11     nt"
        "wldrd wr10, [%[block]]         nt"
        "walignr2ne wr6, wr13, wr14     nt"
        "wldrd wr12, [r5]               nt"
        WAVG2B" wr0, wr0, wr4           nt"
        WAVG2B" wr2, wr2, wr6           nt"
        WAVG2B" wr0, wr0, wr10          nt"
        WAVG2B" wr2, wr2, wr12          nt"
        "wstrd wr0, [%[block]]          nt"
        "subs %[h], %[h], #2            nt"
        "wstrd wr2, [r5]                nt"
        "add %[block], %[block], %[line_size]   nt"
        "add r5, r5, %[line_size]       nt"
        "pld [%[block]]                 nt"
        "pld [%[block], #32]            nt"
        "pld [r5]                       nt"
        "pld [r5, #32]                  nt"
        "bne 1b                         nt"
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
        :
        : "r4", "r5", "r12", "memory");
}
void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
{
    int stride = line_size;
    // [wr0 wr1 wr2 wr3] for previous line
    // [wr4 wr5 wr6 wr7] for current line
    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
    __asm__ __volatile__(
        "pld [%[pixels]]                nt"
        "pld [%[pixels], #32]           nt"
        "pld [%[block]]                 nt"
        "pld [%[block], #32]            nt"
        "and r12, %[pixels], #7         nt"
        "bic %[pixels], %[pixels], #7   nt"
        "tmcr wcgr1, r12                nt"
        "add r12, r12, #1               nt"
        "add r4, %[pixels], %[line_size]nt"
        "tmcr wcgr2, r12                nt"
        "add r5, %[block], %[line_size] nt"
        "mov %[line_size], %[line_size], lsl #1 nt"
        "pld [r5]                       nt"
        "pld [r5, #32]                  nt"
        "1:                             nt"
        "wldrd wr10, [%[pixels]]        nt"
        "cmp r12, #8                    nt"
        "wldrd wr11, [%[pixels], #8]    nt"
        "wldrd wr12, [%[pixels], #16]   nt"
        "add %[pixels], %[pixels], %[line_size] nt"
        "wldrd wr13, [r4]               nt"
        "pld [%[pixels]]                nt"
        "wldrd wr14, [r4, #8]           nt"
        "pld [%[pixels], #32]           nt"
        "wldrd wr15, [r4, #16]          nt"
        "add r4, r4, %[line_size]       nt"
        "walignr1 wr0, wr10, wr11       nt"
        "pld [r4]                       nt"
        "pld [r4, #32]                  nt"
        "walignr1 wr1, wr11, wr12       nt"
        "walignr1 wr2, wr13, wr14       nt"
        "walignr1 wr3, wr14, wr15       nt"
        "wmoveq wr4, wr11               nt"
        "wmoveq wr5, wr12               nt"
        "wmoveq wr6, wr14               nt"
        "wmoveq wr7, wr15               nt"
        "walignr2ne wr4, wr10, wr11     nt"
        "walignr2ne wr5, wr11, wr12     nt"
        "walignr2ne wr6, wr13, wr14     nt"
        "walignr2ne wr7, wr14, wr15     nt"
        "wldrd wr10, [%[block]]         nt"
        WAVG2B" wr0, wr0, wr4           nt"
        "wldrd wr11, [%[block], #8]     nt"
        WAVG2B" wr1, wr1, wr5           nt"
        "wldrd wr12, [r5]               nt"
        WAVG2B" wr2, wr2, wr6           nt"
        "wldrd wr13, [r5, #8]           nt"
        WAVG2B" wr3, wr3, wr7           nt"
        WAVG2B" wr0, wr0, wr10          nt"
        WAVG2B" wr1, wr1, wr11          nt"
        WAVG2B" wr2, wr2, wr12          nt"
        WAVG2B" wr3, wr3, wr13          nt"
        "wstrd wr0, [%[block]]          nt"
        "subs %[h], %[h], #2            nt"
        "wstrd wr1, [%[block], #8]      nt"
        "add %[block], %[block], %[line_size]   nt"
        "wstrd wr2, [r5]                nt"
        "pld [%[block]]                 nt"
        "wstrd wr3, [r5, #8]            nt"
        "add r5, r5, %[line_size]       nt"
        "pld [%[block], #32]            nt"
        "pld [r5]                       nt"
        "pld [r5, #32]                  nt"
        "bne 1b                         nt"
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
        :
        :"r4", "r5", "r12", "memory");
}
void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
{
    int stride = line_size;
    // [wr0 wr1 wr2 wr3] for previous line
    // [wr4 wr5 wr6 wr7] for current line
    __asm__ __volatile__(
        "pld            [%[pixels]]                             nt"
        "pld            [%[pixels], #32]                        nt"
        "and            r12, %[pixels], #7                      nt"
        "tmcr           wcgr1, r12                              nt"
        "bic            %[pixels], %[pixels], #7                nt"
        "wldrd          wr10, [%[pixels]]                       nt"
        "wldrd          wr11, [%[pixels], #8]                   nt"
        "pld            [%[block]]                              nt"
        "add            %[pixels], %[pixels], %[line_size]      nt"
        "walignr1       wr0, wr10, wr11                         nt"
        "pld            [%[pixels]]                             nt"
        "pld            [%[pixels], #32]                        nt"
      "1:                                                       nt"
        "wldrd          wr10, [%[pixels]]                       nt"
        "wldrd          wr11, [%[pixels], #8]                   nt"
        "add            %[pixels], %[pixels], %[line_size]      nt"
        "pld            [%[pixels]]                             nt"
        "pld            [%[pixels], #32]                        nt"
        "walignr1       wr4, wr10, wr11                         nt"
        "wldrd          wr10, [%[block]]                        nt"
         WAVG2B"        wr8, wr0, wr4                           nt"
         WAVG2B"        wr8, wr8, wr10                          nt"
        "wstrd          wr8, [%[block]]                         nt"
        "add            %[block], %[block], %[line_size]        nt"
        "wldrd          wr10, [%[pixels]]                       nt"
        "wldrd          wr11, [%[pixels], #8]                   nt"
        "pld            [%[block]]                              nt"
        "add            %[pixels], %[pixels], %[line_size]      nt"
        "pld            [%[pixels]]                             nt"
        "pld            [%[pixels], #32]                        nt"
        "walignr1       wr0, wr10, wr11                         nt"
        "wldrd          wr10, [%[block]]                        nt"
         WAVG2B"        wr8, wr0, wr4                           nt"
         WAVG2B"        wr8, wr8, wr10                          nt"
        "wstrd          wr8, [%[block]]                         nt"
        "add            %[block], %[block], %[line_size]        nt"
        "subs           %[h], %[h], #2                          nt"
        "pld            [%[block]]                              nt"
        "bne            1b                                      nt"
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
        :
        : "cc", "memory", "r12");
}
void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
{
    int stride = line_size;
    // [wr0 wr1 wr2 wr3] for previous line
    // [wr4 wr5 wr6 wr7] for current line
    __asm__ __volatile__(
        "pld [%[pixels]]                nt"
        "pld [%[pixels], #32]           nt"
        "and r12, %[pixels], #7         nt"
        "tmcr wcgr1, r12                nt"
        "bic %[pixels], %[pixels], #7   nt"
        "wldrd wr10, [%[pixels]]        nt"
        "wldrd wr11, [%[pixels], #8]    nt"
        "wldrd wr12, [%[pixels], #16]   nt"
        "add %[pixels], %[pixels], %[line_size] nt"
        "pld [%[pixels]]                nt"
        "pld [%[pixels], #32]           nt"
        "walignr1 wr0, wr10, wr11       nt"
        "walignr1 wr1, wr11, wr12       nt"
        "1:                             nt"
        "wldrd wr10, [%[pixels]]        nt"
        "wldrd wr11, [%[pixels], #8]    nt"
        "wldrd wr12, [%[pixels], #16]   nt"
        "add %[pixels], %[pixels], %[line_size] nt"
        "pld [%[pixels]]                nt"
        "pld [%[pixels], #32]           nt"
        "walignr1 wr4, wr10, wr11       nt"
        "walignr1 wr5, wr11, wr12       nt"
        WAVG2B" wr8, wr0, wr4           nt"
        WAVG2B" wr9, wr1, wr5           nt"
        "wstrd wr8, [%[block]]          nt"
        "wstrd wr9, [%[block], #8]      nt"
        "add %[block], %[block], %[line_size]   nt"
        "wldrd wr10, [%[pixels]]        nt"
        "wldrd wr11, [%[pixels], #8]    nt"
        "wldrd wr12, [%[pixels], #16]   nt"
        "add %[pixels], %[pixels], %[line_size] nt"
        "pld [%[pixels]]                nt"
        "pld [%[pixels], #32]           nt"
        "walignr1 wr0, wr10, wr11       nt"
        "walignr1 wr1, wr11, wr12       nt"
        WAVG2B" wr8, wr0, wr4           nt"
        WAVG2B" wr9, wr1, wr5           nt"
        "wstrd wr8, [%[block]]          nt"
        "wstrd wr9, [%[block], #8]      nt"
        "add %[block], %[block], %[line_size]   nt"
        "subs %[h], %[h], #2            nt"
        "bne 1b                         nt"
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
        :
        : "r4", "r5", "r12", "memory");
}
void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
{
    int stride = line_size;
    // [wr0 wr1 wr2 wr3] for previous line
    // [wr4 wr5 wr6 wr7] for current line
    __asm__ __volatile__(
        "pld [%[pixels]]                nt"
        "pld [%[pixels], #32]           nt"
        "and r12, %[pixels], #7         nt"
        "tmcr wcgr1, r12                nt"
        "bic %[pixels], %[pixels], #7   nt"
        "wldrd wr10, [%[pixels]]        nt"
        "wldrd wr11, [%[pixels], #8]    nt"
        "pld [%[block]]                 nt"
        "wldrd wr12, [%[pixels], #16]   nt"
        "add %[pixels], %[pixels], %[line_size] nt"
        "pld [%[pixels]]                nt"
        "pld [%[pixels], #32]           nt"
        "walignr1 wr0, wr10, wr11       nt"
        "walignr1 wr1, wr11, wr12       nt"
        "1:                             nt"
        "wldrd wr10, [%[pixels]]        nt"
        "wldrd wr11, [%[pixels], #8]    nt"
        "wldrd wr12, [%[pixels], #16]   nt"
        "add %[pixels], %[pixels], %[line_size] nt"
        "pld [%[pixels]]                nt"
        "pld [%[pixels], #32]           nt"
        "walignr1 wr4, wr10, wr11       nt"
        "walignr1 wr5, wr11, wr12       nt"
        "wldrd wr10, [%[block]]         nt"
        "wldrd wr11, [%[block], #8]     nt"
        WAVG2B" wr8, wr0, wr4           nt"
        WAVG2B" wr9, wr1, wr5           nt"
        WAVG2B" wr8, wr8, wr10          nt"
        WAVG2B" wr9, wr9, wr11          nt"
        "wstrd wr8, [%[block]]          nt"
        "wstrd wr9, [%[block], #8]      nt"
        "add %[block], %[block], %[line_size]   nt"
        "wldrd wr10, [%[pixels]]        nt"
        "wldrd wr11, [%[pixels], #8]    nt"
        "pld [%[block]]                 nt"
        "wldrd wr12, [%[pixels], #16]   nt"
        "add %[pixels], %[pixels], %[line_size] nt"
        "pld [%[pixels]]                nt"
        "pld [%[pixels], #32]           nt"
        "walignr1 wr0, wr10, wr11       nt"
        "walignr1 wr1, wr11, wr12       nt"
        "wldrd wr10, [%[block]]         nt"
        "wldrd wr11, [%[block], #8]     nt"
        WAVG2B" wr8, wr0, wr4           nt"
        WAVG2B" wr9, wr1, wr5           nt"
        WAVG2B" wr8, wr8, wr10          nt"
        WAVG2B" wr9, wr9, wr11          nt"
        "wstrd wr8, [%[block]]          nt"
        "wstrd wr9, [%[block], #8]      nt"
        "add %[block], %[block], %[line_size]   nt"
        "subs %[h], %[h], #2            nt"
        "pld [%[block]]                 nt"
        "bne 1b                         nt"
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
        :
        : "r4", "r5", "r12", "memory");
}
void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
{
    // [wr0 wr1 wr2 wr3] for previous line
    // [wr4 wr5 wr6 wr7] for current line
    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
    __asm__ __volatile__(
        "pld [%[pixels]]                nt"
        "mov r12, #2                    nt"
        "pld [%[pixels], #32]           nt"
        "tmcr wcgr0, r12                nt" /* for shift value */
        "and r12, %[pixels], #7         nt"
        "bic %[pixels], %[pixels], #7   nt"
        "tmcr wcgr1, r12                nt"
        // [wr0 wr1 wr2 wr3] <= *
        // [wr4 wr5 wr6 wr7]
        "wldrd wr12, [%[pixels]]        nt"
        "add r12, r12, #1               nt"
        "wldrd wr13, [%[pixels], #8]    nt"
        "tmcr wcgr2, r12                nt"
        "add %[pixels], %[pixels], %[line_size] nt"
        "cmp r12, #8                    nt"
        "pld [%[pixels]]                nt"
        "pld [%[pixels], #32]           nt"
        "walignr1 wr2, wr12, wr13       nt"
        "wmoveq wr10, wr13              nt"
        "walignr2ne wr10, wr12, wr13    nt"
        "wunpckelub wr0, wr2            nt"
        "wunpckehub wr1, wr2            nt"
        "wunpckelub wr8, wr10           nt"
        "wunpckehub wr9, wr10           nt"
        "waddhus wr0, wr0, wr8          nt"
        "waddhus wr1, wr1, wr9          nt"
        "1:                             nt"
        // [wr0 wr1 wr2 wr3]
        // [wr4 wr5 wr6 wr7] <= *
        "wldrd wr12, [%[pixels]]        nt"
        "cmp r12, #8                    nt"
        "wldrd wr13, [%[pixels], #8]    nt"
        "add %[pixels], %[pixels], %[line_size] nt"
        "walignr1 wr6, wr12, wr13       nt"
        "pld [%[pixels]]                nt"
        "pld [%[pixels], #32]           nt"
        "wmoveq wr10, wr13              nt"
        "walignr2ne wr10, wr12, wr13    nt"
        "wunpckelub wr4, wr6            nt"
        "wunpckehub wr5, wr6            nt"
        "wunpckelub wr8, wr10           nt"
        "wunpckehub wr9, wr10           nt"
        "waddhus wr4, wr4, wr8          nt"
        "waddhus wr5, wr5, wr9          nt"
        "waddhus wr8, wr0, wr4          nt"
        "waddhus wr9, wr1, wr5          nt"
        "waddhus wr8, wr8, wr15         nt"
        "waddhus wr9, wr9, wr15         nt"
        "wsrlhg wr8, wr8, wcgr0         nt"
        "wsrlhg wr9, wr9, wcgr0         nt"
        "wpackhus wr8, wr8, wr9         nt"
        "wstrd wr8, [%[block]]          nt"
        "add %[block], %[block], %[line_size]   nt"
        // [wr0 wr1 wr2 wr3] <= *
        // [wr4 wr5 wr6 wr7]
        "wldrd wr12, [%[pixels]]        nt"
        "wldrd wr13, [%[pixels], #8]    nt"
        "add %[pixels], %[pixels], %[line_size] nt"
        "walignr1 wr2, wr12, wr13       nt"
        "pld [%[pixels]]                nt"
        "pld [%[pixels], #32]           nt"
        "wmoveq wr10, wr13              nt"
        "walignr2ne wr10, wr12, wr13    nt"
        "wunpckelub wr0, wr2            nt"
        "wunpckehub wr1, wr2            nt"
        "wunpckelub wr8, wr10           nt"
        "wunpckehub wr9, wr10           nt"
        "waddhus wr0, wr0, wr8          nt"
        "waddhus wr1, wr1, wr9          nt"
        "waddhus wr8, wr0, wr4          nt"
        "waddhus wr9, wr1, wr5          nt"
        "waddhus wr8, wr8, wr15         nt"
        "waddhus wr9, wr9, wr15         nt"
        "wsrlhg wr8, wr8, wcgr0         nt"
        "wsrlhg wr9, wr9, wcgr0         nt"
        "wpackhus wr8, wr8, wr9         nt"
        "subs %[h], %[h], #2            nt"
        "wstrd wr8, [%[block]]          nt"
        "add %[block], %[block], %[line_size]   nt"
        "bne 1b                         nt"
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
        : [line_size]"r"(line_size)
        : "r12", "memory");
}
void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
{
    // [wr0 wr1 wr2 wr3] for previous line
    // [wr4 wr5 wr6 wr7] for current line
    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
    __asm__ __volatile__(
        "pld [%[pixels]]                nt"
        "mov r12, #2                    nt"
        "pld [%[pixels], #32]           nt"
        "tmcr wcgr0, r12                nt" /* for shift value */
        /* alignment */
        "and r12, %[pixels], #7         nt"
        "bic %[pixels], %[pixels], #7   nt"
        "tmcr wcgr1, r12                nt"
        "add r12, r12, #1               nt"
        "tmcr wcgr2, r12                nt"
        // [wr0 wr1 wr2 wr3] <= *
        // [wr4 wr5 wr6 wr7]
        "wldrd wr12, [%[pixels]]        nt"
        "cmp r12, #8                    nt"
        "wldrd wr13, [%[pixels], #8]    nt"
        "wldrd wr14, [%[pixels], #16]   nt"
        "add %[pixels], %[pixels], %[line_size] nt"
        "pld [%[pixels]]                nt"
        "walignr1 wr2, wr12, wr13       nt"
        "pld [%[pixels], #32]           nt"
        "walignr1 wr3, wr13, wr14       nt"
        "wmoveq wr10, wr13              nt"
        "wmoveq wr11, wr14              nt"
        "walignr2ne wr10, wr12, wr13    nt"
        "walignr2ne wr11, wr13, wr14    nt"
        "wunpckelub wr0, wr2            nt"
        "wunpckehub wr1, wr2            nt"
        "wunpckelub wr2, wr3            nt"
        "wunpckehub wr3, wr3            nt"
        "wunpckelub wr8, wr10           nt"
        "wunpckehub wr9, wr10           nt"
        "wunpckelub wr10, wr11          nt"
        "wunpckehub wr11, wr11          nt"
        "waddhus wr0, wr0, wr8          nt"
        "waddhus wr1, wr1, wr9          nt"
        "waddhus wr2, wr2, wr10         nt"
        "waddhus wr3, wr3, wr11         nt"
        "1:                             nt"
        // [wr0 wr1 wr2 wr3]
        // [wr4 wr5 wr6 wr7] <= *
        "wldrd wr12, [%[pixels]]        nt"
        "cmp r12, #8                    nt"
        "wldrd wr13, [%[pixels], #8]    nt"
        "wldrd wr14, [%[pixels], #16]   nt"
        "add %[pixels], %[pixels], %[line_size] nt"
        "walignr1 wr6, wr12, wr13       nt"
        "pld [%[pixels]]                nt"
        "pld [%[pixels], #32]           nt"
        "walignr1 wr7, wr13, wr14       nt"
        "wmoveq wr10, wr13              nt"
        "wmoveq wr11, wr14              nt"
        "walignr2ne wr10, wr12, wr13    nt"
        "walignr2ne wr11, wr13, wr14    nt"
        "wunpckelub wr4, wr6            nt"
        "wunpckehub wr5, wr6            nt"
        "wunpckelub wr6, wr7            nt"
        "wunpckehub wr7, wr7            nt"
        "wunpckelub wr8, wr10           nt"
        "wunpckehub wr9, wr10           nt"
        "wunpckelub wr10, wr11          nt"
        "wunpckehub wr11, wr11          nt"
        "waddhus wr4, wr4, wr8          nt"
        "waddhus wr5, wr5, wr9          nt"
        "waddhus wr6, wr6, wr10         nt"
        "waddhus wr7, wr7, wr11         nt"
        "waddhus wr8, wr0, wr4          nt"
        "waddhus wr9, wr1, wr5          nt"
        "waddhus wr10, wr2, wr6         nt"
        "waddhus wr11, wr3, wr7         nt"
        "waddhus wr8, wr8, wr15         nt"
        "waddhus wr9, wr9, wr15         nt"
        "waddhus wr10, wr10, wr15       nt"
        "waddhus wr11, wr11, wr15       nt"
        "wsrlhg wr8, wr8, wcgr0         nt"
        "wsrlhg wr9, wr9, wcgr0         nt"
        "wsrlhg wr10, wr10, wcgr0       nt"
        "wsrlhg wr11, wr11, wcgr0       nt"
        "wpackhus wr8, wr8, wr9         nt"
        "wpackhus wr9, wr10, wr11       nt"
        "wstrd wr8, [%[block]]          nt"
        "wstrd wr9, [%[block], #8]      nt"
        "add %[block], %[block], %[line_size]   nt"
        // [wr0 wr1 wr2 wr3] <= *
        // [wr4 wr5 wr6 wr7]
        "wldrd wr12, [%[pixels]]        nt"
        "wldrd wr13, [%[pixels], #8]    nt"
        "wldrd wr14, [%[pixels], #16]   nt"
        "add %[pixels], %[pixels], %[line_size] nt"
        "walignr1 wr2, wr12, wr13       nt"
        "pld [%[pixels]]                nt"
        "pld [%[pixels], #32]           nt"
        "walignr1 wr3, wr13, wr14       nt"
        "wmoveq wr10, wr13              nt"
        "wmoveq wr11, wr14              nt"
        "walignr2ne wr10, wr12, wr13    nt"
        "walignr2ne wr11, wr13, wr14    nt"
        "wunpckelub wr0, wr2            nt"
        "wunpckehub wr1, wr2            nt"
        "wunpckelub wr2, wr3            nt"
        "wunpckehub wr3, wr3            nt"
        "wunpckelub wr8, wr10           nt"
        "wunpckehub wr9, wr10           nt"
        "wunpckelub wr10, wr11          nt"
        "wunpckehub wr11, wr11          nt"
        "waddhus wr0, wr0, wr8          nt"
        "waddhus wr1, wr1, wr9          nt"
        "waddhus wr2, wr2, wr10         nt"
        "waddhus wr3, wr3, wr11         nt"
        "waddhus wr8, wr0, wr4          nt"
        "waddhus wr9, wr1, wr5          nt"
        "waddhus wr10, wr2, wr6         nt"
        "waddhus wr11, wr3, wr7         nt"
        "waddhus wr8, wr8, wr15         nt"
        "waddhus wr9, wr9, wr15         nt"
        "waddhus wr10, wr10, wr15       nt"
        "waddhus wr11, wr11, wr15       nt"
        "wsrlhg wr8, wr8, wcgr0         nt"
        "wsrlhg wr9, wr9, wcgr0         nt"
        "wsrlhg wr10, wr10, wcgr0       nt"
        "wsrlhg wr11, wr11, wcgr0       nt"
        "wpackhus wr8, wr8, wr9         nt"
        "wpackhus wr9, wr10, wr11       nt"
        "wstrd wr8, [%[block]]          nt"
        "wstrd wr9, [%[block], #8]      nt"
        "add %[block], %[block], %[line_size]   nt"
        "subs %[h], %[h], #2            nt"
        "bne 1b                         nt"
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
        : [line_size]"r"(line_size)
        : "r12", "memory");
}
void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
{
    // [wr0 wr1 wr2 wr3] for previous line
    // [wr4 wr5 wr6 wr7] for current line
    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
    __asm__ __volatile__(
        "pld [%[block]]                 nt"
        "pld [%[block], #32]            nt"
        "pld [%[pixels]]                nt"
        "mov r12, #2                    nt"
        "pld [%[pixels], #32]           nt"
        "tmcr wcgr0, r12                nt" /* for shift value */
        "and r12, %[pixels], #7         nt"
        "bic %[pixels], %[pixels], #7   nt"
        "tmcr wcgr1, r12                nt"
        // [wr0 wr1 wr2 wr3] <= *
        // [wr4 wr5 wr6 wr7]
        "wldrd wr12, [%[pixels]]        nt"
        "add r12, r12, #1               nt"
        "wldrd wr13, [%[pixels], #8]    nt"
        "tmcr wcgr2, r12                nt"
        "add %[pixels], %[pixels], %[line_size] nt"
        "cmp r12, #8                    nt"
        "pld [%[pixels]]                nt"
        "pld [%[pixels], #32]           nt"
        "walignr1 wr2, wr12, wr13       nt"
        "wmoveq wr10, wr13              nt"
        "walignr2ne wr10, wr12, wr13    nt"
        "wunpckelub wr0, wr2            nt"
        "wunpckehub wr1, wr2            nt"
        "wunpckelub wr8, wr10           nt"
        "wunpckehub wr9, wr10           nt"
        "waddhus wr0, wr0, wr8          nt"
        "waddhus wr1, wr1, wr9          nt"
        "1:                             nt"
        // [wr0 wr1 wr2 wr3]
        // [wr4 wr5 wr6 wr7] <= *
        "wldrd wr12, [%[pixels]]        nt"
        "cmp r12, #8                    nt"
        "wldrd wr13, [%[pixels], #8]    nt"
        "add %[pixels], %[pixels], %[line_size] nt"
        "walignr1 wr6, wr12, wr13       nt"
        "pld [%[pixels]]                nt"
        "pld [%[pixels], #32]           nt"
        "wmoveq wr10, wr13              nt"
        "walignr2ne wr10, wr12, wr13    nt"
        "wunpckelub wr4, wr6            nt"
        "wunpckehub wr5, wr6            nt"
        "wunpckelub wr8, wr10           nt"
        "wunpckehub wr9, wr10           nt"
        "waddhus wr4, wr4, wr8          nt"
        "waddhus wr5, wr5, wr9          nt"
        "waddhus wr8, wr0, wr4          nt"
        "waddhus wr9, wr1, wr5          nt"
        "waddhus wr8, wr8, wr15         nt"
        "waddhus wr9, wr9, wr15         nt"
        "wldrd wr12, [%[block]]         nt"
        "wsrlhg wr8, wr8, wcgr0         nt"
        "wsrlhg wr9, wr9, wcgr0         nt"
        "wpackhus wr8, wr8, wr9         nt"
        WAVG2B" wr8, wr8, wr12          nt"
        "wstrd wr8, [%[block]]          nt"
        "add %[block], %[block], %[line_size]   nt"
        "wldrd wr12, [%[pixels]]        nt"
        "pld [%[block]]                 nt"
        "pld [%[block], #32]            nt"
        // [wr0 wr1 wr2 wr3] <= *
        // [wr4 wr5 wr6 wr7]
        "wldrd wr13, [%[pixels], #8]    nt"
        "add %[pixels], %[pixels], %[line_size] nt"
        "walignr1 wr2, wr12, wr13       nt"
        "pld [%[pixels]]                nt"
        "pld [%[pixels], #32]           nt"
        "wmoveq wr10, wr13              nt"
        "walignr2ne wr10, wr12, wr13    nt"
        "wunpckelub wr0, wr2            nt"
        "wunpckehub wr1, wr2            nt"
        "wunpckelub wr8, wr10           nt"
        "wunpckehub wr9, wr10           nt"
        "waddhus wr0, wr0, wr8          nt"
        "waddhus wr1, wr1, wr9          nt"
        "waddhus wr8, wr0, wr4          nt"
        "waddhus wr9, wr1, wr5          nt"
        "waddhus wr8, wr8, wr15         nt"
        "waddhus wr9, wr9, wr15         nt"
        "wldrd wr12, [%[block]]         nt"
        "wsrlhg wr8, wr8, wcgr0         nt"
        "wsrlhg wr9, wr9, wcgr0         nt"
        "wpackhus wr8, wr8, wr9         nt"
        "subs %[h], %[h], #2            nt"
        WAVG2B" wr8, wr8, wr12          nt"
        "wstrd wr8, [%[block]]          nt"
        "add %[block], %[block], %[line_size]   nt"
        "pld [%[block]]                 nt"
        "pld [%[block], #32]            nt"
        "bne 1b                         nt"
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
        : [line_size]"r"(line_size)
        : "r12", "memory");
}
void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
{
    // [wr0 wr1 wr2 wr3] for previous line
    // [wr4 wr5 wr6 wr7] for current line
    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
    __asm__ __volatile__(
        "pld [%[block]]                 nt"
        "pld [%[block], #32]            nt"
        "pld [%[pixels]]                nt"
        "mov r12, #2                    nt"
        "pld [%[pixels], #32]           nt"
        "tmcr wcgr0, r12                nt" /* for shift value */
        /* alignment */
        "and r12, %[pixels], #7         nt"
        "bic %[pixels], %[pixels], #7           nt"
        "tmcr wcgr1, r12                nt"
        "add r12, r12, #1               nt"
        "tmcr wcgr2, r12                nt"
        // [wr0 wr1 wr2 wr3] <= *
        // [wr4 wr5 wr6 wr7]
        "wldrd wr12, [%[pixels]]        nt"
        "cmp r12, #8                    nt"
        "wldrd wr13, [%[pixels], #8]    nt"
        "wldrd wr14, [%[pixels], #16]   nt"
        "add %[pixels], %[pixels], %[line_size] nt"
        "pld [%[pixels]]                nt"
        "walignr1 wr2, wr12, wr13       nt"
        "pld [%[pixels], #32]           nt"
        "walignr1 wr3, wr13, wr14       nt"
        "wmoveq wr10, wr13              nt"
        "wmoveq wr11, wr14              nt"
        "walignr2ne wr10, wr12, wr13    nt"
        "walignr2ne wr11, wr13, wr14    nt"
        "wunpckelub wr0, wr2            nt"
        "wunpckehub wr1, wr2            nt"
        "wunpckelub wr2, wr3            nt"
        "wunpckehub wr3, wr3            nt"
        "wunpckelub wr8, wr10           nt"
        "wunpckehub wr9, wr10           nt"
        "wunpckelub wr10, wr11          nt"
        "wunpckehub wr11, wr11          nt"
        "waddhus wr0, wr0, wr8          nt"
        "waddhus wr1, wr1, wr9          nt"
        "waddhus wr2, wr2, wr10         nt"
        "waddhus wr3, wr3, wr11         nt"
        "1:                             nt"
        // [wr0 wr1 wr2 wr3]
        // [wr4 wr5 wr6 wr7] <= *
        "wldrd wr12, [%[pixels]]        nt"
        "cmp r12, #8                    nt"
        "wldrd wr13, [%[pixels], #8]    nt"
        "wldrd wr14, [%[pixels], #16]   nt"
        "add %[pixels], %[pixels], %[line_size] nt"
        "walignr1 wr6, wr12, wr13       nt"
        "pld [%[pixels]]                nt"
        "pld [%[pixels], #32]           nt"
        "walignr1 wr7, wr13, wr14       nt"
        "wmoveq wr10, wr13              nt"
        "wmoveq wr11, wr14              nt"
        "walignr2ne wr10, wr12, wr13    nt"
        "walignr2ne wr11, wr13, wr14    nt"
        "wunpckelub wr4, wr6            nt"
        "wunpckehub wr5, wr6            nt"
        "wunpckelub wr6, wr7            nt"
        "wunpckehub wr7, wr7            nt"
        "wunpckelub wr8, wr10           nt"
        "wunpckehub wr9, wr10           nt"
        "wunpckelub wr10, wr11          nt"
        "wunpckehub wr11, wr11          nt"
        "waddhus wr4, wr4, wr8          nt"
        "waddhus wr5, wr5, wr9          nt"
        "waddhus wr6, wr6, wr10         nt"
        "waddhus wr7, wr7, wr11         nt"
        "waddhus wr8, wr0, wr4          nt"
        "waddhus wr9, wr1, wr5          nt"
        "waddhus wr10, wr2, wr6         nt"
        "waddhus wr11, wr3, wr7         nt"
        "waddhus wr8, wr8, wr15         nt"
        "waddhus wr9, wr9, wr15         nt"
        "waddhus wr10, wr10, wr15       nt"
        "waddhus wr11, wr11, wr15       nt"
        "wsrlhg wr8, wr8, wcgr0         nt"
        "wsrlhg wr9, wr9, wcgr0         nt"
        "wldrd wr12, [%[block]]         nt"
        "wldrd wr13, [%[block], #8]     nt"
        "wsrlhg wr10, wr10, wcgr0       nt"
        "wsrlhg wr11, wr11, wcgr0       nt"
        "wpackhus wr8, wr8, wr9         nt"
        "wpackhus wr9, wr10, wr11       nt"
        WAVG2B" wr8, wr8, wr12          nt"
        WAVG2B" wr9, wr9, wr13          nt"
        "wstrd wr8, [%[block]]          nt"
        "wstrd wr9, [%[block], #8]      nt"
        "add %[block], %[block], %[line_size]   nt"
        // [wr0 wr1 wr2 wr3] <= *
        // [wr4 wr5 wr6 wr7]
        "wldrd wr12, [%[pixels]]        nt"
        "pld [%[block]]                 nt"
        "wldrd wr13, [%[pixels], #8]    nt"
        "pld [%[block], #32]            nt"
        "wldrd wr14, [%[pixels], #16]   nt"
        "add %[pixels], %[pixels], %[line_size] nt"
        "walignr1 wr2, wr12, wr13       nt"
        "pld [%[pixels]]                nt"
        "pld [%[pixels], #32]           nt"
        "walignr1 wr3, wr13, wr14       nt"
        "wmoveq wr10, wr13              nt"
        "wmoveq wr11, wr14              nt"
        "walignr2ne wr10, wr12, wr13    nt"
        "walignr2ne wr11, wr13, wr14    nt"
        "wunpckelub wr0, wr2            nt"
        "wunpckehub wr1, wr2            nt"
        "wunpckelub wr2, wr3            nt"
        "wunpckehub wr3, wr3            nt"
        "wunpckelub wr8, wr10           nt"
        "wunpckehub wr9, wr10           nt"
        "wunpckelub wr10, wr11          nt"
        "wunpckehub wr11, wr11          nt"
        "waddhus wr0, wr0, wr8          nt"
        "waddhus wr1, wr1, wr9          nt"
        "waddhus wr2, wr2, wr10         nt"
        "waddhus wr3, wr3, wr11         nt"
        "waddhus wr8, wr0, wr4          nt"
        "waddhus wr9, wr1, wr5          nt"
        "waddhus wr10, wr2, wr6         nt"
        "waddhus wr11, wr3, wr7         nt"
        "waddhus wr8, wr8, wr15         nt"
        "waddhus wr9, wr9, wr15         nt"
        "waddhus wr10, wr10, wr15       nt"
        "waddhus wr11, wr11, wr15       nt"
        "wsrlhg wr8, wr8, wcgr0         nt"
        "wsrlhg wr9, wr9, wcgr0         nt"
        "wldrd wr12, [%[block]]         nt"
        "wldrd wr13, [%[block], #8]     nt"
        "wsrlhg wr10, wr10, wcgr0       nt"
        "wsrlhg wr11, wr11, wcgr0       nt"
        "wpackhus wr8, wr8, wr9         nt"
        "wpackhus wr9, wr10, wr11       nt"
        WAVG2B" wr8, wr8, wr12          nt"
        WAVG2B" wr9, wr9, wr13          nt"
        "wstrd wr8, [%[block]]          nt"
        "wstrd wr9, [%[block], #8]      nt"
        "add %[block], %[block], %[line_size]   nt"
        "subs %[h], %[h], #2            nt"
        "pld [%[block]]                 nt"
        "pld [%[block], #32]            nt"
        "bne 1b                         nt"
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
        : [line_size]"r"(line_size)
        : "r12", "memory");
}