; yuv12-rgb16.s
; Uploaded by: aoeyumen
; Upload date: 2007-01-06
; Archive size: 3329k
; File size: 19k
;-------------------------------------------------------------------------
; cxm12161 -- This function (exported below as yuv_2_rgb) performs
; YUV12-to-RGB16 color conversion for H26x.
; It handles any format in which there are three fields, the low
; order field being B and fully contained in the low order byte, the
; second field being G and being somewhere in bits 4 through 11,
; and the high order field being R and fully contained in the high
; order byte.
;
; The YUV12 input is planar, 8 bits per pel. The Y plane may have
; a pitch of up to 768. Its width may be less than or equal
; to the pitch. The plane must be DWORD aligned, and preferably QWORD
; aligned. Pitch and Width must be a multiple of four. For best
; performance, Pitch should not be 4 more than a multiple of 32.
; Height may be any amount, but must be a multiple of two. The U
; and V planes may have a different pitch than the Y plane, subject
; to the same limitations.
;
;include iammx.inc
;include locals.inc
;-------------------------------------------------------------------------
; Jump table indexed by CCType (0..3).  Each entry is the address of the
; per-format setup code inside yuv_2_rgb.
;-------------------------------------------------------------------------
RGB_formats:
        dd      RGB565                  ; CCType 0
        dd      RGB555                  ; CCType 1
        dd      RGB664                  ; CCType 2
        dd      RGB655                  ; CCType 3

;-------------------------------------------------------------------------
; 64-bit MMX constants, written out one SIMD lane at a time.
; The active coefficients look like the usual YCbCr->RGB factors scaled
; by 64; the values noted as "x256 variant" are the alternatives that
; were left commented out next to the original definitions.
;-------------------------------------------------------------------------
Minusg:   times 4 dw 0080h              ; 128 per word: removes the chroma bias from U and V
Yadd:     times 8 db 10h                ; 16 per byte: luma offset, subtracted with saturation
VtR:      times 4 dw 0066h              ; 102: V contribution to R   (x256 variant: 0199h)
VtG:      times 4 dw 0034h              ;  52: V contribution to G   (x256 variant: 00d0h); not referenced by yuv_2_rgb
UtG:      times 4 dw 0019h              ;  25: U contribution to G   (x256 variant: 0064h); not referenced by yuv_2_rgb
UtB:      times 4 dw 0081h              ; 129: U contribution to B   (x256 variant: 0205h)
Ymul:     times 4 dw 004ah              ;  74: luma gain             (x256 variant: 012ah)
UVtG:     dw 0019h, 0034h, 0019h, 0034h ; (25, 52) pairs for pmaddwd on interleaved (u, v) words
                                        ;                            (x256 variant: 0064h, 00d0h)
VtRUtB:   dw 0205h, 0199h, 0205h, 0199h ; not referenced by yuv_2_rgb

; Clamp masks: "paddusb x, mask" followed by "psubusb x, mask" limits each
; byte of x to the range 0 .. (0ffh - mask).
fourbitu: times 8 db 0f0h               ; limit 15 (4-bit field)
fivebitu: times 8 db 0e0h               ; limit 31 (5-bit field)
sixbitu:  times 8 db 0c0h               ; limit 63 (6-bit field)
;-------------------------------------------------------------------------
; Stack frame layout, as offsets from ESP after the prologue of yuv_2_rgb
; (four register pushes followed by "sub esp, LocalFrameSize").
;-------------------------------------------------------------------------
%assign LocalFrameSize          156     ; bytes of locals (layout below)
%assign RegisterStorageSize     16      ; four pushed 32-bit registers

; Arguments: they sit above the locals, the saved registers and the
; 4-byte return address.
%assign ArgBase                 LocalFrameSize + RegisterStorageSize + 4
%assign YPlane                  ArgBase + 0
%assign UPlane                  ArgBase + 4
%assign VPlane                  ArgBase + 8
%assign FrameWidth              ArgBase + 12
%assign FrameHeight             ArgBase + 16
%assign YPitch                  ArgBase + 20
%assign ChromaPitch             ArgBase + 24
%assign AspectAdjustmentCount   ArgBase + 28
%assign ColorConvertedFrame     ArgBase + 32
%assign DCIOffset               ArgBase + 36
%assign CCOffsetToLine0         ArgBase + 40
%assign CCOPitch                ArgBase + 44
%assign CCType                  ArgBase + 48
%assign EndOfArgList            ArgBase + 52

; Locals (on the local stack frame): 32-bit slots first ...
%assign CCOCursor               0
%assign CCOSkipDistance         4
%assign ChromaLineLen           8
%assign YCursor                 12
%assign DistanceFromVToU        16
%assign EndOfChromaLine         20
%assign AspectCount             24
%assign AspectBaseCount         28
%assign tmpYCursorEven          32
%assign tmpYCursorOdd           36
%assign tmpCCOPitch             40
; ... then a 48-byte scratch area (six quadwords) ...
%assign temp_mmx                44
; ... then 64-bit slots used directly as MMX shift counts and clamp masks.
%assign RLeftShift              92
%assign GLeftShift              100
%assign RRightShift             108
%assign GRightShift             116
%assign BRightShift             124
%assign RUpperLimit             132
%assign GUpperLimit             140
%assign BUpperLimit             148     ; last slot: 148 + 8 = LocalFrameSize
; extern void C MMX_YUV12ToRGB16 (
; U8* YPlane,
; U8* UPlane,
; U8* VPlane,
; UN FrameWidth,
; UN FrameHeight,
; UN YPitch,
; UN VPitch,                (named ChromaPitch in the stack offsets; used for both U and V)
; UN AspectAdjustmentCount,
; U8* ColorConvertedFrame,
; U32 DCIOffset,
; U32 CCOffsetToLine0,
; IN CCOPitch,
; IN CCType)
;
; The local variables are on the stack.
; The tables are in the one and only data segment.
;
; CCOffsetToLine0 is relative to ColorConvertedFrame.
; CCType is used by the RGB color convertors to determine the exact
; conversion type:
; RGB565 = 0
; RGB555 = 1
; RGB664 = 2
; RGB655 = 3
global yuv_2_rgb

;------------------------------------------------------------------------------
; yuv_2_rgb -- planar YUV12 to packed 16-bit RGB conversion using MMX.
;
; Arguments are read from the stack (32-bit, C calling convention), in this
; order:
;       YPlane, UPlane, VPlane, FrameWidth, FrameHeight, YPitch, ChromaPitch,
;       AspectAdjustmentCount, ColorConvertedFrame, DCIOffset,
;       CCOffsetToLine0, CCOPitch, CCType
;
; CCType selects the output layout through the RGB_formats jump table:
;       0 = RGB565, 1 = RGB555, 2 = RGB664, 3 = RGB655.
; A CCType of 4 or more, a zero FrameWidth or FrameHeight, or an
; AspectAdjustmentCount of exactly 1 makes the function return without
; writing any pixels.
;
; Each pass of the inner loop converts an 8x2 block: 4 U and 4 V samples,
; 8 Y samples from an even line and 8 Y samples from the odd line below it.
; It always reads 8 luma pixels and stores 16 output bytes per line, so the
; width is effectively processed in steps of 8 pixels.
;
; Preserves: ebx, ebp, esi, edi (saved and restored here).
; Clobbers:  eax, ecx, edx, flags, mm0-mm7 (emms is executed before return).
; Side effects on the caller's argument area: the FrameWidth slot is
; overwritten with -(FrameWidth/2) and the FrameHeight slot is counted down.
;------------------------------------------------------------------------------
yuv_2_rgb:
        push    esi
        push    edi
        push    ebp
        push    ebx
        sub     esp, LocalFrameSize

        ; Both loops below test their counters only after the first pass,
        ; so an empty frame has to be rejected up front.
        cmp     dword [esp+FrameWidth], 0
        je      near finish
        cmp     dword [esp+FrameHeight], 0
        je      near finish

        mov     eax, [esp+CCType]
        cmp     eax, 4                          ; unsigned check: only 0..3 are valid
        jae     near finish
        jmp     [RGB_formats+eax*4]

;------------------------------------------------------------------------------
; Per-format setup.  Each variant fills in:
;   RLeftShift  - bit position of R inside the high output byte
;   GLeftShift  - bit position of G inside the 16-bit pixel
;   xRightShift - right shift that scales each fixed-point channel sum down
;                 to its field width
;   xUpperLimit - clamp mask for each channel (see fourbitu/fivebitu/sixbitu)
; The shift counts are stored as 64-bit values (high dword zero) because the
; MMX shift instructions read a full quadword count from memory.
;------------------------------------------------------------------------------
RGB555:
        xor     eax, eax
        mov     ebx, 2                          ; R starts at bit 10 -> bit 10-8 of the high byte
        mov     [esp+RLeftShift], ebx
        mov     [esp+RLeftShift+4], eax
        mov     ebx, 5
        mov     [esp+GLeftShift], ebx
        mov     [esp+GLeftShift+4], eax
        mov     ebx, 9
        mov     [esp+RRightShift], ebx
        mov     [esp+RRightShift+4], eax
        mov     [esp+GRightShift], ebx
        mov     [esp+GRightShift+4], eax
        mov     [esp+BRightShift], ebx
        mov     [esp+BRightShift+4], eax
        movq    mm0, [fivebitu]
        movq    [esp+RUpperLimit], mm0
        movq    [esp+GUpperLimit], mm0
        movq    [esp+BUpperLimit], mm0
        jmp     RGBEND

RGB664:
        xor     eax, eax
        mov     ebx, 2                          ; 8-6: 6-bit R at the top of the high byte
        mov     [esp+RLeftShift], ebx
        mov     [esp+RLeftShift+4], eax
        mov     ebx, 4
        mov     [esp+GLeftShift], ebx
        mov     [esp+GLeftShift+4], eax
        mov     ebx, 8
        mov     [esp+RRightShift], ebx
        mov     [esp+RRightShift+4], eax
        mov     [esp+GRightShift], ebx
        mov     [esp+GRightShift+4], eax
        mov     ebx, 10
        mov     [esp+BRightShift], ebx
        mov     [esp+BRightShift+4], eax
        movq    mm0, [sixbitu]
        movq    [esp+RUpperLimit], mm0
        movq    [esp+GUpperLimit], mm0
        movq    mm0, [fourbitu]
        movq    [esp+BUpperLimit], mm0
        jmp     RGBEND

RGB655:
        xor     eax, eax
        mov     ebx, 2                          ; 8-6: 6-bit R at the top of the high byte
        mov     [esp+RLeftShift], ebx
        mov     [esp+RLeftShift+4], eax
        mov     ebx, 5
        mov     [esp+GLeftShift], ebx
        mov     [esp+GLeftShift+4], eax
        mov     ebx, 8
        mov     [esp+RRightShift], ebx
        mov     [esp+RRightShift+4], eax
        mov     ebx, 9
        mov     [esp+GRightShift], ebx
        mov     [esp+GRightShift+4], eax
        mov     [esp+BRightShift], ebx
        mov     [esp+BRightShift+4], eax
        movq    mm0, [sixbitu]
        movq    [esp+RUpperLimit], mm0
        movq    mm0, [fivebitu]
        movq    [esp+GUpperLimit], mm0
        movq    [esp+BUpperLimit], mm0
        jmp     RGBEND

RGB565:
        xor     eax, eax
        mov     ebx, 3                          ; 8-5: 5-bit R at the top of the high byte
        mov     [esp+RLeftShift], ebx
        mov     [esp+RLeftShift+4], eax
        mov     ebx, 5
        mov     [esp+GLeftShift], ebx
        mov     [esp+GLeftShift+4], eax
        mov     ebx, 9
        mov     [esp+RRightShift], ebx
        mov     [esp+RRightShift+4], eax
        mov     [esp+BRightShift], ebx
        mov     [esp+BRightShift+4], eax
        mov     ebx, 8
        mov     [esp+GRightShift], ebx
        mov     [esp+GRightShift+4], eax
        movq    mm0, [fivebitu]
        movq    [esp+RUpperLimit], mm0
        movq    [esp+BUpperLimit], mm0
        movq    mm0, [sixbitu]
        movq    [esp+GUpperLimit], mm0
        ; falls through into RGBEND

;------------------------------------------------------------------------------
; Common setup: compute the cursors used by the conversion loops.
;------------------------------------------------------------------------------
RGBEND:
        mov     ebx, [esp+VPlane]
        mov     ecx, [esp+UPlane]
        sub     ecx, ebx
        mov     [esp+DistanceFromVToU], ecx     ; UPlane - VPlane
        mov     eax, [esp+ColorConvertedFrame]
        add     eax, [esp+DCIOffset]
        add     eax, [esp+CCOffsetToLine0]
        mov     [esp+CCOCursor], eax            ; address of the first output pixel
        mov     ebx, [esp+FrameWidth]
        mov     eax, [esp+CCOPitch]
        sub     eax, ebx                        ; CCOPitch - FrameWidth
        sub     eax, ebx                        ; CCOPitch - 2*FrameWidth
        sar     ebx, 1                          ; FrameWidth/2
        mov     esi, [esp+YPlane]               ; cursor over the luma plane
        mov     [esp+ChromaLineLen], ebx        ; FrameWidth/2
        mov     [esp+CCOSkipDistance], eax      ; CCOPitch - 2*FrameWidth (2 output bytes per pixel)
        mov     [esp+YCursor], esi
        mov     edx, [esp+AspectAdjustmentCount]
        mov     esi, [esp+VPlane]
        cmp     edx, 1
        je      near finish                     ; a count of 1 converts nothing
        mov     [esp+AspectCount], edx
        mov     [esp+AspectBaseCount], edx
        mov     edi, [esp+ChromaLineLen]
        mov     [esp+EndOfChromaLine], edi
        mov     edi, [esp+CCOCursor]
        mov     edx, [esp+DistanceFromVToU]
        mov     ebp, [esp+YCursor]              ; start of the first luma line
        mov     ebx, [esp+FrameWidth]
        add     ebp, ebx
        mov     [esp+tmpYCursorEven], ebp       ; end of the even luma line
        mov     eax, [esp+YPitch]
        add     ebp, eax
        mov     [esp+tmpYCursorOdd], ebp        ; end of the odd luma line
        sar     ebx, 1
        add     esi, ebx                        ; esi = end of the V line
        add     edx, esi                        ; edx = end of the U line
        neg     ebx
        mov     [esp+FrameWidth], ebx           ; slot now holds -(FrameWidth/2)

;------------------------------------------------------------------------------
; Register usage inside the loops:
;   esi  end of the current V line      edx  end of the current U line
;   ebx  negative chroma index, runs from -(FrameWidth/2) up towards 0 in
;        steps of 4; luma is addressed as [ebp+2*ebx]
;   ebp  end of the current even / odd luma line
;   edi  output cursor (even output line)
;   eax  tmpCCOPitch: distance from the even to the odd output line
;------------------------------------------------------------------------------
PrepareChromaLine:
        mov     ebp, [esp+AspectCount]
        mov     ebx, [esp+FrameWidth]           ; restart the chroma index for this line pair
        sub     ebp, 2
        mov     eax, [esp+CCOPitch]             ; mov leaves the flags of the sub intact
        mov     [esp+tmpCCOPitch], eax
        ja      continue
        ; Counter ran out: reload it and use a zero odd-line distance, so
        ; the odd line is stored over the even one and the output advances
        ; by a single line for this pair.
        xor     eax, eax
        add     ebp, [esp+AspectAdjustmentCount]
        mov     [esp+tmpCCOPitch], eax
continue:
        mov     [esp+AspectCount], ebp

do_next_8x2_block:
        mov     ebp, [esp+tmpYCursorEven]
        ;---------------------------------------------------------------- even line
        movd    mm1, [edx+ebx]                  ; 4 u values
        pxor    mm0, mm0                        ; mm0 = 0
        movd    mm2, [esi+ebx]                  ; 4 v values
        punpcklbw mm1, mm0                      ; 4 u as words
        psubw   mm1, [Minusg]                   ; 4 x (u-128)
        punpcklbw mm2, mm0                      ; 4 v as words
        psubw   mm2, [Minusg]                   ; 4 x (v-128)
        movq    mm3, mm1                        ; copy of u-128
        movq    mm5, mm1                        ; copy of u-128 (for B)
        punpcklwd mm1, mm2                      ; low 2 (u, v) pairs
        pmaddwd mm1, [UVtG]                     ; 25*u + 52*v for the low pairs
        punpckhwd mm3, mm2                      ; high 2 (u, v) pairs
        pmaddwd mm3, [UVtG]                     ; 25*u + 52*v for the high pairs
        movq    [temp_mmx+esp], mm2             ; save v-128
        movq    mm6, [ebp+2*ebx]                ; 8 y pixels
        psubusb mm6, [Yadd]                     ; 8 x (y-16), saturating at 0
        packssdw mm1, mm3                       ; 4 chroma G terms as signed words
        movq    mm7, mm6                        ; copy of the 8 y-16 values
        punpcklbw mm6, mm0                      ; low 4 y-16 as words
        pmullw  mm6, [Ymul]                     ; low 4 luma terms
        punpckhbw mm7, mm0                      ; high 4 y-16 as words
        pmullw  mm7, [Ymul]                     ; high 4 luma terms
        movq    mm4, mm1
        movq    [temp_mmx+esp+8], mm1           ; save 4 chroma G terms for the odd line
        punpcklwd mm1, mm1                      ; chroma G: replicate low 2
        movq    mm0, mm6                        ; keep low luma terms
        punpckhwd mm4, mm4                      ; chroma G: replicate high 2
        movq    mm3, mm7                        ; keep high luma terms
        psubw   mm6, mm1                        ; 4 low G sums
        psraw   mm6, [esp+GRightShift]          ; scale low G to its field width
        psubw   mm7, mm4                        ; 4 high G sums
        movq    mm2, mm5
        punpcklwd mm5, mm5                      ; replicate the 2 low u values
        pmullw  mm5, [UtB]                      ; low chroma B terms
        punpckhwd mm2, mm2                      ; replicate the 2 high u values
        psraw   mm7, [esp+GRightShift]          ; scale high G to its field width
        pmullw  mm2, [UtB]                      ; high chroma B terms
        packuswb mm6, mm7                       ; mm6: G7 G6 G5 G4 G3 G2 G1 G0
        movq    [temp_mmx+esp+16], mm5          ; save low chroma B for the odd line
        paddsw  mm5, mm0                        ; 4 low B sums; saturating, since luma + 129*u can exceed 32767
        movq    [temp_mmx+esp+40], mm2          ; save high chroma B for the odd line
        paddsw  mm2, mm3                        ; 4 high B sums (saturating, see above)
        psraw   mm5, [esp+BRightShift]          ; scale low B to its field width
        psraw   mm2, [esp+BRightShift]          ; scale high B to its field width
        packuswb mm5, mm2                       ; mm5: B7 B6 B5 B4 B3 B2 B1 B0
        movq    mm2, [temp_mmx+esp]             ; 4 x (v-128)
        movq    mm1, mm5                        ; mm1 = B
        movq    mm7, mm2
        punpcklwd mm2, mm2                      ; replicate the 2 low v values
        pmullw  mm2, [VtR]                      ; low chroma R terms
        punpckhwd mm7, mm7                      ; replicate the 2 high v values
        pmullw  mm7, [VtR]                      ; high chroma R terms
        paddusb mm1, [esp+BUpperLimit]          ; clamp B, step 1: saturating add of the mask
        movq    [temp_mmx+esp+24], mm2          ; save low chroma R for the odd line
        paddw   mm2, mm0                        ; 4 low R sums
        psraw   mm2, [esp+RRightShift]          ; scale low R to its field width
        pxor    mm4, mm4                        ; mm4 = 0
        movq    [temp_mmx+esp+32], mm7          ; save high chroma R for the odd line
        paddw   mm7, mm3                        ; 4 high R sums
        psraw   mm7, [esp+RRightShift]          ; scale high R to its field width
        psubusb mm1, [esp+BUpperLimit]          ; clamp B, step 2: subtract the mask again
        packuswb mm2, mm7                       ; mm2: R7 R6 R5 R4 R3 R2 R1 R0
        paddusb mm6, [esp+GUpperLimit]          ; clamp G
        psubusb mm6, [esp+GUpperLimit]
        paddusb mm2, [esp+RUpperLimit]          ; clamp R
        psubusb mm2, [esp+RUpperLimit]

        ; Pack the three byte vectors into eight 16-bit pixels.
        ;   in:  mm6 = G7..G0, mm1 = B7..B0, mm2 = R7..R0, each already
        ;        limited to its field width
        ;   out: mm1 = pixels 0..3, mm7 = pixels 4..7
        ; For RGB565 this is pixel = 256*(R << (8-5)) + (G << 5) + B.
        psllq   mm2, [esp+RLeftShift]           ; move R to the top of each byte
        movq    mm7, mm1                        ; mm7 = B
        ; B is already in the low bits of its byte, so interleaving B (low
        ; byte) with R (high byte) yields the R and B fields of each pixel.
        punpcklbw mm1, mm2                      ; mm1: low 4 pixels, R and B fields
        pxor    mm0, mm0                        ; mm0 = 0
        punpckhbw mm7, mm2                      ; mm7: high 4 pixels, R and B fields
        movq    mm3, mm6                        ; mm3 = G
        punpcklbw mm6, mm0                      ; mm6: low 4 G as words
        psllw   mm6, [esp+GLeftShift]           ; move low G into place
        punpckhbw mm3, mm0                      ; mm3: high 4 G as words
        por     mm1, mm6                        ; mm1: low 4 finished pixels
        psllw   mm3, [esp+GLeftShift]           ; move high G into place
        por     mm7, mm3                        ; mm7: high 4 finished pixels
        mov     ebp, [esp+tmpYCursorOdd]        ; fetched early, ahead of the odd line
        movq    [edi], mm1                      ; store pixels 0..3 of the even line
        ;---------------------------------------------------------------- odd line
        movq    mm1, [ebp+2*ebx]                ; 8 y pixels
        pxor    mm2, mm2
        psubusb mm1, [Yadd]                     ; 8 x (y-16), saturating at 0
        movq    mm5, mm1
        punpcklbw mm1, mm2                      ; low 4 y-16 as words
        pmullw  mm1, [Ymul]                     ; low 4 luma terms
        punpckhbw mm5, mm2                      ; high 4 y-16 as words
        pmullw  mm5, [Ymul]                     ; high 4 luma terms
        movq    [edi+8], mm7                    ; store pixels 4..7 of the even line
        movq    mm0, mm1
        paddw   mm0, [temp_mmx+esp+24]          ; 4 low R sums
        movq    mm6, mm5
        psraw   mm0, [esp+RRightShift]          ; scale low R to its field width
        paddw   mm5, [temp_mmx+esp+32]          ; 4 high R sums
        movq    mm2, mm1
        psraw   mm5, [esp+RRightShift]          ; scale high R to its field width
        paddsw  mm2, [temp_mmx+esp+16]          ; 4 low B sums (saturating, as on the even line)
        packuswb mm0, mm5                       ; mm0: R7 R6 R5 R4 R3 R2 R1 R0
        psraw   mm2, [esp+BRightShift]          ; scale low B to its field width
        movq    mm5, mm6
        paddsw  mm6, [temp_mmx+esp+40]          ; 4 high B sums (saturating)
        psraw   mm6, [esp+BRightShift]          ; scale high B to its field width
        movq    mm3, [temp_mmx+esp+8]           ; 4 chroma G terms
        packuswb mm2, mm6                       ; mm2: B7 B6 B5 B4 B3 B2 B1 B0
        movq    mm4, mm3
        punpcklwd mm3, mm3                      ; chroma G: replicate low 2
        punpckhwd mm4, mm4                      ; chroma G: replicate high 2
        psubw   mm1, mm3                        ; 4 low G sums
        psraw   mm1, [esp+GRightShift]          ; scale low G to its field width
        psubw   mm5, mm4                        ; 4 high G sums
        psraw   mm5, [esp+GRightShift]          ; scale high G to its field width
        paddusb mm2, [esp+BUpperLimit]          ; clamp B
        packuswb mm1, mm5                       ; mm1: G7 G6 G5 G4 G3 G2 G1 G0
        psubusb mm2, [esp+BUpperLimit]
        paddusb mm1, [esp+GUpperLimit]          ; clamp G
        psubusb mm1, [esp+GUpperLimit]
        paddusb mm0, [esp+RUpperLimit]          ; clamp R
        mov     eax, [esp+tmpCCOPitch]          ; distance to the odd output line (0 if dropped)
        psubusb mm0, [esp+RUpperLimit]

        ; Pack as for the even line.
        ;   in:  mm1 = G7..G0, mm2 = B7..B0, mm0 = R7..R0
        ;   out: mm2 = pixels 0..3, mm7 = pixels 4..7
        psllq   mm0, [esp+RLeftShift]           ; move R to the top of each byte
        movq    mm7, mm2                        ; mm7 = B
        punpcklbw mm2, mm0                      ; mm2: low 4 pixels, R and B fields
        pxor    mm4, mm4                        ; mm4 = 0
        movq    mm3, mm1                        ; mm3 = G
        punpckhbw mm7, mm0                      ; mm7: high 4 pixels, R and B fields
        punpcklbw mm1, mm4                      ; mm1: low 4 G as words
        punpckhbw mm3, mm4                      ; mm3: high 4 G as words
        psllw   mm1, [esp+GLeftShift]           ; move low G into place
        por     mm2, mm1                        ; mm2: low 4 finished pixels
        psllw   mm3, [esp+GLeftShift]           ; move high G into place
        por     mm7, mm3                        ; mm7: high 4 finished pixels
        movq    [edi+eax], mm2                  ; store pixels 0..3 of the odd line
        movq    [edi+eax+8], mm7                ; store pixels 4..7 of the odd line
        add     edi, 16                         ; 8 pixels x 2 bytes written per line
        add     ebx, 4                          ; 4 chroma samples (8 luma pixels) consumed
        jl      near do_next_8x2_block          ; until the index reaches 0

        ;---------------------------------------------------------------- next line pair
        add     edi, [esp+CCOSkipDistance]      ; to the start of the next output line
        add     edi, [esp+tmpCCOPitch]          ; and past the odd line, if it was kept
        mov     eax, [esp+YPitch]
        mov     ebp, [esp+tmpYCursorOdd]
        add     ebp, eax                        ; one luma line down: next even line
        mov     [esp+tmpYCursorEven], ebp
        add     ebp, eax                        ; one more: next odd line
        mov     [esp+tmpYCursorOdd], ebp
        add     esi, [esp+ChromaPitch]          ; next V line
        add     edx, [esp+ChromaPitch]          ; next U line
        ; FrameHeight is a full 32-bit argument, so count it down as a dword.
        sub     dword [esp+FrameHeight], 2
        ja      near PrepareChromaLine

;------------------------------------------------------------------------------
finish:
        emms                                    ; leave MMX state so x87 code can run again
        add     esp, LocalFrameSize
        pop     ebx
        pop     ebp
        pop     edi
        pop     esi
        ret