/**************************************************************************
 *                                                                        *
 *  idct.c, inverse fast Discrete Cosine Transform
 *
 *		Re-Written and Optimized by James Leiterman  May, 2001.
 *
 * This code has been developed by John Funnell. This software is an      *
 * implementation of a part of one or more MPEG-4 Video tools as          *
 * specified in ISO/IEC 14496-2 standard.  Those intending to use this    *
 * software module in hardware or software products are advised that its  *
 * use may infringe existing patents or copyrights, and any such use      *
 * would be at such party's own risk.  The original developer of this     *
 * software module and his/her company, and subsequent editors and their  *
 * companies (including Project Mayo), will have no liability for use of  *
 * this software or modifications or derivatives thereof.                 *
 *                                                                        *
 * Project Mayo gives users of the Codec a license to this software       *
 * module or modifications thereof for use in hardware or software        *
 * products claiming conformance to the MPEG-4 Video Standard as          *
 * described in the Open DivX license.                                    *
 *                                                                        *
 * The complete Open DivX license can be found at                         *
 * http://www.projectmayo.com/opendivx/license.php                        *
 *                                                                        *
 **************************************************************************/
/**
*  Copyright (C) 2001 - Project Mayo
 *
 * John Funnell 
 * Andrea Graziani
 *
 * DivX Advanced Research Center <darc@projectmayo.com>
*
*	Apr 2001 - J.Leiterman - Converted C to PowerPC Assembly.
**/
// yuv2rgb.c //


#include <inttypes.h>

#include "yuv2rgb.h"
#include "yuv2rgb_altivec.h"
#include "yuv2rgb_ppc.h"


	//	Comment out to turn off the Assembly and use the C code!
#define USE_PPC_ASM

	// Extra Lossy U  Factor of 1.962758002  so round to 2!   Actually 1.969237695   because   n/2+n/128
	// Extra Lossy V Factor of 5.155686762 so round to 5!     Actually 5.0
#define USE_PPC_LOSSY

	//	Comment out to use Assembly Multiplication instead of table lookup!  (Saves about 16 clockcycles per pixel pair)
	//	but costs 3K of memory!
	//	Note: USE_PPC_LOSSY must be on!
#define USE_PPC_MULTBL


extern int gHasAltivec;				// Has an AltiVec flag (set by Framework)



//
//	Function Pointers
//

yuv2rgbProcPtr YV12toRGB32;	
yuv2rgbProcPtr YV12toRGB24;	
yuv2rgbProcPtr YV12toRGB555;	
yuv2rgbProcPtr YV12toRGB565;
yuv2rgbProcPtr YV12toYUY2;
yuv2rgbProcPtr YV12toYV12;
yuv2rgbProcPtr YV12toUYVY;



#ifdef USE_PPC_MULTBL
	//	Mutiplication Lookup table		(FIXME: Convert to Static Const instead of Dynamic!)
static int UTblMult[256];
static int VTblMult[256];
static int YTblMult[256];
#endif



/***

  /  2568      0   3343  \              
 |   2568  -0c92  -1a1e   | / 65536 * 8 
  \  2568   40cf      0  /              

    Y -= 16;
    U -= 128;
    V -= 128;

    R = (0x2568*Y + 0x0000*V + 0x3343*U) / 0x2000;
    G = (0x2568*Y - 0x0c92*V - 0x1a1e*U) / 0x2000;
    B = (0x2568*Y + 0x40cf*V + 0x0000*U) / 0x2000;

    R = R>255 ? 255 : R;
    R = R<0   ?   0 : R;

    G = G>255 ? 255 : G;
    G = G<0   ?   0 : G;

    B = B>255 ? 255 : B;
    B = B<0   ?   0 : B;

***/

#define VAL_Ru			0x3343		// Ru=13123v
#define VAL_Gv			0x0c92		// Gv=3218u
#define VAL_RGBy		0x2568		// RGBy = 9576y
#define VAL_Gu			0x1a1e
#define VAL_Bv			0x40cf


#define _S(a)		(a)>255 ? 255 : (a)<0 ? 0 : (a)

#define _R(y,u,v) (0x2568*(y)  			       + 0x3343*(u)) /0x2000
#define _G(y,u,v) (0x2568*(y) - 0x0c92*(v) - 0x1a1e*(u)) /0x2000
#define _B(y,u,v) (0x2568*(y) + 0x40cf*(v))					     /0x2000

/* all stride values are in _bytes_ */



//
//	Convert YUV to 32bit RGB (PowerPC)
//
// (Packed unsigned char YUV data)

void ppc_YV12toRGB32(
				uint8_t *puc_y,				// r3
				int stride_y,				// r4 Stride for puc_y
                uint8_t *puc_u,				// r5
				uint8_t *puc_v,				// r6
				int stride_uv, 				// r7 Stride for puc_u and puc_v
                uint8_t *pImg,				// r8
				int width,					// r9
				int height, 				// r10
                register unsigned int stride)		// r?? Stride of Frame Buffer
{
		// OPTIMIZATION=1+  (r3...r10) ON STACK  (r11...r31) available!
#ifdef USE_PPC_ASM
unsigned int register _pU, _pV, _r18,_w,_flip, _r31;
unsigned int register _y0, _u0, _v0, _r0, _g0, _b0;
unsigned int register _y1, _u1, _v1, _r1, _g1, _b1;

#ifdef USE_PPC_MULTBL
int register * _pYTbl;
int register * _pUTbl;
int register * _pVTbl;

_pUTbl = UTblMult;
_pVTbl = VTblMult;
_pYTbl = YTblMult;
#endif

	// Compiler not assigning r11 to stride (too many arguments)!!!
__asm {
		slwi	_r31,r9,2		// (width *4)
		sub		r4,r4,r9		// stride_y -= width
		li		_r18,0xFFFF		// = 0xFFFFFFFF
		li		_flip,0			// = 0x00000000 (Flip/Flop) Mask 0=Even Scan -1=Odd Scan
		sub		r19,r19,_r31	// stride -= (width * 4)  // stride adjustment
		sub		stride,stride,_r31

high0:
		mr		_pU,r5			// U Table (odd/even 8 bit)
		mr		_pV,r6			// V Table (odd/even 8 bit)
		mr		_w,r9			// w=width

wide0:		// Note: y0 & y1 share  u0 & v0, but we process pixel pairs!
		lbz		_v0,0(_pV)		// v=puc_v[x>>1]-128
		lbz		_u0,0(_pU)		// u=puc_u[x>>1]-128
		lbz		_y0,0(r3) 		// y=puc_y[x] -16
		lbz		_y1,1(r3) 		// y=puc_y[x+1]-16

	// YUV to RGB Math
	//	R=(9576y + 13123u) / 8192           _R(y,u,v) (0x2568*(y)  	       + 0x3343*(u)) /0x2000
	//	G=(9576y - 3218v - 6686u) / 8192    _G(y,u,v) (0x2568*(y) - 0x0c92*(v) - 0x1a1e*(u)) /0x2000
	//	B=(9576y + 16591v) / 8192           _B(y,u,v) (0x2568*(y) + 0x40cf*(v)) / 0x2000		


#ifdef USE_PPC_LOSSY	// Extra Lossy U  Factor of 1.962758002  so round to 2!
		// 		Actually 1.969237695   because   n/2+n/128
		// Extra Lossy V Factor of 5.155686762 so round to 5!
		// 		Actually 5.0

#ifdef USE_PPC_MULTBL
			//	Use Multiplication YUV Lookup tables
		slwi	_v0,_v0,2			// x4 (int table index)
		slwi	_u0,_u0,2
		slwi	_y0,_y0,2
		slwi	_y1,_y1,2

		lwzx	_g0,_pVTbl,_v0		// lookup int
		lwzx	_r0,_pUTbl,_u0
		lwzx	_y0,_pYTbl,_y0
		lwzx	_y1,_pYTbl,_y1

		slwi	_v0,_g0,2			//x4	12872v=(3218v*4)
		srawi	_u0,_r0,1			// 13123u / 2        {2.0}
		srawi	_r31,_r0,7			// 13123u / 128      {1.969237695}

#else
		subi	_v0,_v0,128
		subi	_u0,_u0,128
		subi	_y0,_y0,16
		subi	_y1,_y1,16

		mulli	_g0,_v0,0x0c92	// (Gv)   3218v
		mulli	_r0,_u0,0x3343	// (Ru)   13123u

		slwi	_v0,_g0,2			//x4	12872v=(3218v*4)
		srawi	_u0,_r0,1			// 13123u / 2        {2.0}
		mulli	_y0,_y0,0x2568	// (RGBy) 9576y
		srawi	_r31,_r0,7		// 13123u / 128      {1.969237695}
		mulli	_y1,_y1,0x2568
#endif
		add		_u0,_u0,_r31

		add		_b0,_v0,_g0		//x5	16090v=(3218v*5)
		sub		_g1,_y1,_g0
		sub		_g0,_y0,_g0

		add		_r1,_r0,_y1
		add		_r0,_r0,_y0
		sub		_g1,_g1,_u0
		sub		_g0,_g0,_u0
		add		_b1,_b0,_y1
		add		_b0,_b0,_y0
#else
		subi	_v0,_v0,128
		subi	_u0,_u0,128
		subi	_y0,_y0,16
		subi	_y1,_y1,16

		mulli	_g0,_v0,0x0c92	// (Gv)   3218v
		mulli	_y1,_y1,0x2568
		mulli	_y0,_y0,0x2568	// (RGBy) 9576y
		sub		_g1,_y1,_g0
		sub		_g0,_y0,_g0

		mulli	_r0,_u0,0x3343	// (Ru)   13123u
		mulli	_b0,_v0,0x40cf	// (Bv)   16591v      {5.155686762}
		mulli	_u0,_u0,0x1a1e	// (Gu)   6686u       {1.962758002}

		add		_r1,_r0,_y1
		add		_r0,_r0,_y0
		sub		_g1,_g1,_u0
		sub		_g0,_g0,_u0
		add		_b1,_b0,_y1
		add		_b0,_b0,_y0
#endif

		srawi	_r1,_r1,13
		srawi	_r0,_r0,13		// r/=8192
		srawi	_g1,_g1,13
		srawi	_g0,_g0,13		// g/=8192
		srawi	_b1,_b1,13
		srawi	_b0,_b0,13		// b/=8192

			// Pack signed 32bit into unsigned 8bit

		srawi	_y1,_r1,31
		srawi	_y0,_r0,31		// (+)=00000000  (-)=FFFFFFFF  (OR) {Shift sign bit into all bits!}
		srawi	_u1,_g1,31
		srawi	_u0,_g0,31		// (+)=00000000  (-)=FFFFFFFF  (OR) {Shift sign bit into all bits!}
		srawi	_v1,_b1,31
		srawi	_v0,_b0,31		// (+)=00000000  (-)=FFFFFFFF  (OR) {Shift sign bit into all bits!}

		xor		_y1,_y1,_r18
		xor		_y0,_y0,_r18		// AND MASK   (+)=FFFFFFFF  (-)=00000000 {Neg #'s become 0}
		xor		_u1,_u1,_r18
		xor		_u0,_u0,_r18		// AND MASK   (+)=FFFFFFFF  (-)=00000000 {Neg #'s become 0}
		xor		_v1,_v1,_r18
		xor		_v0,_v0,_r18		// AND MASK   (+)=FFFFFFFF  (-)=00000000 {Neg #'s become 0}

		and		_r1,_r1,_y1
		and		_r0,_r0,_y0		// (-)'s become 0, else 1...7FFF
		and		_g1,_g1,_u1
		and		_g0,_g0,_u0		// (-)'s become 0, else 1...7FFF
		and		_b1,_b1,_v1
		and		_b0,_b0,_v0		// (-)'s become 0, else 1...7FFF

		addi	_y1,_r1,0xFF00
		addi	_y0,_r0,0xFF00	// -=256 is += 0xFFFFFF00		
		addi	_u1,_g1,0xFF00
		addi	_u0,_g0,0xFF00	// -=256 is += 0xFFFFFF00		
		addi	_v1,_b1,0xFF00
		addi	_v0,_b0,0xFF00	// -=256 is += 0xFFFFFF00		

		srawi	_y1,_y1,16
		srawi	_y0,_y0,16		// (0...FF)=FFFFFFFF else 00000000
		srawi	_u1,_u1,16
		srawi	_u0,_u0,16		// (0...FF)=FFFFFFFF else 00000000
		srawi	_v1,_v1,16
		srawi	_v0,_v0,16		// (0...FF)=FFFFFFFF else 00000000

		xor		_y1,_y1,_r18
		xor		_y0,_y0,_r18		// OR MASK
		xor		_u1,_u1,_r18
		xor		_u0,_u0,_r18		// OR MASK
		xor		_v1,_v1,_r18
		xor		_v0,_v0,_r18		// OR MASK

		subi	_w,_w,2				// w--

		or		_r1,_r1,_y1
		or		_r0,_r0,_y0		// Use OR Mask (>255) mask=FF, otherwise mask=00
		or		_g1,_g1,_u1
		or		_g0,_g0,_u0		// Use OR Mask (>255) mask=FF, otherwise mask=00
		or		_b1,_b1,_v1
		or		_b0,_b0,_v0		// Use OR Mask (>255) mask=FF, otherwise mask=00

		// Output RGB  [0]=0 [1]=b [2]=g [3]=r  (Big Endian)

		cmpwi	_w,0
	
		rlwimi	_r1,_g1,8,16,23
		rlwimi	_r0,_g0,8,16,23	// = [....ggrr]		

		addi	r3,r3,2   		//_pY,_pY,1				// Y-tbl++
		addi	_pU,_pU,1

		rlwimi	_r1,_b1,16,8,15
		rlwimi	_r0,_b0,16,8,15	// = [..bbggrr]		

		addi	_pV,_pV,1
		addi	r8,r8,8

		stw		_r1,4-8(r8)
		stw		_r0,0-8(r8)			// Write Pixel

		bne		wide0				// _w==0?  *** Loop for all pixels in scanline ***

			// ----------------------
			// We're going to play flip/flop games with the UV stride

		subi	r10,r10,1			// h--
		and		_r31,_flip,r7		// even:(0)  odd:(stride)		
		xor		_flip,_flip,_r18	// Flip/Flop
		add		r3,r3,r4			// puc_y += (delta) stride     Next scanline!

		cmpwi	r10,0

				// Only advance on completion of an (odd) scanline!

		add		r5,r5,_r31			// U Table next scan pair
		add		r6,r6,_r31			// V Table next scan pair
		add		r8,r8,stride		// pImg += stride (adjustment)

		bne		high0				// r10==0?  *** Loop for all scanlines in image ***
	}

#else
	int x, y;

	stride -= (width<<2);			// Change to a stride adjustment for image frame buffer

	for (y=0; y<height; y++) 
	{
		for (x=0; x<width; x++)
		{
			signed int _r,_g,_b; 
			signed int r, g, b;
			signed int y, u, v;

			y = puc_y[x] - 16;
			u = puc_u[x>>1] - 128;
			v = puc_v[x>>1] - 128;

//#define _R(y,u,v) (0x2568*(y)  	       + 0x3343*(u)) /0x2000
//#define _G(y,u,v) (0x2568*(y) - 0x0c92*(v) - 0x1a1e*(u)) /0x2000
//#define _B(y,u,v) (0x2568*(y) + 0x40cf*(v))					     /0x2000
			_r = _R(y,u,v);
			_g = _G(y,u,v);
			_b = _B(y,u,v);

//#define _S(a)		(a)>255 ? 255 : (a)<0 ? 0 : (a)
			r = _S(_r);
			g = _S(_g);
			b = _S(_b);

			pImg[0] = 0;	//r;
			pImg[1] = b;	//g;
			pImg[2] = g;	//b;
			pImg[3] = r;	//0;

			pImg += 4;
		}

		puc_y += stride_y;

		if (y&1)		// 0/1  (even/odd) Scanlines share table!
		{
			puc_u   += stride_uv;
			puc_v   += stride_uv;
		}

		pImg += stride;		// Up two scanlines
	}
#endif
}



//
//	Convert YUV to 24bit RGB (PowerPC)
//

void ppc_YV12toRGB24(uint8_t *puc_y, int stride_y, 
                uint8_t *puc_u, uint8_t *puc_v, int stride_uv, 
                uint8_t *puc_out, int width_y, int height_y,
                unsigned int stride_out ) 
{

	int x, y;

	puc_out = puc_out + width_y * (height_y-1) * 3;

	for (y=0; y<height_y; y++) 
	{
		for (x=0; x<width_y; x++)
		{
			signed int _r,_g,_b; 
			signed int r, g, b;
			signed int y, u, v;

			y = puc_y[x] - 16;
			u = puc_u[x>>1] - 128;
			v = puc_v[x>>1] - 128;

			_r = _R(y,u,v);
			_g = _G(y,u,v);
			_b = _B(y,u,v);

			r = _S(_r);
			g = _S(_g);
			b = _S(_b);

			puc_out[0] = r;
			puc_out[1] = g;
			puc_out[2] = b;

			puc_out+=3;
		}

		puc_y   += stride_y;
		if (y%2) {
			puc_u   += stride_uv;
			puc_v   += stride_uv;
		}
		puc_out -= (width_y*6);
	}
}


//
//	Convert YUV to 15bit (5:5:5) RGB (PowerPC)
//

/* this routine needs more testing  */

#define _mR	0x7c00
#define _mG 0x03e0
#define _mB 0x001f

#define _Ps(r,g,b) (((r) << 7) & _mR) | (((g) << 2) & _mG) | (((b) >> 3) & _mB)

void ppc_YV12toRGB555(uint8_t *puc_y, int stride_y, 
                uint8_t *puc_u, uint8_t *puc_v, int stride_uv, 
                uint8_t *puc_out, int width_y, int height_y,
                unsigned int stride_out ) 
{

	int x, y;
	unsigned short *pus_out;

	puc_out = puc_out + width_y * (height_y-1) * 2;
	pus_out = (unsigned short *) puc_out;

	for (y=0; y<height_y; y++) 
	{
		for (x=0; x<width_y; x++)
		{
			signed int _r,_g,_b; 
			signed int r, g, b;
			signed int y, u, v;

			y = puc_y[x] - 16;
			u = puc_u[x>>1] - 128;
			v = puc_v[x>>1] - 128;

			_r = _R(y,u,v);
			_g = _G(y,u,v);
			_b = _B(y,u,v);

			r = _S(_r);
			g = _S(_g);
			b = _S(_b);

			pus_out[0] = _Ps(r,g,b);

			pus_out++;
		}

		puc_y   += stride_y;
		if (y%2) {
			puc_u   += stride_uv;
			puc_v   += stride_uv;
		}
		pus_out -= (width_y*2);
	}
}



//
//	Convert YUV to 16bit (5:6:5) RGB (PowerPC)
//
	
void ppc_YV12toRGB565(uint8_t *puc_y, int stride_y, 
  uint8_t *puc_u, uint8_t *puc_v, int stride_uv, 
  uint8_t *puc_out, int width_y, int height_y, unsigned int stride_out)
  {}

void ppc_YV12toYUY2(uint8_t *puc_y, int stride_y, 
  uint8_t *puc_u, uint8_t *puc_v, int stride_uv, 
  uint8_t *puc_out, int width_y, int height_y, unsigned int stride_out)
  {}

void ppc_YV12toYV12(uint8_t *puc_y, int stride_y, 
  uint8_t *puc_u, uint8_t *puc_v, int stride_uv, 
  uint8_t *puc_out, int width_y, int height_y, unsigned int stride_out)
  {}
   
void ppc_YV12toUYVY(uint8_t *puc_y, int stride_y, 
  uint8_t *puc_u, uint8_t *puc_v, int stride_uv, 
  uint8_t *puc_out, int width_y, int height_y, unsigned int stride_out)
  {}



//
//	Initialize yuv2rgb Function Pointers
//

void yuv2rgb_init( void )
{
#ifdef USE_PPC_MULTBL
	int i, uv, y;

		//	Build Multiplication Lookup table
	for ( i = 0; i < 256; i++ )
	  {
		uv = i - 128;
		y = i - 16;

		VTblMult[i] = VAL_Gv * uv;
		UTblMult[i] = VAL_Ru * uv;
		YTblMult[i] = VAL_RGBy * y;
	  }
#endif

		//	Setup Function pointers

	if ( gHasAltivec )
	  {
		YV12toRGB32 = avec_YV12toRGB32;
		YV12toRGB24 = avec_YV12toRGB24;
		YV12toRGB555 = avec_YV12toRGB555;
		YV12toRGB565 = avec_YV12toRGB565;
		YV12toYUY2 = avec_YV12toYUY2;
		YV12toYV12 = avec_YV12toYV12;
		YV12toUYVY = avec_YV12toUYVY;
	  }
	else
	  {
		YV12toRGB32 = ppc_YV12toRGB32;
		YV12toRGB24 = ppc_YV12toRGB24;
		YV12toRGB555 = ppc_YV12toRGB555;
		YV12toRGB565 = ppc_YV12toRGB565;
		YV12toYUY2 = ppc_YV12toYUY2;
		YV12toYV12 = ppc_YV12toYV12;
		YV12toUYVY = ppc_YV12toUYVY;
	  }
}
