/*
 Copyright (C) 2002 Andreas Thiede ( a.thiede@berlin.de )

 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

#include <memory.h>

#include <inttypes.h>
//#include "portab.h"
#include "yuv2rgb.h"
#include "yuv2rgb_altivec.h"
#include "yuv2rgb_ppc.h"


//	JIM - Comment out to turn off the Assembly and use the C code!
#define USE_PPC_ASM

//	JIM - Comment out to reduce use some optimization and register usage, so debugging can occur!
#define USE_EXTRA_OPTIMIZE



//
//	NOTE: If a bad data alignment, then function will be routed to the PowerPC specific code!
//



//
//	Convert YUV to 32bit RGB (PowerPC-AltiVec)
//

void avec_YV12toRGB32(uint8_t *puc_y, int stride_y, 
                uint8_t *puc_u, uint8_t *puc_v, int stride_uv, 
                uint8_t *puc_out, int width_y, int height_y,
								unsigned int _stride_out)
{
#ifndef USE_PPC_ASM
	ppc_YV12toRGB32(puc_y, stride_y, puc_u, puc_v, stride_uv, puc_out, width_y, height_y, _stride_out);
#else

	int x, y;

		//	If improperly aligned, fall back to PowerPC coding
//if ( (int)puc_out & 15 )
//{
//	ppc_YV12toRGB32(puc_y, stride_y, puc_u, puc_v, stride_uv, puc_out, width_y, height_y, _stride_out);
//	return;
//}

	if (width_y & 15)
	{
		int w = width_y & ~15;
		ppc_YV12toRGB32(puc_y + w, stride_y, puc_u + (w >> 1), puc_v + (w >> 1), stride_uv, puc_out + (w << 2), width_y - w, height_y, _stride_out);
	}
	
	width_y &= ~15;

	{
		register vector signed short rpy = (vector signed short) ((short) (16384. / 256.));
		register vector signed short rpv = (vector signed short) ((short) (16384. * 1.4020 / 256.));
		register vector signed short gpu = (vector signed short) ((short) (16384. * -0.3441 / 256.));
		register vector signed short gpv = (vector signed short) ((short) (16384. * -0.7139 / 256.));
		register vector signed short bpu = (vector signed short) ((short) (16384. * 1.7718 / 256.));

		register vector signed short rc = (vector signed short) ((short) (16384. * -1.4020 * .5 - 16384. / 16.));
		register vector signed short gc = (vector signed short) ((short) (16384. * 0.3441 * .5 + 16384. * 0.7139 * .5 - 16384. / 16.));
		register vector signed short bc = (vector signed short) ((short) (16384. * -1.7718 * .5 - 16384. / 16.));
		register vector unsigned char zero = (vector unsigned char) (0);
		register vector unsigned short shft = (vector unsigned short) (6);

		register vector unsigned char permh = (vector unsigned char) (0, 1, 17, 3, 0, 5, 19, 7, 0, 9, 21, 11, 0, 13, 23, 15);
		register vector unsigned char perml = (vector unsigned char) (0, 1, 25, 3, 0, 5, 27, 7, 0, 9, 29, 11, 0, 13, 31, 15);
		register vector signed short v255 = (vector signed short) (255);

		register int i16 = 16;
#ifdef USE_EXTRA_OPTIMIZE
		register int i32 = 32;
		register int i48 = 48;
#endif

		for (y = 0; y < height_y; ++y)
		{
			register unsigned char *pvid = (unsigned char *) puc_out;
			vector unsigned char vyperm, vuvperm;
			vector unsigned char vyl, vul, vvl;

			vyperm = vec_lvsl (0, puc_y);
			vuvperm = vec_lvsl (0, puc_u);

			vyl = vec_lvx (0, puc_y);
			vul = vec_lvx (0, puc_u);
			vvl = vec_lvx (0, puc_v);

			for (x = 0; x + 15 < width_y; x += 16)
			{
				register vector unsigned char vy, vu, vv;
				register vector signed short cy0, cy1;
				register vector signed short cu0, cu1;
				register vector signed short cv0, cv1;
				register vector signed short qr0, qr1;
				register vector signed short qg0, qg1;
				register vector signed short qb0, qb1;
				register vector unsigned char temp0;
				register vector unsigned char temp1;
				register vector unsigned char temp2;
				register vector unsigned char temp3;

				temp0 = vec_lvx (x + 16, puc_y);
				temp1 = vec_lvx (((unsigned int) x >> 1) & ~15, puc_u + 16);
				temp2 = vec_lvx (((unsigned int) x >> 1) & ~15, puc_v + 16);

				vy = vec_vperm (vyl, temp0, vyperm);
				vu = vec_vperm (vul, temp1, vuvperm);
				vv = vec_vperm (vvl, temp2, vuvperm);

				vyl = temp0;

				if ((x & 31) == 0)
				{
					vu = vec_vmrghb (vu, vu);
					vv = vec_vmrghb (vv, vv);
				}
				else
				{
					vu = vec_vmrglb (vu, vu);
					vv = vec_vmrglb (vv, vv);

					vul = temp1;
					vvl = temp2;
				}

				__asm__("vmrghb cv0, zero, vv");
				__asm__("vmrghb cu0, zero, vu");
				__asm__("vmrghb cy0, zero, vy");
				__asm__("vmrglb cv1, zero, vv");
				__asm__("vmrglb cu1, zero, vu");
				__asm__("vmrglb cy1, zero, vy");

                                __asm__("vmladduhm qg0, gpv, cv0, gc");
                                __asm__("vmladduhm qr0, rpv, cv0, rc");
                                __asm__("vmladduhm qb0, bpu, cu0, bc");
                                __asm__("vmladduhm qg0, gpu, cu0, qg0");

                                __asm__("vmladduhm qg1, gpv, cv1, gc");
                                __asm__("vmladduhm qr1, rpv, cv1, rc");
                                __asm__("vmladduhm qb1, bpu, cu1, bc");
                                __asm__("vmladduhm qg1, gpu, cu1, qg1");

                                __asm__("vmladduhm qr0, rpy, cy0, qr0");
                                __asm__("vmladduhm qb0, rpy, cy0, qb0");
                                __asm__("vmladduhm qg0, rpy, cy0, qg0");

                                __asm__("vmladduhm qr1, rpy, cy1, qr1");
                                __asm__("vmladduhm qb1, rpy, cy1, qb1");
				__asm__("vmladduhm qg1, rpy, cy1, qg1");

                                __asm__("vmaxsh qr0, qr0, zero");
                                __asm__("vmaxsh qg0, qg0, zero");
                                __asm__("vmaxsh qb0, qb0, zero");

                                __asm__("vmaxsh qr1, qr1, zero");
                                __asm__("vmaxsh qg1, qg1, zero");
                                __asm__("vmaxsh qb1, qb1, zero");

                                __asm__("vsrh qr0, qr0, shft");
                                __asm__("vsrh qg0, qg0, shft");
                                __asm__("vsrh qb0, qb0, shft");

                                __asm__("vsrh qr1, qr1, shft");
                                __asm__("vsrh qg1, qg1, shft");
                                __asm__("vsrh qb1, qb1, shft");

                                __asm__("vminsh qr0, qr0, v255");
                                __asm__("vminsh qg0, qg0, v255");
                                __asm__("vminsh qb0, qb0, v255");

                                __asm__("vminsh qr1, qr1, v255");
                                __asm__("vminsh qg1, qg1, v255");
                                __asm__("vminsh qb1, qb1, v255");

                                __asm__("vmrghh temp0, qr0, qb0");
                                __asm__("vmrglh temp1, qr0, qb0");
                                __asm__("vmrghh temp2, qr1, qb1");
                                __asm__("vmrglh temp3, qr1, qb1");

#ifdef USE_EXTRA_OPTIMIZE		// Extra Optimization
                                __asm__("vperm temp0, temp0, qg0, permh");
                                __asm__("vperm temp1, temp1, qg0, perml");
                                __asm__("vperm temp2, temp2, qg1, permh");
                                __asm__("vperm temp3, temp3, qg1, perml");

                                __asm__("stvx temp0, 0, pvid");
                                __asm__("stvx temp1, i16, pvid");
                                __asm__("stvx temp2, i32, pvid");
                                __asm__("stvx temp3, i48, pvid");
#else							// (Reduce Optimization for Debugging)
                                __asm__("vperm temp0, temp0, qg0, permh");
                                __asm__("vperm temp1, temp1, qg0, perml");
                                __asm__("vperm temp2, temp2, qg1, permh");
                                __asm__("vperm temp3, temp3, qg1, perml");

                                __asm__("stvx temp0, 0, pvid");
                                __asm__("li i16,16");
                                __asm__("stvx temp1, i16, pvid");
                                __asm__("li i16,32");
                                __asm__("stvx temp2, i16, pvid");
                                __asm__("li i16,48");
                                __asm__("stvx temp3, i16, pvid");
#endif
				pvid += 64;
			}
			puc_y += stride_y;
			if (y & 1)
			{
				puc_u += stride_uv;
				puc_v += stride_uv;
			}
			puc_out += _stride_out;
		}
	}
#endif
}



//
//	Convert YUV to 15bit (5:5:5) RGB (PowerPC-AltiVec)
//

void avec_YV12toRGB555(uint8_t *puc_y, int stride_y, 
                uint8_t *puc_u, uint8_t *puc_v, int stride_uv, 
                uint8_t *puc_out, int width_y, int height_y,
								unsigned int _stride_out)
{
#ifndef USE_PPC_ASM
	// FIXME - Error : out of registers for local variables!  cv0, qb0
		// TURN OFF OPTIMIZATION and prune 2 registers from use!
		// also an error about illegal name overloading!

	ppc_YV12toRGB555(puc_y, stride_y, puc_u, puc_v, stride_uv, puc_out, width_y, height_y, _stride_out);
#else
	int x, y;

	if (width_y & 15)
	{
		int w = width_y & ~15;
		ppc_YV12toRGB555(puc_y + w, stride_y, puc_u + (w >> 1), puc_v + (w >> 1), stride_uv, puc_out + (w << 1), width_y - w, height_y, _stride_out);
	}
	width_y &= ~15;

	{
		register vector signed short rpy = (vector signed short) ((short) (16384. / 256.));
		register vector signed short rpv = (vector signed short) ((short) (16384. * 1.4020 / 256.));
		register vector signed short gpu = (vector signed short) ((short) (16384. * -0.3441 / 256.));
		register vector signed short gpv = (vector signed short) ((short) (16384. * -0.7139 / 256.));
		register vector signed short bpu = (vector signed short) ((short) (16384. * 1.7718 / 256.));

		register vector signed short rc = (vector signed short) ((short) (16384. * -1.4020 * .5 - 16384. / 16.));
		register vector signed short gc = (vector signed short) ((short) (16384. * 0.3441 * .5 + 16384. * 0.7139 * .5 - 16384. / 16.));
		register vector signed short bc = (vector signed short) ((short) (16384. * -1.7718 * .5 - 16384. / 16.));
		register vector unsigned char zero = (vector unsigned char) (0);
		register vector unsigned short shft = (vector unsigned short) (6);

		register vector unsigned char permh = (vector unsigned char) (0, 1, 17, 3, 0, 5, 19, 7, 0, 9, 21, 11, 0, 13, 23, 15);
		register vector unsigned char perml = (vector unsigned char) (0, 1, 25, 3, 0, 5, 27, 7, 0, 9, 29, 11, 0, 13, 31, 15);
		register vector signed short v255 = (vector signed short) (255);

		register int i16 = 16;

		for (y = 0; y < height_y; ++y)
		{
			register unsigned char *pvid = (unsigned char *) puc_out;
			vector unsigned char vyperm, vuvperm;
			vector unsigned char vyl, vul, vvl;

			vyperm = vec_lvsl (0, puc_y);
			vuvperm = vec_lvsl (0, puc_u);

			vyl = vec_lvx (0, puc_y);
			vul = vec_lvx (0, puc_u);
			vvl = vec_lvx (0, puc_v);

			for (x = 0; x + 15 < width_y; x += 16)
			{
				register vector unsigned char vy, vu, vv;
				register vector signed short cy0, cy1;
				register vector signed short cu0, cu1;
				register vector signed short cv0, cv1;
				register vector signed short qr0, qr1;
				register vector signed short qg0, qg1;
				register vector signed short qb0, qb1;
				register vector unsigned char temp0;
				register vector unsigned char temp1;
				register vector unsigned char temp2;
				register vector unsigned char temp3;

				temp0 = vec_lvx (x + 16, puc_y);
				temp1 = vec_lvx (((unsigned int) x >> 1) & ~15, puc_u + 16);
				temp2 = vec_lvx (((unsigned int) x >> 1) & ~15, puc_v + 16);

				vy = vec_vperm (vyl, temp0, vyperm);
				vu = vec_vperm (vul, temp1, vuvperm);
				vv = vec_vperm (vvl, temp2, vuvperm);

				vyl = temp0;

				if ((x & 31) == 0)
				{
					vu = vec_vmrghb (vu, vu);
					vv = vec_vmrghb (vv, vv);
				}
				else
				{
					vu = vec_vmrglb (vu, vu);
					vv = vec_vmrglb (vv, vv);

					vul = temp1;
					vvl = temp2;
				}

				__asm__("vmrghb cv0, zero, vv");
                                __asm__("vmrghb cu0, zero, vu");
                                __asm__("vmrghb cy0, zero, vy");
                                __asm__("vmrglb cv1, zero, vv");
                                __asm__("vmrglb cu1, zero, vu");
                                __asm__("vmrglb cy1, zero, vy");

                                __asm__("vmladduhm qg0, gpv, cv0, gc");
                                __asm__("vmladduhm qr0, rpv, cv0, rc");
                                __asm__("vmladduhm qb0, bpu, cu0, bc");
                                __asm__("vmladduhm qg0, gpu, cu0, qg0");

                                __asm__("vmladduhm qg1, gpv, cv1, gc");
                                __asm__("vmladduhm qr1, rpv, cv1, rc");
                                __asm__("vmladduhm qb1, bpu, cu1, bc");
                                __asm__("vmladduhm qg1, gpu, cu1, qg1");

                                __asm__("vmladduhm qr0, rpy, cy0, qr0");
                                __asm__("vmladduhm qb0, rpy, cy0, qb0");
                                __asm__("vmladduhm qg0, rpy, cy0, qg0");

                                __asm__("vmladduhm qr1, rpy, cy1, qr1");
                                __asm__("vmladduhm qb1, rpy, cy1, qb1");
                                __asm__("vmladduhm qg1, rpy, cy1, qg1");

                                __asm__("vmaxsh qr0, qr0, zero");
                                __asm__("vmaxsh qg0, qg0, zero");
                                __asm__("vmaxsh qb0, qb0, zero");

                                __asm__("vmaxsh qr1, qr1, zero");
                                __asm__("vmaxsh qg1, qg1, zero");
                                __asm__("vmaxsh qb1, qb1, zero");

                                __asm__("vsrh qr0, qr0, shft");
                                __asm__("vsrh qg0, qg0, shft");
                                __asm__("vsrh qb0, qb0, shft");

                                __asm__("vsrh qr1, qr1, shft");
                                __asm__("vsrh qg1, qg1, shft");
                                __asm__("vsrh qb1, qb1, shft");

                                __asm__("vminsh qr0, qr0, v255");
                                __asm__("vminsh qg0, qg0, v255");
                                __asm__("vminsh qb0, qb0, v255");

                                __asm__("vminsh qr1, qr1, v255");
                                __asm__("vminsh qg1, qg1, v255");
                                __asm__("vminsh qb1, qb1, v255");

                                __asm__("vmrghh temp0, qr0, qb0");
                                __asm__("vmrglh temp1, qr0, qb0");
                                __asm__("vmrghh temp2, qr1, qb1");
                                __asm__("vmrglh temp3, qr1, qb1");

                                __asm__("vperm temp0, temp0, qg0, permh");
                                __asm__("vperm temp1, temp1, qg0, perml");
                                __asm__("vperm temp2, temp2, qg1, permh");
                                __asm__("vperm temp3, temp3, qg1, perml");

                                __asm__("vpkpx temp0, temp0, temp1");
                                __asm__("vpkpx temp2, temp2, temp3");
                                __asm__("stvx temp0, 0, pvid");
                                __asm__("stvx temp2, i16, pvid");

				pvid += 32;
			}
			puc_y += stride_y;
			if (y & 1)
			{
				puc_u += stride_uv;
				puc_v += stride_uv;
			}
			puc_out += _stride_out;
		}
	}
#endif
}



//
//	Convert YUV to 16bit (5:6:5) RGB (PowerPC)
//
	
void avec_YV12toRGB24(uint8_t *puc_y, int stride_y, 
  uint8_t *puc_u, uint8_t *puc_v, int stride_uv, 
  uint8_t *puc_out, int width_y, int height_y, unsigned int stride_out)
{
	ppc_YV12toRGB24(puc_y, stride_y, puc_u, puc_v, stride_uv, puc_out, width_y, height_y, stride_out);
}

void avec_YV12toRGB565(uint8_t *puc_y, int stride_y, 
  uint8_t *puc_u, uint8_t *puc_v, int stride_uv, 
  uint8_t *puc_out, int width_y, int height_y, unsigned int stride_out)
{
	ppc_YV12toRGB565(puc_y, stride_y, puc_u, puc_v, stride_uv, puc_out, width_y, height_y, stride_out);
}

void avec_YV12toYUY2(uint8_t *puc_y, int stride_y, 
  uint8_t *puc_u, uint8_t *puc_v, int stride_uv, 
  uint8_t *puc_out, int width_y, int height_y, unsigned int stride_out)
{
	ppc_YV12toYUY2(puc_y, stride_y, puc_u, puc_v, stride_uv, puc_out, width_y, height_y, stride_out);
}

void avec_YV12toYV12(uint8_t *puc_y, int stride_y, 
  uint8_t *puc_u, uint8_t *puc_v, int stride_uv, 
  uint8_t *puc_out, int width_y, int height_y, unsigned int stride_out)
{
	ppc_YV12toYV12(puc_y, stride_y, puc_u, puc_v, stride_uv, puc_out, width_y, height_y, stride_out);
}
   
void avec_YV12toUYVY(uint8_t *puc_y, int stride_y, 
  uint8_t *puc_u, uint8_t *puc_v, int stride_uv, 
  uint8_t *puc_out, int width_y, int height_y, unsigned int stride_out)
{
	ppc_YV12toUYVY(puc_y, stride_y, puc_u, puc_v, stride_uv, puc_out, width_y, height_y, stride_out);
}
