/*
 * This file is part of the Advance project.
 *
 * Copyright (C) 1999-2002 Andrea Mazzoleni
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * In addition, as a special exception, Andrea Mazzoleni
 * gives permission to link the code of this program with
 * the MAME library (or with modified versions of MAME that use the
 * same license as MAME), and distribute linked combinations including
 * the two.  You must obey the GNU General Public License in all
 * respects for all of the code used other than MAME.  If you modify
 * this file, you may extend this exception to your version of the
 * file, but you are not obligated to do so.  If you do not wish to
 * do so, delete this exception statement from your version.
 */

/*
 * Alternatively at the previous license terms, you are allowed to use this
 * code in your program with these conditions:
 * - the program is not used in commercial activities.
 * - the whole source code of the program is released with the binary.
 */

/*
 * Short integer aliases used throughout this file.
 * The 64-bit pair is spelled with __int64 when _MSC_VER is defined and
 * with long long otherwise.
 */
#ifdef _MSC_VER
typedef unsigned __int64 u64;
typedef signed __int64 s64;
#else
typedef unsigned long long u64;
typedef signed long long s64;
#endif

typedef unsigned char u8;
typedef signed char s8;

typedef unsigned short u16;
typedef signed short s16;

typedef unsigned int u32;
typedef signed int s32;


/*
 * Produce ONE destination row of a 2x horizontal expansion from three source
 * rows of 16-bit pixels, using MMX instructions in an inline __asm block
 * (Intel operand order: destination first).
 *
 * dst   - output row; every run stores 8 pixels (two qwords) for the 4 source
 *         pixels it reads.
 * src0  - row used as the "upper" neighbour (loaded into mm6).
 * src1  - current row (loaded into mm7).
 * src2  - row used as the "lower" neighbour (memory operand [ecx]).
 * count - number of source pixels. One first run and one final run of 4 pixels
 *         are always executed, plus (count - 8) / 4 central runs, so count is
 *         expected to be >= 8: for a smaller value the unsigned subtraction
 *         below wraps around. Pixels beyond the last whole group of 4 are not
 *         processed.
 *
 * For each 16-bit lane every run computes
 *   left_out  = (left  == upper && left  != lower && left  != right) ? upper : current
 *   right_out = (right == upper && right != lower && right != left ) ? upper : current
 * and stores left_out / right_out interleaved, i.e. two output pixels per
 * source pixel.
 *
 * The left neighbour of the very first pixel and the right neighbour of the
 * very last pixel are taken as 0 ("fake black").
 *
 * Lines that start with ';' inside the __asm block are assembler comments,
 * i.e. disabled instructions. In particular emms is NOT executed here, so
 * the MMX state is still in use when the function returns.
 * NOTE(review): presumably a caller is expected to issue emms - confirm.
 */
static void internal_scale2x_16_mmx_single(u16* dst, const u16* src0, const u16* src1, const u16* src2, unsigned count)
{
	/* always do the first and last run */
	/* (4 pixels each; what is left in count is the pixel total of the central runs) */
	count -= 2 * 4;

	__asm {
		/* register roles: eax = upper row, ebx = current row, ecx = lower row, */
		/* edx = destination, esi = remaining pixel count */
		mov			eax, src0;
		mov			ebx, src1;
		mov			ecx, src2;
		mov			edx, dst;
		mov			esi, count;

		/* first run */
		/* set the current, current_pre, current_next registers */
		/* mm7 = current 4 pixels, mm0 = their left neighbours, mm1 = their right neighbours */
		pxor		mm0, mm0; /* use a fake black out of screen */
		movq		mm7, qword ptr [ebx];
		movq		mm1, qword ptr [ebx + 8];
		;psrlq		mm0, 48;
		psllq		mm1, 48;
		movq		mm2, mm7;
		movq		mm3, mm7;
		psllq		mm2, 16;
		psrlq		mm3, 16;
		por			mm0, mm2;
		por			mm1, mm3;

		/* current_upper */
		movq		mm6, qword ptr [eax];

		/* compute the left output pixel of each pair on mm2 */
		/* compute the right output pixel of each pair on mm4 */
		/* mm3/mm5 accumulate the per-lane selection masks; pandn clears the lanes */
		/* where the neighbour equals the lower row or the opposite neighbour */
		movq		mm2, mm0;
		movq		mm4, mm1;
		movq		mm3, mm0;
		movq		mm5, mm1;
		pcmpeqw		mm2, mm6;
		pcmpeqw		mm4, mm6;
		pcmpeqw		mm3, qword ptr [ecx];
		pcmpeqw		mm5, qword ptr [ecx];
		pandn		mm3, mm2;
		pandn		mm5, mm4;
		movq		mm2, mm0;
		movq		mm4, mm1;
		pcmpeqw		mm2, mm1;
		pcmpeqw		mm4, mm0;
		pandn		mm2, mm3;
		pandn		mm4, mm5;
		/* blend: upper pixel where the mask is set, current pixel elsewhere */
		movq		mm3, mm2;
		movq		mm5, mm4;
		pand		mm2, mm6;
		pand		mm4, mm6;
		pandn		mm3, mm7;
		pandn		mm5, mm7;
		por			mm2, mm3;
		por			mm4, mm5;

		/* set *dst0 */
		/* interleave left/right results so each source pixel yields two adjacent output pixels */
		movq		mm3, mm2;
		punpcklwd	mm2, mm4;
		punpckhwd	mm3, mm4;
		movq		qword ptr [edx], mm2;
		movq		qword ptr [edx + 8], mm3;

		/* next */
		/* 8 source bytes (4 pixels) consumed, 16 destination bytes (8 pixels) produced */
		add			eax, 8;
		add			ebx, 8;
		add			ecx, 8;
		add			edx, 16;

		/* central runs */
		/* esi = number of 4-pixel central runs; skip the loop when there are none */
		shr			esi, 2;
		jz			label1;
		align 4;
label0:

		/* set the current, current_pre, current_next registers */
		/* the left neighbour of lane 0 is the last pixel of the previous group, */
		/* the right neighbour of lane 3 is the first pixel of the next group */
		movq		mm0, qword ptr [ebx - 8];
		movq		mm7, qword ptr [ebx];
		movq		mm1, qword ptr [ebx + 8];
		psrlq		mm0, 48;
		psllq		mm1, 48;
		movq		mm2, mm7;
		movq		mm3, mm7;
		psllq		mm2, 16;
		psrlq		mm3, 16;
		por			mm0, mm2;
		por			mm1, mm3;

		/* current_upper */
		movq		mm6, qword ptr [eax];

		/* compute the left output pixel of each pair on mm2 */
		/* compute the right output pixel of each pair on mm4 */
		movq		mm2, mm0;
		movq		mm4, mm1;
		movq		mm3, mm0;
		movq		mm5, mm1;
		pcmpeqw		mm2, mm6;
		pcmpeqw		mm4, mm6;
		pcmpeqw		mm3, qword ptr [ecx];
		pcmpeqw		mm5, qword ptr [ecx];
		pandn		mm3, mm2;
		pandn		mm5, mm4;
		movq		mm2, mm0;
		movq		mm4, mm1;
		pcmpeqw		mm2, mm1;
		pcmpeqw		mm4, mm0;
		pandn		mm2, mm3;
		pandn		mm4, mm5;
		movq		mm3, mm2;
		movq		mm5, mm4;
		pand		mm2, mm6;
		pand		mm4, mm6;
		pandn		mm3, mm7;
		pandn		mm5, mm7;
		por			mm2, mm3;
		por			mm4, mm5;

		/* set *dst0 */
		movq		mm3, mm2;
		punpcklwd	mm2, mm4;
		punpckhwd	mm3, mm4;
		movq		qword ptr [edx], mm2;
		movq		qword ptr [edx + 8], mm3;

		/* next */
		add			eax, 8;
		add			ebx, 8;
		add			ecx, 8;
		add			edx, 16;

		dec			esi;
		jnz			label0;
label1:

		/* final run */
		/* set the current, current_pre, current_next registers */
		/* nothing is read past the current group: the right neighbour of lane 3 is zero */
		movq		mm0, qword ptr [ebx - 8];
		movq		mm7, qword ptr [ebx];
		pxor		mm1, mm1; /* use a fake black out of screen */
		psrlq		mm0, 48;
		;psllq		mm1, 48;
		movq		mm2, mm7;
		movq		mm3, mm7;
		psllq		mm2, 16;
		psrlq		mm3, 16;
		por			mm0, mm2;
		por			mm1, mm3;

		/* current_upper */
		movq		mm6, qword ptr [eax];

		/* compute the left output pixel of each pair on mm2 */
		/* compute the right output pixel of each pair on mm4 */
		movq		mm2, mm0;
		movq		mm4, mm1;
		movq		mm3, mm0;
		movq		mm5, mm1;
		pcmpeqw		mm2, mm6;
		pcmpeqw		mm4, mm6;
		pcmpeqw		mm3, qword ptr [ecx];
		pcmpeqw		mm5, qword ptr [ecx];
		pandn		mm3, mm2;
		pandn		mm5, mm4;
		movq		mm2, mm0;
		movq		mm4, mm1;
		pcmpeqw		mm2, mm1;
		pcmpeqw		mm4, mm0;
		pandn		mm2, mm3;
		pandn		mm4, mm5;
		movq		mm3, mm2;
		movq		mm5, mm4;
		pand		mm2, mm6;
		pand		mm4, mm6;
		pandn		mm3, mm7;
		pandn		mm5, mm7;
		por			mm2, mm3;
		por			mm4, mm5;

		/* set *dst0 */
		movq		mm3, mm2;
		punpcklwd	mm2, mm4;
		punpckhwd	mm3, mm4;
		movq		qword ptr [edx], mm2;
		movq		qword ptr [edx + 8], mm3;

		/* disabled: the advanced pointers are not written back to the C parameters */
		;mov			src0, eax;
		;mov			src1, ebx;
		;mov			src2, ecx;
		;mov			dst, edx;
		;mov			count, esi;

		/* disabled: the MMX state is left as it is on exit */
		;emms;
  }
}

/*
 * Expand one source row into the two destination rows of a 2x scale.
 *
 * dst0, dst1       - the two output rows.
 * src0, src1, src2 - the three source rows handed to the single-row kernel;
 *                    src1 is passed as the middle row in both calls.
 * count            - number of source pixels; the single-row kernel always
 *                    performs a first and a last run of 4 pixels, so callers
 *                    are expected to pass at least 2 * 4 (not checked here).
 */
static void internal_scale2x_16_mmx(u16* dst0, u16* dst1, const u16* src0, const u16* src1, const u16* src2, unsigned count)
{
	const u16* const outerA = src0;
	const u16* const outerB = src2;

	/* first output row: outer rows in the order given */
	internal_scale2x_16_mmx_single(dst0, outerA, src1, outerB, count);

	/* second output row: same kernel with the two outer rows exchanged */
	internal_scale2x_16_mmx_single(dst1, outerB, src1, outerA, count);
}

/*
 * Scale a 16 bits-per-pixel image by a factor of 2.
 *
 * srcPtr   - first source row.
 * srcPitch - distance in bytes between two source rows.
 * (unnamed third parameter) - not used.
 * dstPtr   - first destination row.
 * dstPitch - distance in bytes between two destination rows.
 * width    - source width in pixels. The MMX row kernel works on groups of
 *            4 pixels and always runs a first and a last group, so widths
 *            below 8 cannot be processed and are rejected; pixels beyond the
 *            last whole group of 4 are left untouched.
 * height   - source height in rows; values below 1 are rejected.
 *
 * The first row uses itself as its upper neighbour and the last row uses
 * itself as its lower neighbour.
 *
 * NOTE(review): the MMX state is not cleared here (emms stays disabled, as
 * in the original code); presumably the caller takes care of it - confirm.
 */
void AdMame2x(u8 *srcPtr, u32 srcPitch, u8 * /* deltaPtr */,
			  u8 *dstPtr, u32 dstPitch, int width, int height)
{
	/*
	 * Without these guards a width below 8 wraps the unsigned pixel counter
	 * of the row kernel, and a height below 2 makes the row counter skip
	 * past zero; both end up walking far outside the buffers.
	 */
	if (width < 2 * 4 || height < 1)
		return;

	const u32 srcStride = srcPitch / 2; /* one source row, in pixels */

	u16 *dst0 = (u16 *)dstPtr;
	u16 *dst1 = dst0 + (dstPitch / 2);
	u16 *src0 = (u16 *)srcPtr;

	if (height == 1) {
		/* a single row is its own upper and lower neighbour */
		internal_scale2x_16_mmx(dst0, dst1, src0, src0, src0, width);
		return;
	}

	u16 *src1 = src0 + srcStride;
	u16 *src2 = src1 + srcStride;

	/* first row: no row above, so it is used twice */
	internal_scale2x_16_mmx(dst0, dst1, src0, src0, src1, width);

	/* inner rows 1 .. height - 2 */
	for (int row = 1; row < height - 1; row++) {
		/* two destination rows = dstPitch 16-bit pixels */
		dst0 += dstPitch;
		dst1 += dstPitch;
		internal_scale2x_16_mmx(dst0, dst1, src0, src1, src2, width);
		src0 = src1;
		src1 = src2;
		src2 += srcStride;
	}

	/* last row: no row below, so it is used twice */
	dst0 += dstPitch;
	dst1 += dstPitch;
	internal_scale2x_16_mmx(dst0, dst1, src0, src1, src1, width);
	//__asm emms;
}

/**
 * Scale a 16 bits-per-pixel image by a factor of 3.
 *
 * For every processed source pixel a 3x3 block of destination pixels is
 * written, chosen from the pixel itself and its four direct neighbours
 * according to the equality tests in the loop body.
 *
 * \param srcPtr Pointer at the first source row. The row at
 * srcPtr + srcPitch is the first one that gets scaled; the row at srcPtr
 * is only read as its upper neighbour.
 * \param srcPitch Distance in bytes between two source rows.
 * \param dstPtr Pointer at the first destination row; the scaled version of
 * the row at srcPtr + srcPitch is written to the first three destination
 * rows.
 * \param dstPitch Distance in bytes between two destination rows.
 * \param width Width in pixels of the source rows. The first and the last
 * pixel of every row are only read as neighbours; their destination pixels
 * are not written.
 * \param height Number of source rows; height - 1 rows are scaled, and
 * source rows 0 .. height (inclusive) are read.
 *
 * The unnamed third parameter is not used.
 */
void AdMame3x(u8 *srcPtr, u32 srcPitch, u8 * /*deltaPtr*/ ,
			  u8 *dstPtr, u32 dstPitch, int width, int height){
	const int rows = height - 1; /* number of source rows that get scaled */
	const int cols = width - 2;  /* pixels per row that have both horizontal neighbours */

	u8* srcLine = srcPtr; /* row above the one currently being scaled */
	u8* dstLine = dstPtr; /* first of the three destination rows being written */

	for (int i = 0; i < rows; i++) {
		const u16* src0 = (const u16*) srcLine;
		const u16* src1 = (const u16*) (srcLine + srcPitch);
		const u16* src2 = (const u16*) (srcLine + (srcPitch*2));

		/* skip the 3 destination pixels that belong to the first source pixel */
		u16* dst0 = ((u16*) dstLine) + 3;
		u16* dst1 = ((u16*) (dstLine + dstPitch)) + 3;
		u16* dst2 = ((u16*) (dstLine + (dstPitch*2))) + 3;

		/* central pixels */
		for (int j = 0; j < cols; j++) {
			/* 3x3 neighbourhood: t = row above, m = current row, b = row below */
			const u16 tl = src0[0], tc = src0[1], tr = src0[2];
			const u16 ml = src1[0], mc = src1[1], mr = src1[2];
			const u16 bl = src2[0], bc = src2[1], br = src2[2];

			if (tc != bc && ml != mr) {
				dst0[0] = ml == tc ? ml : mc;
				dst0[1] = (ml == tc && mc != tr) || (mr == tc && mc != tl) ? tc : mc;
				dst0[2] = mr == tc ? mr : mc;

				dst1[0] = (ml == tc && mc != bl) || (ml == bc && mc != tl) ? ml : mc;
				dst1[1] = mc;
				dst1[2] = (mr == tc && mc != br) || (mr == bc && mc != tr) ? mr : mc;

				dst2[0] = ml == bc ? ml : mc;
				dst2[1] = (ml == bc && mc != br) || (mr == bc && mc != bl) ? bc : mc;
				dst2[2] = mr == bc ? mr : mc;
			} else {
				/* no usable edge around this pixel: plain 3x3 replication */
				dst0[0] = dst0[1] = dst0[2] = mc;
				dst1[0] = dst1[1] = dst1[2] = mc;
				dst2[0] = dst2[1] = dst2[2] = mc;
			}

			++src0;
			++src1;
			++src2;
			dst0 += 3;
			dst1 += 3;
			dst2 += 3;
		}

		/*
		 * Step by the real row pitch. The previous code moved the source
		 * pointers by exactly `width` pixels per row, which only matches
		 * the pitch used to set them up when srcPitch == width * 2.
		 */
		srcLine += srcPitch;
		dstLine += dstPitch * 3;
	}
}


			  //

/* Mask applied by interp_16_11() to the shifted XOR term; it is an alias of the run-time variable interp_highnot_mask. */
#define INTERP_16_HNMASK interp_highnot_mask
/* Thresholds compared by interp_16_diff() against its y, u and v difference terms (both signs). */
#define INTERP_Y_LIMIT (0x30 * 4)
#define INTERP_U_LIMIT (0x07 * 4)
#define INTERP_V_LIMIT (0x06 * 8)
/* NOTE(review): no initialiser or assignment is visible in this part of the file; presumably set up elsewhere before the interp_16_* helpers are used - confirm. */
unsigned interp_highnot_mask;
/* interp_green_mask == 0x7E0 selects the 5-6-5 branch of interp_16_diff(); the red and blue masks are not read by the code visible here. */
unsigned interp_red_mask, interp_green_mask, interp_blue_mask;

u16 interp_16_11(u16 p1, u16 p2)
{
	/*
	 * Average two packed rgb pixels, channel by channel.
	 * For a single value (a + b) / 2 == ((a ^ b) >> 1) + (a & b).
	 * For a series of packed channels the shifted term is additionally
	 * masked, (((v0 ^ v1) >> 1) & MASK) + (v0 & v1), where MASK clears the
	 * high bit of every packed channel so that the shift cannot carry a bit
	 * over from the neighbouring channel.
	 */
	const unsigned halfOfDifferingBits = ((p1 ^ p2) >> 1) & INTERP_16_HNMASK;
	const unsigned sharedBits = p1 & p2;

	return halfOfDifferingBits + sharedBits;
}

/* Blend with weights of about 2:1:1 (p1:p2:p3), built from two pairwise averages. */
u16 interp_16_211(u16 p1, u16 p2, u16 p3)
{
	const u16 others = interp_16_11(p2, p3); /* 1:1 of p2 and p3 */

	return interp_16_11(p1, others);
}

/* Blend with weights of about 3:1 (p1:p2), built from two pairwise averages. */
u16 interp_16_31(u16 p1, u16 p2)
{
	const u16 midpoint = interp_16_11(p1, p2); /* 1:1 of p1 and p2 */

	return interp_16_11(p1, midpoint);
}

/* Blend with weights of about 5:2:1 (p1:p2:p3), built from three pairwise averages. */
u16 interp_16_521(u16 p1, u16 p2, u16 p3)
{
	const u16 inner = interp_16_11(p1, p3);     /* 1:1 of p1 and p3 */
	const u16 middle = interp_16_11(p2, inner); /* half p2, half inner */

	return interp_16_11(p1, middle);
}

/* Blend with weights of about 4:3:1 (p1:p2:p3), built from three pairwise averages. */
u16 interp_16_431(u16 p1, u16 p2, u16 p3)
{
	const u16 inner = interp_16_11(p2, p3);     /* 1:1 of p2 and p3 */
	const u16 middle = interp_16_11(p2, inner); /* half p2, half inner */

	return interp_16_11(p1, middle);
}

/* Blend with weights of about 5:3 (p1:p2), built from three pairwise averages. */
u16 interp_16_53(u16 p1, u16 p2)
{
	const u16 inner = interp_16_11(p1, p2);     /* 1:1 of p1 and p2 */
	const u16 middle = interp_16_11(p2, inner); /* half p2, half inner */

	return interp_16_11(p1, middle);
}

/* Blend with weights of about 3:3:2 (p1:p2:p3), built from three pairwise averages. */
u16 interp_16_332(u16 p1, u16 p2, u16 p3)
{
	const u16 pairAverage = interp_16_11(p1, p2);          /* 1:1 of p1 and p2 */
	const u16 withThird = interp_16_11(p3, pairAverage);   /* half p3, half pairAverage */

	return interp_16_11(pairAverage, withThird);
}

/* Blend with weights of about 6:1:1 (p1:p2:p3), built from three pairwise averages. */
u16 interp_16_611(u16 p1, u16 p2, u16 p3)
{
	const u16 others = interp_16_11(p2, p3);     /* 1:1 of p2 and p3 */
	const u16 middle = interp_16_11(p1, others); /* half p1, half others */

	return interp_16_11(p1, middle);
}

/* Blend with weights of about 7:1 (p1:p2), built from three pairwise averages. */
u16 interp_16_71(u16 p1, u16 p2)
{
	const u16 inner = interp_16_11(p1, p2);     /* 1:1 of p1 and p2 */
	const u16 middle = interp_16_11(p1, inner); /* half p1, half inner */

	return interp_16_11(p1, middle);
}

/*
 * Tell whether two 16-bit rgb pixels are different enough to count as
 * distinct colours.
 *
 * Returns 0 when the pixels are equal or when all three difference terms
 * (y = r + g + b, u = r - b, v = -r + 2g - b) stay within
 * INTERP_Y_LIMIT, INTERP_U_LIMIT and INTERP_V_LIMIT; returns 1 otherwise.
 *
 * Standard rgb layouts are assumed: 5-6-5 when interp_green_mask is 0x7E0,
 * 5-5-5 in every other case.
 */
int interp_16_diff(u16 p1, u16 p2)
{
	int greenMask, redMask;
	int greenShift, redShift;
	int dr, dg, db;
	int dy, du, dv;

	if (p1 == p2)
		return 0;

	if (interp_green_mask == 0x7E0) {
		/* 5-6-5 */
		greenMask = 0x7E0;
		greenShift = 3;
		redMask = 0xF800;
		redShift = 8;
	} else {
		/* 5-5-5 */
		greenMask = 0x3E0;
		greenShift = 2;
		redMask = 0x7C00;
		redShift = 7;
	}

	/* per-channel differences, each brought to a comparable scale */
	db = (int)((p1 & 0x1F) - (p2 & 0x1F)) << 3;
	dg = (int)((p1 & greenMask) - (p2 & greenMask)) >> greenShift;
	dr = (int)((p1 & redMask) - (p2 & redMask)) >> redShift;

	dy = dr + dg + db;
	if (dy < -INTERP_Y_LIMIT || dy > INTERP_Y_LIMIT)
		return 1;

	du = dr - db;
	if (du < -INTERP_U_LIMIT || du > INTERP_U_LIMIT)
		return 1;

	dv = 2*dg - dr - db;
	if (dv < -INTERP_V_LIMIT || dv > INTERP_V_LIMIT)
		return 1;

	return 0;
}
//
/***************************************************************************/
/* HQ4x C implementation */

/*
 * This effect is a rewritten implementation of the hq4x effect made by Maxim Stepin
 */

/*
 * Process one row of 16-bit source pixels for the hq4x effect.
 *
 * dst0..dst3 - four destination rows; each advances by 4 pixels per source pixel.
 * src0       - source row that fills c[0], c[1], c[2].
 * src1       - source row that fills c[3], c[4], c[5]; c[4] is the pixel being scaled.
 * src2       - source row that fills c[6], c[7], c[8].
 * count      - number of source pixels in the row.
 *
 * The statements that actually write the destination pixels come from
 * "hq4x.dat", which is included inside the switch and is not visible here;
 * it is expected to use the P, MUR, MDR, MDL, MUL, I1, I2 and I3 macros
 * defined just before the switch.
 */
void hq4x_16_def(u16* dst0, u16* dst1, u16* dst2, u16* dst3, u16* src0, u16* src1, u16* src2, u16 count)
{
	unsigned i;

	for(i=0;i<count;++i) {
		unsigned char mask;

		/* 3x3 neighbourhood of the current pixel, row by row */
		u16 c[9];

		c[1] = src0[0];
		c[4] = src1[0];
		c[7] = src2[0];

		/* left column: at the first pixel the centre column is reused */
		if (i>0) {
			c[0] = src0[-1];
			c[3] = src1[-1];
			c[6] = src2[-1];
		} else {
			c[0] = c[1];
			c[3] = c[4];
			c[6] = c[7];
		}

		/* right column: at the last pixel the centre column is reused */
		if (i<count-1) {
			c[2] = src0[1];
			c[5] = src1[1];
			c[8] = src2[1];
		} else {
			c[2] = c[1];
			c[5] = c[4];
			c[8] = c[7];
		}

		/* one bit per neighbour, set when interp_16_diff() reports it as different from the centre c[4]; */
		/* bits 0..7 correspond to c[0], c[1], c[2], c[3], c[5], c[6], c[7], c[8] */
		mask = 0;

		if (interp_16_diff(c[0], c[4]))
			mask |= 1 << 0;
		if (interp_16_diff(c[1], c[4]))
			mask |= 1 << 1;
		if (interp_16_diff(c[2], c[4]))
			mask |= 1 << 2;
		if (interp_16_diff(c[3], c[4]))
			mask |= 1 << 3;
		if (interp_16_diff(c[5], c[4]))
			mask |= 1 << 4;
		if (interp_16_diff(c[6], c[4]))
			mask |= 1 << 5;
		if (interp_16_diff(c[7], c[4]))
			mask |= 1 << 6;
		if (interp_16_diff(c[8], c[4]))
			mask |= 1 << 7;

/* helpers for the included case table: */
/* P(a, b) is pixel a of destination row b; */
/* MUR/MDR/MDL/MUL compare the pairs c[1]-c[5], c[5]-c[7], c[7]-c[3], c[3]-c[1]; */
/* I1 picks a neighbourhood pixel, I2/I3 call the interp_16_<weights>() blend of two/three of them */
#define P(a, b) dst##b[a]
#define MUR interp_16_diff(c[1], c[5])
#define MDR interp_16_diff(c[5], c[7])
#define MDL interp_16_diff(c[7], c[3])
#define MUL interp_16_diff(c[3], c[1])
#define I1(p0) c[p0]
#define I2(i0, i1, p0, p1) interp_16_##i0##i1(c[p0], c[p1])
#define I3(i0, i1, i2, p0, p1, p2) interp_16_##i0##i1##i2(c[p0], c[p1], c[p2])

		/* NOTE(review): presumably one case per mask value, each writing the 4x4 output block - confirm against hq4x.dat */
		switch (mask) {
		#include "hq4x.dat"
		}

#undef P
#undef MUR
#undef MDR
#undef MDL
#undef MUL
#undef I1
#undef I2
#undef I3

		/* next source pixel, next group of 4 destination pixels */
		src0 += 1;
		src1 += 1;
		src2 += 1;
		dst0 += 4;
		dst1 += 4;
		dst2 += 4;
		dst3 += 4;
	}
}

/*
 * Scale a 16 bits-per-pixel image by a factor of 4 with hq4x_16_def().
 *
 * srcPtr   - first source row; the row at srcPtr + srcPitch is the first one
 *            passed as the middle row, the rows before and after it are
 *            passed as its neighbours.
 * srcPitch - distance in bytes between two source rows.
 * (unnamed third parameter) - not used.
 * dstPtr   - first destination row.
 * dstPitch - distance in bytes between two destination rows.
 * width    - number of pixels handed to hq4x_16_def() for every row.
 * height   - number of rows to process; each produces 4 destination rows.
 */
void HQ4x(u8 *srcPtr, u32 srcPitch, u8 * /* deltaPtr */,
			  u8 *dstPtr, u32 dstPitch, int width, int height){
	/* per-iteration steps, in 16-bit pixels */
	const u32 srcStep = srcPitch >> 1; /* one source row */
	const u32 dstStep = dstPitch << 1; /* four destination rows */

	u16* rowAbove = (u16*) srcPtr;
	u16* rowMiddle = (u16*) (srcPtr + srcPitch);
	u16* rowBelow = (u16*) (srcPtr + (srcPitch*2));

	u16* out0 = (u16*) dstPtr;
	u16* out1 = (u16*) (dstPtr + dstPitch);
	u16* out2 = (u16*) (dstPtr + (dstPitch*2));
	u16* out3 = (u16*) (dstPtr + (dstPitch*3));

	int rowsLeft = height;
	while (rowsLeft-- > 0) {
		hq4x_16_def(out0, out1, out2, out3, rowAbove, rowMiddle, rowBelow, width);

		out0 += dstStep;
		out1 += dstStep;
		out2 += dstStep;
		out3 += dstStep;

		rowAbove += srcStep;
		rowMiddle += srcStep;
		rowBelow += srcStep;
	}
}