//#include <assert.h>

// Short aliases for the basic integer types. The names suggest widths of
// 8/16/32/64 bits; nothing here enforces that, it simply follows from the
// underlying types on the compilers this file targets.

// 64-bit types: MSVC spells them __int64, every other compiler uses long long.
#ifdef _MSC_VER
typedef unsigned __int64 u64;
typedef signed __int64 s64;
#else
typedef unsigned long long u64;
typedef signed long long s64;
#endif

// Unsigned aliases.
typedef unsigned char u8;
typedef unsigned short u16;
typedef unsigned int u32;

// Signed aliases.
typedef signed char s8;
typedef signed short s16;
typedef signed int s32;


// Simple2x - pixel-doubling 2x upscaler (MMX, MSVC x86 inline assembly).
//
// Every 16-bit source pixel is replicated into a 2x2 block of the
// destination: each source row produces two identical destination rows,
// each twice as wide.
//
//   srcPtr    first byte of the first source row
//   srcPitch  distance in bytes between consecutive source rows
//   (3rd arg) unused
//   dstPtr    first byte of the first destination row
//   dstPitch  distance in bytes between consecutive destination rows
//   width     source row width in pixels; processed in groups of 4 pixels
//             (8 bytes), so a trailing (width & 3) pixels per row are NOT
//             written
//   height    number of source rows
//
// Does nothing when width < 4 or height <= 0.
// Clobbers eax, ecx, edx, esi, edi, mm0, mm1 and flags inside the asm block;
// the MMX state is cleared with emms before returning.
void Simple2x(u8 *srcPtr, u32 srcPitch, u8 * /* deltaPtr */,
			  u8 *dstPtr, u32 dstPitch, int width, int height)
{
//	assert((width & 0x3) == 0);

	// The loops below are dec/jnz loops that execute at least once; a zero
	// or negative count would wrap around and run ~2^32 iterations.
	if (width < 4 || height <= 0)
		return;

	// Number of 4-pixel (8-byte) groups handled per source row.
	u32 blocksPerRow = (u32)width >> 2;

	// Bytes to add after a row to reach the start of the next source row:
	// the inner loop has already advanced the pointer by blocksPerRow * 8.
	u32 srcRowSkip = srcPitch - blocksPerRow * 8;

	// Same for the destination, except that each source row fills TWO
	// destination rows, so the next pair starts 2 * dstPitch further on.
	// The inner loop has already advanced by blocksPerRow * 16 bytes.
	u32 dstRowSkip = dstPitch * 2 - blocksPerRow * 16;

	__asm {
		mov			esi, srcPtr;		// esi = source read cursor
		mov			edi, dstPtr;		// edi = upper destination row cursor
		mov			edx, edi;
		add			edx, dstPitch;		// edx = lower destination row cursor
		mov			ecx, height;		// ecx = source rows remaining (> 0)
		align 4;
nextRow:
		mov			eax, blocksPerRow;	// eax = 4-pixel groups remaining (> 0)
		align 4;
nextBlock:
		movq		mm0, qword ptr [esi];	// mm0 = p3 p2 p1 p0
		movq		mm1, mm0;
		punpcklwd	mm0, mm0;			// mm0 = p1 p1 p0 p0
		punpckhwd	mm1, mm1;			// mm1 = p3 p3 p2 p2
		movq		qword ptr [edi], mm0;
		movq		qword ptr [edx], mm0;
		movq		qword ptr [edi + 8], mm1;
		movq		qword ptr [edx + 8], mm1;

		add			esi, 8;
		add			edi, 16;
		add			edx, 16;
		dec			eax;
		jnz			nextBlock;

		add			esi, srcRowSkip;	// next source row
		add			edi, dstRowSkip;	// next pair of destination rows
		add			edx, dstRowSkip;

		dec			ecx;
		jnz			nextRow;

		emms;							// leave MMX state so x87 code works again
	}
}
