#include "stdafx.h"
#include "mem.h"

// Fills block_length bytes starting at address with the 64-bit pattern data.
//
// address      - start of the region to fill.
// data         - 8-byte fill pattern.
// block_length - number of bytes to fill.
//
// Whole 8-byte groups are stored as two 32-bit halves (low dword of data
// first); the remaining block_length % 8 bytes are written one at a time,
// taking pattern bytes from the most significant end of data.
// NOTE(review): that tail byte order is the reverse of how the dword stores
// lay the pattern out in memory - confirm which order callers expect.
void hle_fill(DWORD* address,UINT64 data,DWORD block_length)
{
	int dlength = block_length / 8;	// whole quadwords
	int blength = block_length % 8;	// leftover bytes
	// dec/jnz only tests the counter after the first pass, so a zero count
	// would wrap around; skip the loop when there is no whole quadword.
	if(dlength > 0) {
		__asm {
			mov eax,dword ptr [data]
			mov edx,dword ptr [data+4]
			mov ecx,[dlength]
			mov edi,[address]
inner_loop:
			mov [edi],eax
			mov [edi+4],edx
			add edi,8
			dec ecx
			jnz inner_loop
		}
	}
	// Offsets are in bytes, so step from a BYTE pointer (arithmetic on the
	// DWORD pointer would scale the offset by 4).
	BYTE* tail = (BYTE*)address + dlength*8;
	for(int i=0;i<blength;i++) {
		tail[i] = (BYTE)((data >> ((7-i)*8)) & 0xFF);
	}
}

// Combines src into dst over block_length bytes as dst = ~src & ~dst.
//
// dst          - destination buffer; read and overwritten in place.
// src          - source buffer; only read.
// block_length - number of bytes to process.
//
// Whole 8-byte groups go through the MMX loop; the remaining
// block_length % 8 bytes are handled one at a time.
void mmx_copy_1(DWORD* dst,DWORD* src,DWORD block_length)
{
	int dlength = block_length / 8;	// whole quadwords
	int blength = block_length % 8;	// leftover bytes
	UINT64 mask = 0xFFFFFFFFFFFFFFFF;	// all ones: pxor with it inverts every bit
	// dec/jnz only tests the counter after the first pass, so a zero count
	// would wrap around; skip the loop when there is no whole quadword.
	if(dlength > 0) {
		__asm {
			mov esi,[src]
			mov edi,[dst]
			mov ecx,[dlength]
			movq mm7,[mask]
inner_loop:
			movq mm0,[esi]	// load through the pointers, not the pointer variables
			pxor mm0,mm7	// not mm0
			movq mm1,[edi]
			pxor mm1,mm7	// not mm1
			pand mm0,mm1
			movq [edi],mm0
			add esi,8
			add edi,8
			dec ecx
			jnz inner_loop
			emms
		}
	}
	// Offsets are in bytes, so step from BYTE pointers (arithmetic on the
	// DWORD pointers would scale the offset by 4).
	BYTE* stail = (BYTE*)src + dlength*8;
	BYTE* dtail = (BYTE*)dst + dlength*8;
	for(int i=0;i<blength;i++) {
		// Bitwise ~ to match the MMX path; logical ! would only yield 0 or 1.
		dtail[i] = (BYTE)(~stail[i] & ~dtail[i]);
	}
}
// Combines src into dst over block_length bytes as dst = ~src & dst.
//
// dst          - destination buffer; read and overwritten in place.
// src          - source buffer; only read.
// block_length - number of bytes to process.
//
// Whole 8-byte groups go through the MMX loop; the remaining
// block_length % 8 bytes are handled one at a time.
void mmx_copy_2(DWORD* dst,DWORD* src,DWORD block_length)
{
	int dlength = block_length / 8;	// whole quadwords
	int blength = block_length % 8;	// leftover bytes
	UINT64 mask = 0xFFFFFFFFFFFFFFFF;	// all ones: pxor with it inverts every bit
	// dec/jnz only tests the counter after the first pass, so a zero count
	// would wrap around; skip the loop when there is no whole quadword.
	if(dlength > 0) {
		__asm {
			mov esi,[src]
			mov edi,[dst]
			mov ecx,[dlength]
			movq mm7,[mask]
inner_loop:
			movq mm0,[esi]
			pxor mm0,mm7	// not mm0
			movq mm1,[edi]
			pand mm0,mm1
			movq [edi],mm0
			add esi,8
			add edi,8
			dec ecx
			jnz inner_loop
			emms
		}
	}
	// Offsets are in bytes, so step from BYTE pointers (arithmetic on the
	// DWORD pointers would scale the offset by 4).
	BYTE* stail = (BYTE*)src + dlength*8;
	BYTE* dtail = (BYTE*)dst + dlength*8;
	for(int i=0;i<blength;i++) {
		// Bitwise ~ to match the MMX path; logical ! would only yield 0 or 1.
		dtail[i] = (BYTE)(~stail[i] & dtail[i]);
	}
}
// Writes the bitwise complement of src to dst over block_length bytes
// (dst = ~src).
//
// dst          - destination buffer; only written.
// src          - source buffer; only read.
// block_length - number of bytes to process.
//
// Whole 8-byte groups go through the MMX loop; the remaining
// block_length % 8 bytes are handled one at a time.
void mmx_copy_3(DWORD* dst,DWORD* src,DWORD block_length)
{
	int dlength = block_length / 8;	// whole quadwords
	int blength = block_length % 8;	// leftover bytes
	UINT64 mask = 0xFFFFFFFFFFFFFFFF;	// all ones: pxor with it inverts every bit
	// dec/jnz only tests the counter after the first pass, so a zero count
	// would wrap around; skip the loop when there is no whole quadword.
	if(dlength > 0) {
		__asm {
			mov esi,[src]
			mov edi,[dst]
			mov ecx,[dlength]
			movq mm7,[mask]
inner_loop:
			movq mm0,[esi]
			pxor mm0,mm7	// not mm0
			movq [edi],mm0
			add esi,8
			add edi,8
			dec ecx
			jnz inner_loop
			emms
		}
	}
	// Offsets are in bytes, so step from BYTE pointers (arithmetic on the
	// DWORD pointers would scale the offset by 4).
	BYTE* stail = (BYTE*)src + dlength*8;
	BYTE* dtail = (BYTE*)dst + dlength*8;
	for(int i=0;i<blength;i++) {
		// Bitwise ~ to match the MMX path; logical ! would only yield 0 or 1.
		dtail[i] = (BYTE)(~stail[i]);
	}
}
// Combines src into dst over block_length bytes as dst = src & ~dst.
//
// dst          - destination buffer; read and overwritten in place.
// src          - source buffer; only read.
// block_length - number of bytes to process.
//
// Whole 8-byte groups go through the MMX loop; the remaining
// block_length % 8 bytes are handled one at a time.
void mmx_copy_4(DWORD* dst,DWORD* src,DWORD block_length)
{
	int dlength = block_length / 8;	// whole quadwords
	int blength = block_length % 8;	// leftover bytes
	UINT64 mask = 0xFFFFFFFFFFFFFFFF;	// all ones: pxor with it inverts every bit
	// dec/jnz only tests the counter after the first pass, so a zero count
	// would wrap around; skip the loop when there is no whole quadword.
	if(dlength > 0) {
		__asm {
			mov esi,[src]
			mov edi,[dst]
			mov ecx,[dlength]
			movq mm7,[mask]
inner_loop:
			movq mm0,[esi]
			movq mm1,[edi]
			pxor mm1,mm7	// not mm1
			pand mm0,mm1
			movq [edi],mm0
			add esi,8
			add edi,8
			dec ecx
			jnz inner_loop
			emms
		}
	}
	// Offsets are in bytes, so step from BYTE pointers (arithmetic on the
	// DWORD pointers would scale the offset by 4).
	BYTE* stail = (BYTE*)src + dlength*8;
	BYTE* dtail = (BYTE*)dst + dlength*8;
	for(int i=0;i<blength;i++) {
		// Bitwise ~ to match the MMX path; logical ! would only yield 0 or 1.
		dtail[i] = (BYTE)(stail[i] & ~dtail[i]);
	}
}
// Inverts block_length bytes of dst in place (dst = ~dst).
//
// dst          - buffer to invert; read and overwritten in place.
// block_length - number of bytes to process.
//
// Whole 8-byte groups go through the MMX loop; the remaining
// block_length % 8 bytes are handled one at a time.
void mmx_copy_5(DWORD* dst,DWORD block_length)
{
	int dlength = block_length / 8;	// whole quadwords
	int blength = block_length % 8;	// leftover bytes
	UINT64 mask = 0xFFFFFFFFFFFFFFFF;	// all ones: pxor with it inverts every bit
	// dec/jnz only tests the counter after the first pass, so a zero count
	// would wrap around; skip the loop when there is no whole quadword.
	if(dlength > 0) {
		__asm {
			mov edi,[dst]
			mov ecx,[dlength]
			movq mm7,[mask]
inner_loop:
			movq mm0,[edi]
			pxor mm0,mm7	// not mm0
			movq [edi],mm0
			add edi,8
			dec ecx
			jnz inner_loop
			emms
		}
	}
	// Offsets are in bytes, so step from a BYTE pointer (arithmetic on the
	// DWORD pointer would scale the offset by 4).
	BYTE* dtail = (BYTE*)dst + dlength*8;
	for(int i=0;i<blength;i++) {
		// Bitwise ~ to match the MMX path; logical ! would only yield 0 or 1.
		dtail[i] = (BYTE)(~dtail[i]);
	}
}
// Combines src into dst over block_length bytes as dst = src ^ dst.
//
// dst          - destination buffer; read and overwritten in place.
// src          - source buffer; only read.
// block_length - number of bytes to process.
//
// Whole 8-byte groups go through the MMX loop; the remaining
// block_length % 8 bytes are handled one at a time.
// NOTE(review): this produces the same result as mmx_copy_9 - confirm that
// the two routines are meant to be identical.
void mmx_copy_6(DWORD* dst,DWORD* src,DWORD block_length)
{
	int dlength = block_length / 8;	// whole quadwords
	int blength = block_length % 8;	// leftover bytes
	// dec/jnz only tests the counter after the first pass, so a zero count
	// would wrap around; skip the loop when there is no whole quadword.
	if(dlength > 0) {
		__asm {
			mov esi,[src]
			mov edi,[dst]
			mov ecx,[dlength]
inner_loop:
			movq mm0,[esi]
			movq mm1,[edi]
			pxor mm0,mm1
			movq [edi],mm0
			add esi,8
			add edi,8
			dec ecx
			jnz inner_loop
			emms
		}
	}
	// Offsets are in bytes, so step from BYTE pointers (arithmetic on the
	// DWORD pointers would scale the offset by 4).
	BYTE* stail = (BYTE*)src + dlength*8;
	BYTE* dtail = (BYTE*)dst + dlength*8;
	for(int i=0;i<blength;i++) {
		dtail[i] = (BYTE)(stail[i] ^ dtail[i]);
	}
}
// Combines src into dst over block_length bytes as dst = ~src | ~dst.
//
// dst          - destination buffer; read and overwritten in place.
// src          - source buffer; only read.
// block_length - number of bytes to process.
//
// Whole 8-byte groups go through the MMX loop; the remaining
// block_length % 8 bytes are handled one at a time.
void mmx_copy_7(DWORD* dst,DWORD* src,DWORD block_length)
{
	int dlength = block_length / 8;	// whole quadwords
	int blength = block_length % 8;	// leftover bytes
	UINT64 mask = 0xFFFFFFFFFFFFFFFF;	// all ones: pxor with it inverts every bit
	// dec/jnz only tests the counter after the first pass, so a zero count
	// would wrap around; skip the loop when there is no whole quadword.
	if(dlength > 0) {
		__asm {
			mov esi,[src]
			mov edi,[dst]
			mov ecx,[dlength]
			movq mm7,[mask]
inner_loop:
			movq mm0,[esi]
			pxor mm0,mm7	// not mm0
			movq mm1,[edi]
			pxor mm1,mm7	// not mm1
			por mm0,mm1
			movq [edi],mm0
			add esi,8
			add edi,8
			dec ecx
			jnz inner_loop
			emms
		}
	}
	// Offsets are in bytes, so step from BYTE pointers (arithmetic on the
	// DWORD pointers would scale the offset by 4).
	BYTE* stail = (BYTE*)src + dlength*8;
	BYTE* dtail = (BYTE*)dst + dlength*8;
	for(int i=0;i<blength;i++) {
		// Bitwise ~ to match the MMX path; logical ! would only yield 0 or 1.
		dtail[i] = (BYTE)(~stail[i] | ~dtail[i]);
	}
}
// Combines src into dst over block_length bytes as dst = src & dst.
//
// dst          - destination buffer; read and overwritten in place.
// src          - source buffer; only read.
// block_length - number of bytes to process.
//
// Whole 8-byte groups go through the MMX loop; the remaining
// block_length % 8 bytes are handled one at a time.
void mmx_copy_8(DWORD* dst,DWORD* src,DWORD block_length)
{
	int dlength = block_length / 8;	// whole quadwords
	int blength = block_length % 8;	// leftover bytes
	// dec/jnz only tests the counter after the first pass, so a zero count
	// would wrap around; skip the loop when there is no whole quadword.
	if(dlength > 0) {
		__asm {
			mov esi,[src]
			mov edi,[dst]
			mov ecx,[dlength]
inner_loop:
			movq mm0,[esi]
			movq mm1,[edi]
			pand mm0,mm1
			movq [edi],mm0
			add esi,8
			add edi,8
			dec ecx
			jnz inner_loop
			emms
		}
	}
	// Offsets are in bytes, so step from BYTE pointers (arithmetic on the
	// DWORD pointers would scale the offset by 4).
	BYTE* stail = (BYTE*)src + dlength*8;
	BYTE* dtail = (BYTE*)dst + dlength*8;
	for(int i=0;i<blength;i++) {
		dtail[i] = (BYTE)(stail[i] & dtail[i]);
	}
}
// Combines src into dst over block_length bytes as dst = src ^ dst.
//
// dst          - destination buffer; read and overwritten in place.
// src          - source buffer; only read.
// block_length - number of bytes to process.
//
// Whole 8-byte groups go through the MMX loop; the remaining
// block_length % 8 bytes are handled one at a time.
// NOTE(review): this produces the same result as mmx_copy_6 - confirm that
// the two routines are meant to be identical (one of them may have been
// intended to invert the XOR result).
void mmx_copy_9(DWORD* dst,DWORD* src,DWORD block_length)
{
	int dlength = block_length / 8;	// whole quadwords
	int blength = block_length % 8;	// leftover bytes
	// dec/jnz only tests the counter after the first pass, so a zero count
	// would wrap around; skip the loop when there is no whole quadword.
	if(dlength > 0) {
		__asm {
			mov esi,[src]
			mov edi,[dst]
			mov ecx,[dlength]
inner_loop:
			movq mm0,[esi]
			movq mm1,[edi]
			pxor mm0,mm1
			movq [edi],mm0
			add esi,8
			add edi,8
			dec ecx
			jnz inner_loop
			emms
		}
	}
	// Offsets are in bytes, so step from BYTE pointers (arithmetic on the
	// DWORD pointers would scale the offset by 4).
	BYTE* stail = (BYTE*)src + dlength*8;
	BYTE* dtail = (BYTE*)dst + dlength*8;
	for(int i=0;i<blength;i++) {
		dtail[i] = (BYTE)(stail[i] ^ dtail[i]);
	}
}
// Combines src into dst over block_length bytes as dst = ~src | dst.
//
// dst          - destination buffer; read and overwritten in place.
// src          - source buffer; only read.
// block_length - number of bytes to process.
//
// Whole 8-byte groups go through the MMX loop; the remaining
// block_length % 8 bytes are handled one at a time.
void mmx_copy_11(DWORD* dst,DWORD* src,DWORD block_length)
{
	int dlength = block_length / 8;	// whole quadwords
	int blength = block_length % 8;	// leftover bytes
	UINT64 mask = 0xFFFFFFFFFFFFFFFF;	// all ones: pxor with it inverts every bit
	// dec/jnz only tests the counter after the first pass, so a zero count
	// would wrap around; skip the loop when there is no whole quadword.
	if(dlength > 0) {
		__asm {
			mov esi,[src]
			mov edi,[dst]
			mov ecx,[dlength]
			movq mm7,[mask]
inner_loop:
			movq mm0,[esi]
			pxor mm0,mm7	// not mm0
			movq mm1,[edi]
			por mm0,mm1
			movq [edi],mm0
			add esi,8
			add edi,8
			dec ecx
			jnz inner_loop
			emms
		}
	}
	// Offsets are in bytes, so step from BYTE pointers (arithmetic on the
	// DWORD pointers would scale the offset by 4).
	BYTE* stail = (BYTE*)src + dlength*8;
	BYTE* dtail = (BYTE*)dst + dlength*8;
	for(int i=0;i<blength;i++) {
		// Bitwise ~ to match the MMX path; logical ! would only yield 0 or 1.
		dtail[i] = (BYTE)(~stail[i] | dtail[i]);
	}
}
// Combines src into dst over block_length bytes as dst = src | ~dst.
//
// dst          - destination buffer; read and overwritten in place.
// src          - source buffer; only read.
// block_length - number of bytes to process.
//
// Whole 8-byte groups go through the MMX loop; the remaining
// block_length % 8 bytes are handled one at a time.
void mmx_copy_13(DWORD* dst,DWORD* src,DWORD block_length)
{
	int dlength = block_length / 8;	// whole quadwords
	int blength = block_length % 8;	// leftover bytes
	UINT64 mask = 0xFFFFFFFFFFFFFFFF;	// all ones: pxor with it inverts every bit
	// dec/jnz only tests the counter after the first pass, so a zero count
	// would wrap around; skip the loop when there is no whole quadword.
	if(dlength > 0) {
		__asm {
			mov esi,[src]
			mov edi,[dst]
			mov ecx,[dlength]
			movq mm7,[mask]
inner_loop:
			movq mm0,[esi]
			movq mm1,[edi]
			pxor mm1,mm7	// not mm1
			por mm0,mm1
			movq [edi],mm0
			add esi,8
			add edi,8
			dec ecx
			jnz inner_loop
			emms
		}
	}
	// Offsets are in bytes, so step from BYTE pointers (arithmetic on the
	// DWORD pointers would scale the offset by 4).
	BYTE* stail = (BYTE*)src + dlength*8;
	BYTE* dtail = (BYTE*)dst + dlength*8;
	for(int i=0;i<blength;i++) {
		// Bitwise ~ to match the MMX path; logical ! would only yield 0 or 1.
		dtail[i] = (BYTE)(stail[i] | ~dtail[i]);
	}
}
// Combines src into dst over block_length bytes as dst = src | dst.
//
// dst          - destination buffer; read and overwritten in place.
// src          - source buffer; only read.
// block_length - number of bytes to process.
//
// Whole 8-byte groups go through the MMX loop; the remaining
// block_length % 8 bytes are handled one at a time.
void mmx_copy_14(DWORD* dst,DWORD* src,DWORD block_length)
{
	int dlength = block_length / 8;	// whole quadwords
	int blength = block_length % 8;	// leftover bytes
	// dec/jnz only tests the counter after the first pass, so a zero count
	// would wrap around; skip the loop when there is no whole quadword.
	if(dlength > 0) {
		__asm {
			mov esi,[src]
			mov edi,[dst]
			mov ecx,[dlength]
inner_loop:
			movq mm0,[esi]
			movq mm1,[edi]
			por mm0,mm1
			movq [edi],mm0
			add esi,8
			add edi,8
			dec ecx
			jnz inner_loop
			emms
		}
	}
	// Offsets are in bytes, so step from BYTE pointers (arithmetic on the
	// DWORD pointers would scale the offset by 4).
	BYTE* stail = (BYTE*)src + dlength*8;
	BYTE* dtail = (BYTE*)dst + dlength*8;
	for(int i=0;i<blength;i++) {
		dtail[i] = (BYTE)(stail[i] | dtail[i]);
	}
}


// Writes dst = ~src & ~data over block_length bytes, where data is a
// repeating 8-byte pattern.
//
// dst          - destination buffer; only written.
// src          - source buffer; only read.
// data         - 8-byte pattern combined with every group of src.
// block_length - number of bytes to process.
//
// Whole 8-byte groups go through the MMX loop; the remaining
// block_length % 8 bytes are handled one at a time.
// NOTE(review): the tail takes pattern bytes from the most significant end
// of data ((7-i)*8), while movq stores the least significant byte first -
// confirm the intended order.
void mmx_copy_1_data(DWORD* dst,DWORD* src,UINT64 data,DWORD block_length)
{
	int dlength = block_length / 8;	// whole quadwords
	int blength = block_length % 8;	// leftover bytes
	UINT64 mask = 0xFFFFFFFFFFFFFFFF;	// all ones: pxor with it inverts every bit
	// dec/jnz only tests the counter after the first pass, so a zero count
	// would wrap around; skip the loop when there is no whole quadword.
	if(dlength > 0) {
		__asm {
			mov esi,[src]
			mov edi,[dst]
			mov ecx,[dlength]
			movq mm7,[mask]
			movq mm1,[data]
			pxor mm1,mm7	// not mm1, done once before the loop
inner_loop:
			movq mm0,[esi]	// load through the pointer, not the pointer variable
			pxor mm0,mm7	// not mm0
			pand mm0,mm1
			movq [edi],mm0
			add esi,8
			add edi,8
			dec ecx
			jnz inner_loop
			emms
		}
	}
	// Offsets are in bytes, so step from BYTE pointers (arithmetic on the
	// DWORD pointers would scale the offset by 4).
	BYTE* stail = (BYTE*)src + dlength*8;
	BYTE* dtail = (BYTE*)dst + dlength*8;
	for(int i=0;i<blength;i++) {
		BYTE d = (BYTE)(data >> ((7-i)*8));
		// Bitwise ~ to match the MMX path; logical ! would only yield 0 or 1.
		dtail[i] = (BYTE)(~stail[i] & ~d);
	}
}
// Writes dst = ~src & data over block_length bytes, where data is a
// repeating 8-byte pattern.
//
// dst          - destination buffer; only written.
// src          - source buffer; only read.
// data         - 8-byte pattern combined with every group of src.
// block_length - number of bytes to process.
//
// Whole 8-byte groups go through the MMX loop; the remaining
// block_length % 8 bytes are handled one at a time.
// NOTE(review): the tail takes pattern bytes from the most significant end
// of data ((7-i)*8), while movq stores the least significant byte first -
// confirm the intended order.
void mmx_copy_2_data(DWORD* dst,DWORD* src,UINT64 data,DWORD block_length)
{
	int dlength = block_length / 8;	// whole quadwords
	int blength = block_length % 8;	// leftover bytes
	UINT64 mask = 0xFFFFFFFFFFFFFFFF;	// all ones: pxor with it inverts every bit
	// dec/jnz only tests the counter after the first pass, so a zero count
	// would wrap around; skip the loop when there is no whole quadword.
	if(dlength > 0) {
		__asm {
			mov esi,[src]
			mov edi,[dst]
			mov ecx,[dlength]
			movq mm7,[mask]
			movq mm1,[data]
inner_loop:
			movq mm0,[esi]
			pxor mm0,mm7	// not mm0
			pand mm0,mm1
			movq [edi],mm0
			add esi,8
			add edi,8
			dec ecx
			jnz inner_loop
			emms
		}
	}
	// Offsets are in bytes, so step from BYTE pointers (arithmetic on the
	// DWORD pointers would scale the offset by 4).
	BYTE* stail = (BYTE*)src + dlength*8;
	BYTE* dtail = (BYTE*)dst + dlength*8;
	for(int i=0;i<blength;i++) {
		BYTE d = (BYTE)(data >> ((7-i)*8));
		// Bitwise ~ to match the MMX path; logical ! would only yield 0 or 1.
		dtail[i] = (BYTE)(~stail[i] & d);
	}
}
// Writes dst = src & ~data over block_length bytes, where data is a
// repeating 8-byte pattern.
//
// dst          - destination buffer; only written.
// src          - source buffer; only read.
// data         - 8-byte pattern combined with every group of src.
// block_length - number of bytes to process.
//
// Whole 8-byte groups go through the MMX loop; the remaining
// block_length % 8 bytes are handled one at a time.
// NOTE(review): the tail takes pattern bytes from the most significant end
// of data ((7-i)*8), while movq stores the least significant byte first -
// confirm the intended order.
void mmx_copy_4_data(DWORD* dst,DWORD* src,UINT64 data,DWORD block_length)
{
	int dlength = block_length / 8;	// whole quadwords
	int blength = block_length % 8;	// leftover bytes
	UINT64 mask = 0xFFFFFFFFFFFFFFFF;	// all ones: pxor with it inverts every bit
	// dec/jnz only tests the counter after the first pass, so a zero count
	// would wrap around; skip the loop when there is no whole quadword.
	if(dlength > 0) {
		__asm {
			mov esi,[src]
			mov edi,[dst]
			mov ecx,[dlength]
			movq mm7,[mask]
			movq mm1,[data]
			// Invert the pattern once, outside the loop: doing it per
			// iteration would flip it back on every second quadword.
			pxor mm1,mm7	// not mm1
inner_loop:
			movq mm0,[esi]
			pand mm0,mm1
			movq [edi],mm0
			add esi,8
			add edi,8
			dec ecx
			jnz inner_loop
			emms
		}
	}
	// Offsets are in bytes, so step from BYTE pointers (arithmetic on the
	// DWORD pointers would scale the offset by 4).
	BYTE* stail = (BYTE*)src + dlength*8;
	BYTE* dtail = (BYTE*)dst + dlength*8;
	for(int i=0;i<blength;i++) {
		BYTE d = (BYTE)(data >> ((7-i)*8));
		// Bitwise ~ to match the MMX path; logical ! would only yield 0 or 1.
		dtail[i] = (BYTE)(stail[i] & ~d);
	}
}
// Fills block_length bytes of dst with the bitwise complement of the
// repeating 8-byte pattern data (dst = ~data).
//
// dst          - destination buffer; only written.
// data         - 8-byte pattern to invert and store.
// block_length - number of bytes to fill.
//
// Whole 8-byte groups go through the MMX loop; the remaining
// block_length % 8 bytes are handled one at a time.
// NOTE(review): the tail takes pattern bytes from the most significant end
// of data ((7-i)*8), while movq stores the least significant byte first -
// confirm the intended order.
void mmx_copy_5_data(DWORD* dst,UINT64 data,DWORD block_length)
{
	int dlength = block_length / 8;	// whole quadwords
	int blength = block_length % 8;	// leftover bytes
	UINT64 mask = 0xFFFFFFFFFFFFFFFF;	// all ones: pxor with it inverts every bit
	// dec/jnz only tests the counter after the first pass, so a zero count
	// would wrap around; skip the loop when there is no whole quadword.
	if(dlength > 0) {
		__asm {
			mov edi,[dst]
			mov ecx,[dlength]
			movq mm7,[mask]
			movq mm0,[data]
			pxor mm0,mm7	// not mm0, done once before the loop
inner_loop:
			movq [edi],mm0
			add edi,8
			dec ecx
			jnz inner_loop
			emms
		}
	}
	// Offsets are in bytes, so step from a BYTE pointer (arithmetic on the
	// DWORD pointer would scale the offset by 4).
	BYTE* dtail = (BYTE*)dst + dlength*8;
	for(int i=0;i<blength;i++) {
		BYTE d = (BYTE)(data >> ((7-i)*8));
		// Bitwise ~ to match the MMX path; logical ! would only yield 0 or 1.
		dtail[i] = (BYTE)(~d);
	}
}
// Writes dst = ~(src ^ data) over block_length bytes, where data is a
// repeating 8-byte pattern.
//
// dst          - destination buffer; only written.
// src          - source buffer; only read.
// data         - 8-byte pattern combined with every group of src.
// block_length - number of bytes to process.
//
// Whole 8-byte groups go through the MMX loop; the remaining
// block_length % 8 bytes are handled one at a time.
// NOTE(review): mmx_copy_6 (the variant without a pattern) does not invert
// the XOR result - confirm whether the inversion here is intended.
// NOTE(review): the tail takes pattern bytes from the most significant end
// of data ((7-i)*8), while movq stores the least significant byte first -
// confirm the intended order.
void mmx_copy_6_data(DWORD* dst,DWORD* src,UINT64 data,DWORD block_length)
{
	int dlength = block_length / 8;	// whole quadwords
	int blength = block_length % 8;	// leftover bytes
	UINT64 mask = 0xFFFFFFFFFFFFFFFF;	// all ones: pxor with it inverts every bit
	// dec/jnz only tests the counter after the first pass, so a zero count
	// would wrap around; skip the loop when there is no whole quadword.
	if(dlength > 0) {
		__asm {
			mov esi,[src]
			mov edi,[dst]
			mov ecx,[dlength]
			movq mm7,[mask]
			movq mm1,[data]
inner_loop:
			movq mm0,[esi]
			pxor mm0,mm1
			pxor mm0,mm7	// not mm0
			movq [edi],mm0
			add esi,8
			add edi,8
			dec ecx
			jnz inner_loop
			emms
		}
	}
	// Offsets are in bytes, so step from BYTE pointers (arithmetic on the
	// DWORD pointers would scale the offset by 4).
	BYTE* stail = (BYTE*)src + dlength*8;
	BYTE* dtail = (BYTE*)dst + dlength*8;
	for(int i=0;i<blength;i++) {
		BYTE d = (BYTE)(data >> ((7-i)*8));
		// Bitwise ~ to match the MMX path; logical ! would only yield 0 or 1.
		dtail[i] = (BYTE)(~(stail[i] ^ d));
	}
}
// Writes dst = ~src | ~data over block_length bytes, where data is a
// repeating 8-byte pattern.
//
// dst          - destination buffer; only written.
// src          - source buffer; only read.
// data         - 8-byte pattern combined with every group of src.
// block_length - number of bytes to process.
//
// Whole 8-byte groups go through the MMX loop; the remaining
// block_length % 8 bytes are handled one at a time.
// NOTE(review): the tail takes pattern bytes from the most significant end
// of data ((7-i)*8), while movq stores the least significant byte first -
// confirm the intended order.
void mmx_copy_7_data(DWORD* dst,DWORD* src,UINT64 data,DWORD block_length)
{
	int dlength = block_length / 8;	// whole quadwords
	int blength = block_length % 8;	// leftover bytes
	UINT64 mask = 0xFFFFFFFFFFFFFFFF;	// all ones: pxor with it inverts every bit
	// dec/jnz only tests the counter after the first pass, so a zero count
	// would wrap around; skip the loop when there is no whole quadword.
	if(dlength > 0) {
		__asm {
			mov esi,[src]
			mov edi,[dst]
			mov ecx,[dlength]
			movq mm7,[mask]
			movq mm1,[data]
			pxor mm1,mm7	// not mm1, done once before the loop
inner_loop:
			movq mm0,[esi]
			pxor mm0,mm7	// not mm0
			por mm0,mm1
			movq [edi],mm0
			add esi,8
			add edi,8
			dec ecx
			jnz inner_loop
			emms
		}
	}
	// Offsets are in bytes, so step from BYTE pointers (arithmetic on the
	// DWORD pointers would scale the offset by 4).
	BYTE* stail = (BYTE*)src + dlength*8;
	BYTE* dtail = (BYTE*)dst + dlength*8;
	for(int i=0;i<blength;i++) {
		BYTE d = (BYTE)(data >> ((7-i)*8));
		// Bitwise ~ to match the MMX path; logical ! would only yield 0 or 1.
		dtail[i] = (BYTE)(~stail[i] | ~d);
	}
}
// Writes dst = src & data over block_length bytes, where data is a
// repeating 8-byte pattern.
//
// dst          - destination buffer; only written.
// src          - source buffer; only read.
// data         - 8-byte pattern combined with every group of src.
// block_length - number of bytes to process.
//
// Whole 8-byte groups go through the MMX loop; the remaining
// block_length % 8 bytes are handled one at a time.
// NOTE(review): the tail takes pattern bytes from the most significant end
// of data ((7-i)*8), while movq stores the least significant byte first -
// confirm the intended order.
void mmx_copy_8_data(DWORD* dst,DWORD* src,UINT64 data,DWORD block_length)
{
	int dlength = block_length / 8;	// whole quadwords
	int blength = block_length % 8;	// leftover bytes
	// dec/jnz only tests the counter after the first pass, so a zero count
	// would wrap around; skip the loop when there is no whole quadword.
	if(dlength > 0) {
		__asm {
			mov esi,[src]
			mov edi,[dst]
			mov ecx,[dlength]
			movq mm1,[data]
inner_loop:
			movq mm0,[esi]
			pand mm0,mm1
			movq [edi],mm0
			add esi,8
			add edi,8
			dec ecx
			jnz inner_loop
			emms
		}
	}
	// Offsets are in bytes, so step from BYTE pointers (arithmetic on the
	// DWORD pointers would scale the offset by 4).
	BYTE* stail = (BYTE*)src + dlength*8;
	BYTE* dtail = (BYTE*)dst + dlength*8;
	for(int i=0;i<blength;i++) {
		BYTE d = (BYTE)(data >> ((7-i)*8));
		dtail[i] = (BYTE)(stail[i] & d);
	}
}
// Writes dst = src ^ data over block_length bytes, where data is a
// repeating 8-byte pattern.
//
// dst          - destination buffer; only written.
// src          - source buffer; only read.
// data         - 8-byte pattern combined with every group of src.
// block_length - number of bytes to process.
//
// Whole 8-byte groups go through the MMX loop; the remaining
// block_length % 8 bytes are handled one at a time.
// NOTE(review): the tail takes pattern bytes from the most significant end
// of data ((7-i)*8), while movq stores the least significant byte first -
// confirm the intended order.
void mmx_copy_9_data(DWORD* dst,DWORD* src,UINT64 data,DWORD block_length)
{
	int dlength = block_length / 8;	// whole quadwords
	int blength = block_length % 8;	// leftover bytes
	// dec/jnz only tests the counter after the first pass, so a zero count
	// would wrap around; skip the loop when there is no whole quadword.
	if(dlength > 0) {
		__asm {
			mov esi,[src]
			mov edi,[dst]
			mov ecx,[dlength]
			movq mm1,[data]
inner_loop:
			movq mm0,[esi]
			pxor mm0,mm1
			movq [edi],mm0
			add esi,8
			add edi,8
			dec ecx
			jnz inner_loop
			emms
		}
	}
	// Offsets are in bytes, so step from BYTE pointers (arithmetic on the
	// DWORD pointers would scale the offset by 4).
	BYTE* stail = (BYTE*)src + dlength*8;
	BYTE* dtail = (BYTE*)dst + dlength*8;
	for(int i=0;i<blength;i++) {
		BYTE d = (BYTE)(data >> ((7-i)*8));
		dtail[i] = (BYTE)(stail[i] ^ d);
	}
}
// Fills block_length bytes of dst with the repeating 8-byte pattern data.
//
// dst          - destination buffer; only written.
// data         - 8-byte fill pattern.
// block_length - number of bytes to fill.
//
// Whole 8-byte groups go through the MMX loop; the remaining
// block_length % 8 bytes are written one at a time, using the same
// byte selection as the other *_data routines in this file.
// NOTE(review): the tail takes pattern bytes from the most significant end
// of data ((7-i)*8), while movq stores the least significant byte first -
// confirm the intended order.
void mmx_copy_10_data(DWORD* dst,UINT64 data,DWORD block_length)
{
	int dlength = block_length / 8;	// whole quadwords
	int blength = block_length % 8;	// leftover bytes
	// dec/jnz only tests the counter after the first pass, so a zero count
	// would wrap around; skip the loop when there is no whole quadword.
	if(dlength > 0) {
		__asm {
			mov edi,[dst]
			mov ecx,[dlength]
			movq mm0,[data]
inner_loop:
			movq [edi],mm0
			add edi,8
			dec ecx
			jnz inner_loop
			emms
		}
	}
	// Fill the trailing bytes too, so the whole block_length is covered.
	BYTE* dtail = (BYTE*)dst + dlength*8;
	for(int i=0;i<blength;i++) {
		dtail[i] = (BYTE)(data >> ((7-i)*8));
	}
}
// Writes dst = ~src | data over block_length bytes, where data is a
// repeating 8-byte pattern.
//
// dst          - destination buffer; only written.
// src          - source buffer; only read.
// data         - 8-byte pattern combined with every group of src.
// block_length - number of bytes to process.
//
// Whole 8-byte groups go through the MMX loop; the remaining
// block_length % 8 bytes are handled one at a time.
// NOTE(review): the tail takes pattern bytes from the most significant end
// of data ((7-i)*8), while movq stores the least significant byte first -
// confirm the intended order.
void mmx_copy_11_data(DWORD* dst,DWORD* src,UINT64 data,DWORD block_length)
{
	int dlength = block_length / 8;	// whole quadwords
	int blength = block_length % 8;	// leftover bytes
	UINT64 mask = 0xFFFFFFFFFFFFFFFF;	// all ones: pxor with it inverts every bit
	// dec/jnz only tests the counter after the first pass, so a zero count
	// would wrap around; skip the loop when there is no whole quadword.
	if(dlength > 0) {
		__asm {
			mov esi,[src]
			mov edi,[dst]
			mov ecx,[dlength]
			movq mm7,[mask]
			movq mm1,[data]
inner_loop:
			movq mm0,[esi]
			pxor mm0,mm7	// not mm0
			por mm0,mm1
			movq [edi],mm0
			add esi,8
			add edi,8
			dec ecx
			jnz inner_loop
			emms
		}
	}
	// Offsets are in bytes, so step from BYTE pointers (arithmetic on the
	// DWORD pointers would scale the offset by 4).
	BYTE* stail = (BYTE*)src + dlength*8;
	BYTE* dtail = (BYTE*)dst + dlength*8;
	for(int i=0;i<blength;i++) {
		BYTE d = (BYTE)(data >> ((7-i)*8));
		// Bitwise ~ to match the MMX path; logical ! would only yield 0 or 1.
		dtail[i] = (BYTE)(~stail[i] | d);
	}
}
// Writes dst = src | ~data over block_length bytes, where data is a
// repeating 8-byte pattern.
//
// dst          - destination buffer; only written.
// src          - source buffer; only read.
// data         - 8-byte pattern combined with every group of src.
// block_length - number of bytes to process.
//
// Whole 8-byte groups go through the MMX loop; the remaining
// block_length % 8 bytes are handled one at a time.
// NOTE(review): the tail takes pattern bytes from the most significant end
// of data ((7-i)*8), while movq stores the least significant byte first -
// confirm the intended order.
void mmx_copy_13_data(DWORD* dst,DWORD* src,UINT64 data,DWORD block_length)
{
	int dlength = block_length / 8;	// whole quadwords
	int blength = block_length % 8;	// leftover bytes
	UINT64 mask = 0xFFFFFFFFFFFFFFFF;	// all ones: pxor with it inverts every bit
	// dec/jnz only tests the counter after the first pass, so a zero count
	// would wrap around; skip the loop when there is no whole quadword.
	if(dlength > 0) {
		__asm {
			mov esi,[src]
			mov edi,[dst]
			mov ecx,[dlength]
			movq mm7,[mask]
			movq mm1,[data]
			pxor mm1,mm7	// not mm1, done once before the loop
inner_loop:
			movq mm0,[esi]
			por mm0,mm1
			movq [edi],mm0
			add esi,8
			add edi,8
			dec ecx
			jnz inner_loop
			emms
		}
	}
	// Offsets are in bytes, so step from BYTE pointers (arithmetic on the
	// DWORD pointers would scale the offset by 4).
	BYTE* stail = (BYTE*)src + dlength*8;
	BYTE* dtail = (BYTE*)dst + dlength*8;
	for(int i=0;i<blength;i++) {
		BYTE d = (BYTE)(data >> ((7-i)*8));
		// Bitwise ~ to match the MMX path; logical ! would only yield 0 or 1.
		dtail[i] = (BYTE)(stail[i] | ~d);
	}
}
// Writes dst = src | data over block_length bytes, where data is a
// repeating 8-byte pattern.
//
// dst          - destination buffer; only written.
// src          - source buffer; only read.
// data         - 8-byte pattern combined with every group of src.
// block_length - number of bytes to process.
//
// Whole 8-byte groups go through the MMX loop; the remaining
// block_length % 8 bytes are handled one at a time.
// NOTE(review): the tail takes pattern bytes from the most significant end
// of data ((7-i)*8), while movq stores the least significant byte first -
// confirm the intended order.
void mmx_copy_14_data(DWORD* dst,DWORD* src,UINT64 data,DWORD block_length)
{
	int dlength = block_length / 8;	// whole quadwords
	int blength = block_length % 8;	// leftover bytes
	// dec/jnz only tests the counter after the first pass, so a zero count
	// would wrap around; skip the loop when there is no whole quadword.
	if(dlength > 0) {
		__asm {
			mov esi,[src]
			mov edi,[dst]
			mov ecx,[dlength]
			movq mm1,[data]
inner_loop:
			movq mm0,[esi]
			por mm0,mm1
			movq [edi],mm0
			add esi,8
			add edi,8
			dec ecx
			jnz inner_loop
			emms
		}
	}
	// Offsets are in bytes, so step from BYTE pointers (arithmetic on the
	// DWORD pointers would scale the offset by 4).
	BYTE* stail = (BYTE*)src + dlength*8;
	BYTE* dtail = (BYTE*)dst + dlength*8;
	for(int i=0;i<blength;i++) {
		BYTE d = (BYTE)(data >> ((7-i)*8));
		dtail[i] = (BYTE)(stail[i] | d);
	}
}