/* Emulation for HuC6261(descendant of the VCE) and the HuC6272(KING) */

#include "pcfx.h"
#include "v810_cpu.h"
#include "vdc.h"
#include "king.h"
#include "scsicd.h"

/* Layer priorities(at least internally to KING, probably not for the FXVCE):

	4 = Foremost
	1 = Hindmost
	0 = Hidden


	Note:  According to patents, no two layers should have the same priority,
	unless that priority is 0(layer not shown).
	Also, layer settings 5-7 may cause undefined behaviour.
*/

/*	The VCE can accept 8-bit and 16-bit writes.  In 8-bit mode, increments to the
	palette R/W offset occur when the high byte is written.

	8-bit writes to the color palette table are buffered.
*/


// 16 bit YUV format:  upper 8 bits Y, next 4 bits U, lower 4 bits V, transformed to 8-bit U and 8-bit V by adding 0 or 1, who knows.

/* Register and palette state of the FXVCE (HuC6261) as modelled in this file.
   The single instance below is read and written by FXVCE_Read8()/FXVCE_Write8()
   and consulted by the renderer (DrawBG(), KING_RunFrame()). */
typedef struct
{
 uint8 AR;		// Selected register number; FXVCE_Write8() masks it to 5 bits.
 uint16 priority[2];	/* Registers 0x08([0]) and 0x09([1]).
			   uint16 0:
			   	bit   3-0: Legacy VDC BG priority?
				bit   7-4: Legacy VDC SPR priority?
				bit  11-8: RAINBOW(MJPEG) priority
			   uint16 1:
				bit   3-0: KING BG0 priority
				bit   7-4: KING BG1 priority
				bit  11-8: KING BG2 priority
				bit 15-12: KING BG3 priority
			*/

 bool odd_field;	// Reported as bit 0x40 of the high status byte by FXVCE_Read8(); not written anywhere in this file.


 uint16 picture_mode;	// Register 0x00.  Bit 0x08 selects a 320-pixel(set) or 256-pixel(clear) wide display in the renderer.
 uint32 frame_counter;	// Current scanline; KING_RunFrame() steps it from 0 to 262.

 uint16 palette_rw_offset; // Register 0x01: palette read/write offset, kept within 0-511.
 uint16 palette_rw_latch;  // Register 0x02: 16-bit data latch for palette reads and buffered 8-bit writes.

 uint16 palette_offset[4]; // Registers 0x04-0x07.  [0] is used for the legacy VDC layers by KING_RunFrame().
			   // BMG1 and BMG0 in [1](BMG1 in upper 8 bits, BMG0 in lower), BMG2 and BMG3 in [2]

 uint16 palette_table[512]; // The YUV palette: Y in bits 15-8, U in bits 7-4, V in bits 3-0.
 uint32 palette_table_cache[512 * 2]; // 24-bit YUV(Y << 16 | U << 8 | V) expansion of palette_table, filled by RedoPaletteCache().
				      // Twice the palette size so a palette offset plus a pixel index can be used without wrapping.

 uint16 CCR; // Register 0x0D, fixed color register, YUV
 uint16 BLE; // Register 0x0E, cellophane setting register
 uint16 SPBL; // Register 0x0F, Sprite cellophane setting register
 uint16 coefficients[6]; // Registers 0x10-0x15.  Cellophane coefficients, YUV, 4-bits each, on the least-significant end(xxxxYYYYUUUUVVVV).
			 // Valid settings: 0(cellophane disabled for layer), 1-8.  9-F are "unsupported".
} fx_vce_t;

// The one emulated FXVCE.
fx_vce_t fx_vce;

/* Refreshes the cached 24-bit YUV value of palette entry n.

   n must be in the range 0-511 (callers pass palette_rw_offset, which is
   always masked with 0x1FF).

   The 16-bit palette word (YYYYYYYY UUUU VVVV) is expanded to
   Y << 16 | U << 12 | V << 4, i.e. 8-bit U and V with a zero low nibble.

   palette_table_cache holds two back-to-back copies of the 512-entry palette
   so the renderer can index "palette offset + pixel value" (up to 0x1FE + 0xFF)
   without wrapping.  The second copy therefore lives at n + 512; the previous
   code stored it at n * 2, which overwrote the cache slot of a different
   colour and left the upper copy mostly unfilled.
*/
static INLINE void RedoPaletteCache(int n)
{
 const uint32 YUV = fx_vce.palette_table[n];
 const uint32 expanded = ((YUV & 0xFF00) << 8) | ((YUV & 0xF0) << 8) | ((YUV & 0xF) << 4);

 fx_vce.palette_table_cache[n] = expanded;
 fx_vce.palette_table_cache[n + 512] = expanded;
}

/* Register and memory state of the KING (HuC6272) as modelled in this file. */
typedef struct
{
	uint8 AR;	// Selected register number; writes mask it to 7 bits.

	uint16 KRAM[2][262144];	// Two pages of 256K halfwords each.
	uint32 KRAMRA, KRAMWA;	// Registers 0x0C/0x0D: KRAM read/write address.  As used by the accessors:
				//   bits 17-0 : halfword address
				//   bits 26-18: value added to the address after each access
				//   bit  31   : KRAM page select

	uint32 PageSetting;	// Register 0x0F.  DrawBG() tests bit 0x100 to pick the BAT/CG page;
				// KING_GetADPCMNibble() tests bit 24.

	uint16 bgmode; // 4 bits each BG: 3333 2222 1111 0000
		       /* Possible settings:
				0x0:  Invalid?
				0x1:  4-color palette, 1 byte for 4 pixels, transparent on entry 0
				0x2:  16-color palette, 1 byte for 2 pixels, transparent on entry 0
				0x3:  256-color palette, 1 byte for 1 pixel, transparent on entry 0
				0x4:  64K color(Y-8, U-4, V-4), 1 halfword for 1 pixel, transparent on Y=0
				0x5:  16M colors(Y-8, Y-8, U-8, V-8, 4 bytes for 2 pixels), transparent on Y=0

				If & 8, enable palette bank mode(only for 4 and 16-colors)???
					BAT format would be PPPPCCCCCCCCCCCC in this mode.
					4 color: 00PPPPnn 16 color: PPPPnnnn, where "n" is the 2 or 4-bit pixel data
			*/
	uint16 priority;	// Register 0x12.  KING_RunFrame() extracts 3 bits per BG from it but does not apply them yet.

	uint16 BGSize[4];	// Registers 0x2C-0x2F.  DrawBG() reads bits 7-4 as the width shift and bits 3-0 as the height shift.
	uint16 BGBATAddr[4];	// BAT base address; DrawBG() multiplies it by 1024 to get a halfword offset.
	uint16 BGCGAddr[4];	// CG base address; DrawBG() multiplies it by 1024 to get a halfword offset.
	uint16 BGXScroll[4];	// Registers 0x30, 0x32, 0x34, 0x36.
	uint16 BGYScroll[4];	// Registers 0x31, 0x33, 0x35, 0x37.

	uint16 ADPCMControl;	// Register 0x50.  Bit 0/1: channel 0/1 playing; cleared by KING_GetADPCMNibble() at the end address.
	uint16 ADPCMBufferMode[2];	// Registers 0x51/0x52; stored only.
	uint16 ADPCMStartAddress[2];	// Registers 0x58/0x5C; multiplied by 256 * 4 to form the starting nibble address.
	uint32 ADPCMEndAddress[2];	// Registers 0x59/0x5D(32-bit writes); compared against the play address in halfwords.
	uint32 ADPCMPlayAddress[2]; // In nibbles
	uint16 ADPCMIntermediateAddress[2];	// Registers 0x5A/0x5E; stored only.


	uint32 RAINBOWKRAMA;	// Register 0x41: Rainbow transfer address; stored only.
} king_t;

// The one emulated KING.
static king_t king;

/* Byte-wise access to a 16-bit register through an 8-bit port.

   REGSETP(reg, data, msb): replaces the high byte of reg with data when msb
   is non-zero, otherwise the low byte; the other byte is preserved.  Only the
   low 8 bits of data are used.
   REGGETP(reg, msb): yields the high byte of reg when msb is non-zero,
   otherwise the low byte.

   Every argument is parenthesized so that expressions such as "a | b" expand
   correctly, and REGSETP is wrapped in do/while(0) so it behaves as a single
   statement (it must be followed by a semicolon).
*/
#define REGSETP(_reg, _data, _msb) do { (_reg) &= 0xFF << ((_msb) ? 0 : 8); (_reg) |= ((_data) & 0xFF) << ((_msb) ? 8 : 0); } while(0)
#define REGGETP(_reg, _msb) (((_reg) >> ((_msb) ? 8 : 0)) & 0xFF)
//#define REGSETP(_reg, _data, _wb, _bs) { _reg &= 0xFF << (_wb * 8); _reg |= _data << (_wb * 8); }
//#define REGGETP(_reg, _wb, _bs) ((_reg >> (wb * 8)) & 0xFF)

/* 8-bit read from the FXVCE.

   A & 0x304 selects the port: 0x300 is the status/address port, 0x304 the
   data port of the register chosen by fx_vce.AR.  Bit 0 of A picks the high
   (1) or low (0) byte.  Anything else reads as 0.

   Status word layout:
     bit  4-0: Register number
     bit 13-5: Raster/frame counter(22-261 range??)
     bit 14: In interlace mode and when on odd fields set bit.
     bit 15: "0" during active screen area, "1" when in h-blank or v-blank
   (bit 15 is never set by this implementation.)

   Reading the high byte of register 0x02 advances the palette offset and
   reloads the latch from the palette table.
*/
uint8 FXVCE_Read8(uint32 A)
{
 const int high_byte = A & 1;
 const uint32 port = A & 0x304;

 if(port == 0x300)
 {
  if(!high_byte)
   return(fx_vce.AR | (fx_vce.frame_counter << 5));

  return((fx_vce.frame_counter >> 3) | (fx_vce.odd_field ? 0x40 : 0x00));
 }

 if(port != 0x304)
  return(0);

 // No idea which registers are readable, so these are all made readable.
 switch(fx_vce.AR)
 {
  case 0x00:
	return(REGGETP(fx_vce.picture_mode, high_byte));

  case 0x01:
	return(REGGETP(fx_vce.palette_rw_offset, high_byte));

  case 0x02:
	{
	 const uint8 value = REGGETP(fx_vce.palette_rw_latch, high_byte);

	 if(high_byte)
	 {
	  fx_vce.palette_rw_offset = (fx_vce.palette_rw_offset + 1) & 0x1FF;
	  fx_vce.palette_rw_latch = fx_vce.palette_table[fx_vce.palette_rw_offset];
	 }
	 return(value);
	}

  case 0x04:
  case 0x05:
  case 0x06:
  case 0x07:
	return(REGGETP(fx_vce.palette_offset[fx_vce.AR - 0x04], high_byte));

  case 0x08:
  case 0x09:
	return(REGGETP(fx_vce.priority[fx_vce.AR - 0x08], high_byte));
 }

 return(0);
}

/* 8-bit write to the FXVCE.

   A & 0x304 selects the port: 0x300 sets the register number (kept to 5
   bits), 0x304 writes one byte of the register chosen by fx_vce.AR.  Bit 0 of
   A picks the high (1) or low (0) byte.  Other ports and unlisted registers
   are ignored.

   Palette writes are buffered in palette_rw_latch: the palette entry is
   committed, its cache refreshed and the offset advanced only when the high
   byte is written.  Writing the high byte of the offset register reloads the
   latch from the palette table.
*/
void FXVCE_Write8(uint32 A, uint8 V)
{
 const int high_byte = A & 1;
 const uint32 port = A & 0x304;

 if(port == 0x300)
 {
  REGSETP(fx_vce.AR, V, high_byte);
  fx_vce.AR &= 0x1F;
  return;
 }

 if(port != 0x304)
  return;

 switch(fx_vce.AR)
 {
  case 0x00:
	REGSETP(fx_vce.picture_mode, V, high_byte);
	break;

  case 0x01:
	REGSETP(fx_vce.palette_rw_offset, V, high_byte);
	fx_vce.palette_rw_offset &= 0x1FF;
	if(high_byte)
	{
	 fx_vce.palette_rw_latch = fx_vce.palette_table[fx_vce.palette_rw_offset];
	}
	break;

  case 0x02:
	REGSETP(fx_vce.palette_rw_latch, V, high_byte);
	if(high_byte)
	{
	 fx_vce.palette_table[fx_vce.palette_rw_offset] = fx_vce.palette_rw_latch;
	 RedoPaletteCache(fx_vce.palette_rw_offset);
	 fx_vce.palette_rw_offset = (fx_vce.palette_rw_offset + 1) & 0x1FF;
	}
	break;

  case 0x04:
  case 0x05:
  case 0x06:
  case 0x07:
	REGSETP(fx_vce.palette_offset[fx_vce.AR - 0x04], V, high_byte);
	break;

  case 0x08:
  case 0x09:
	REGSETP(fx_vce.priority[fx_vce.AR - 0x08], V, high_byte);
	break;

  case 0x0d:
	REGSETP(fx_vce.CCR, V, high_byte);
	break;

  case 0x0e:
	REGSETP(fx_vce.BLE, V, high_byte);
	break;

  case 0x0f:
	REGSETP(fx_vce.SPBL, V, high_byte);
	break;

  case 0x10:
  case 0x11:
  case 0x12:
  case 0x13:
  case 0x14:
  case 0x15:
	REGSETP(fx_vce.coefficients[fx_vce.AR - 0x10], V, high_byte);
	break;
 }
}

/* 32-bit read from the KING.

   A & 0x707 selects the port:
     0x600: returns the register number (king.AR) in the low halfword and the
            SCSI bus status bits in the high halfword, using the same bit
            assignments KING_Read16() uses for its status word.  This used to
            call exit(1), so a guest access could terminate the emulator.
     0x604: 32-bit register reads are not implemented; the access is logged
            and 0 is returned.
   Every other address reads as 0.
*/
uint32 KING_Read32(uint32 A)
{
 switch(A & 0x707)
 {
  case 0x600:
	{
	 uint32 status = 0;

	 status |= SCSICD_GetIO() ? 0x04 : 0x00;
	 status |= SCSICD_GetCD() ? 0x08 : 0x00;
	 status |= SCSICD_GetMSG() ? 0x10 : 0x00;
	 status |= SCSICD_GetREQ() ? 0x20 : 0x00;
	 status |= SCSICD_GetBSY() ? 0x40 : 0x00;

	 return(king.AR | (status << 16));
	}

  case 0x604:
	printf("Dark32: %08x\n", A);
	break;

  default:
	break;
 }
 return(0);
}

/* 32-bit write to the KING.

   A & 0x707 selects the port: 0x600 sets the register number (7 bits), 0x604
   writes the register chosen by king.AR.  Other addresses are ignored.

   Register 0x0E stores V into KRAM as two halfwords, low half first.  After
   each halfword the low 18 bits of KRAMWA advance by the increment held in
   bits 26-18 of KRAMWA; bit 31 of KRAMWA selects the KRAM page.
   Writes to registers without a 32-bit handler are logged.
*/
void KING_Write32(uint32 A, uint32 V)
{
 const uint32 port = A & 0x707;

 if(port == 0x600)
 {
  king.AR = V & 0x7F;
  return;
 }

 if(port != 0x604)
  return;

 switch(king.AR)
 {
  case 0x0C:
	king.KRAMRA = V;
	break;

  case 0x0D:
	king.KRAMWA = V;
	break;

  case 0x0E:
	{
	 const unsigned int bank = (king.KRAMWA & 0x80000000) ? 1 : 0;
	 const uint16 halves[2] = { (uint16)(V & 0xFFFF), (uint16)(V >> 16) };

	 for(int i = 0; i < 2; i++)
	 {
	  const uint32 step = (king.KRAMWA >> 18) & 0x1FF;

	  king.KRAM[bank][king.KRAMWA & 0x3FFFF] = halves[i];
	  king.KRAMWA = (king.KRAMWA &~ 0x3FFFF) | ((king.KRAMWA + step) & 0x3FFFF);
	 }
	}
	break;

  case 0x41: // Rainbow transfer address
	king.RAINBOWKRAMA = V;
	break;

  case 0x59:
  case 0x5D:
	// 0x59 is channel 0, 0x5D is channel 1.
	king.ADPCMEndAddress[(king.AR >> 2) & 1] = V;
	break;

  default:
	printf("%02x, W32: %08x: %08x\n", king.AR, A, V);
	break;
 }
}

/* 16-bit read from the KING.

   A & 0x70C selects the port:
     0x600, bit 1 of A clear: the register number (king.AR).
     0x600, bit 1 of A set (0x602): SCSI bus status
            (0x04 I/O, 0x08 C/D, 0x10 MSG, 0x20 REQ, 0x40 BSY).
     0x604: the register chosen by king.AR.
   Everything else, and registers without a read handler, read as 0.

   Fixes relative to the previous implementation:
     - The status word was handled in "case 0x602", which can never match a
       value masked with 0x70C, so the status was unreadable.
     - Register 0x0E fetched the halfword through the write address (KRAMWA)
       although the page and the post-increment use the read address
       (KRAMRA).
     - The logging default case fell through into register 0x00 and returned
       the SCSI data bus; unknown registers now read as 0.
*/
uint16 KING_Read16(uint32 A)
{
 switch(A & 0x70C)
 {
  case 0x600:
	if(A & 2) // status...
	{
	 uint16 ret = 0;

	 ret |= SCSICD_GetIO() ? 0x04 : 0x00;
	 ret |= SCSICD_GetCD() ? 0x08 : 0x00;
	 ret |= SCSICD_GetMSG() ? 0x10 : 0x00;
	 ret |= SCSICD_GetREQ() ? 0x20 : 0x00;
	 ret |= SCSICD_GetBSY() ? 0x40 : 0x00;
	 return(ret);
	}
	return(king.AR);

  case 0x604:
	switch(king.AR)
	{
	 case 0x00:
		return(SCSICD_GetDB());

	 case 0x01:
		return(SCSICD_GetBSY() | (SCSICD_GetSEL() ? 0x4 : 0) | (SCSICD_GetACK() ? 0x10 : 0) | (SCSICD_GetRST() ? 0x80 : 0));

	 case 0x0E:
		{
		 // Bit 31 of KRAMRA selects the page, bits 17-0 the halfword,
		 // bits 26-18 the post-read increment.
		 const unsigned int page = (king.KRAMRA & 0x80000000) ? 1 : 0;
		 const uint16 ret = king.KRAM[page][king.KRAMRA & 0x3FFFF];

		 king.KRAMRA = (king.KRAMRA &~ 0x3FFFF) | ((king.KRAMRA + ((king.KRAMRA >> 18) & 0x1FF)) & 0x3FFFF);
		 return(ret);
		}

	 case 0x0F:
		return(king.PageSetting);

	 default:
		printf("Dark: %02x\n", king.AR);
		break;
	}
	break;

  default:
	break;
 }

 return(0);
}

/* 16-bit write to the KING.

   A & 0x70C selects the port: 0x600 sets the register number (7 bits), 0x604
   writes the register chosen by king.AR.  Writes to any other port are
   logged; writes to registers without a handler are silently dropped.

   Register 0x0E stores V at the KRAM write address and then advances the low
   18 bits of KRAMWA by the increment held in its bits 26-18.
   Register 0x50 (ADPCM control): a 0 -> 1 transition of bit 0 or bit 1
   reloads that channel's play address from its start address.
*/
void KING_Write16(uint32 A, uint16 V)
{
 const uint32 port = A & 0x70C;

 if(port == 0x600)
 {
  king.AR = V & 0x7F;
  return;
 }

 if(port != 0x604)
 {
  printf("BNORK: %08x %04x\n", A, V);
  return;
 }

 switch(king.AR)
 {
  /* SCSI control signals */
  case 0x03:
	SCSICD_SetIO(V & 1);
	SCSICD_SetCD(V & 2);
	SCSICD_SetMSG(V & 4);
	break;

  /* KRAM data port */
  case 0x0E:
	{
	 const unsigned int bank = (king.KRAMWA & 0x80000000) ? 1 : 0;
	 const uint32 step = (king.KRAMWA >> 18) & 0x1FF;

	 king.KRAM[bank][king.KRAMWA & 0x3FFFF] = V;
	 king.KRAMWA = (king.KRAMWA &~ 0x3FFFF) | ((king.KRAMWA + step) & 0x3FFFF);
	}
	break;

  case 0x0F:
	king.PageSetting = V;
	break;

  /* Background configuration */
  case 0x10:
	king.bgmode = V;
	printf("BGMODE: %04x\n", V);
	meowpc();
	break;

  case 0x12:
	king.priority = V;
	break;

  case 0x20:
	king.BGBATAddr[0] = V;
	printf("BATADDR: %04x\n", V);
	break;

  case 0x21:
	king.BGCGAddr[0] = V;
	printf("CGADDR: %04x\n", V);
	break;

  case 0x24: king.BGBATAddr[1] = V & 0x1FF; break;
  case 0x28: king.BGBATAddr[2] = V & 0x1FF; break;
  case 0x2A: king.BGBATAddr[3] = V & 0x1FF; break;

  case 0x25: king.BGCGAddr[1] = V & 0x1FF; break;
  case 0x29: king.BGCGAddr[2] = V & 0x1FF; break;
  case 0x2B: king.BGCGAddr[3] = V & 0x1FF; break;

  case 0x2C:
	king.BGSize[0] = V;
	printf("BGSize0: %04x\n", V);
	break;

  case 0x2D:
	king.BGSize[1] = V;
	printf("BGSize1: %04x\n", V);
	break;

  case 0x2E:
	king.BGSize[2] = V;
	printf("BGSize2: %04x\n", V);
	break;

  case 0x2F:
	king.BGSize[3] = V;
	printf("BGSize3: %04x\n", V);
	break;

  case 0x30:
	king.BGXScroll[0] = V;
	printf("BGXScroll: %04x\n", V);
	break;

  case 0x31:
	king.BGYScroll[0] = V;
	printf("BGYScroll: %04x\n", V);
	break;

  // Registers 0x32-0x37: X scroll on even numbers, Y scroll on odd ones, BG1 to BG3.
  case 0x32:
  case 0x34:
  case 0x36:
	king.BGXScroll[(king.AR - 0x30) >> 1] = V;
	break;

  case 0x33:
  case 0x35:
  case 0x37:
	king.BGYScroll[(king.AR - 0x30) >> 1] = V;
	break;

  /* ADPCM */
  case 0x50:
	for(int ch = 0; ch < 2; ch++)
	{
	 const uint16 play_bit = 1 << ch;

	 if((V & play_bit) && !(king.ADPCMControl & play_bit))
	 {
	  king.ADPCMPlayAddress[ch] = king.ADPCMStartAddress[ch] * 256 * 4; // 4 nibbles for 16 bits
	 }
	}
	king.ADPCMControl = V;
	break;

  case 0x51:
  case 0x52:
	king.ADPCMBufferMode[king.AR - 0x51] = V;
	break;

  // 0x58-0x5A belong to channel 0, 0x5C-0x5E to channel 1.
  // The end addresses (0x59, 0x5D) are only accepted as 32-bit writes.
  case 0x58:
  case 0x5C:
	king.ADPCMStartAddress[(king.AR >> 2) & 1] = V;
	break;

  case 0x5A:
  case 0x5E:
	king.ADPCMIntermediateAddress[(king.AR >> 2) & 1] = V;
	break;
 }
}

/* Returns the current value of the ADPCM control register (0x50),
   widened to 32 bits. */
uint32 KING_GetADPCMControl()
{
 const uint32 control = king.ADPCMControl;

 return(control);
}

uint8 KING_GetADPCMNibble(int ch)
{
 int page = (king.PageSetting & (1 << 24)) ? 1 : 0;
 uint16 ret = king.KRAM[page][king.ADPCMPlayAddress[ch] >> 2];

 //if((king.ADPCMPlayAddress[ch] & 2))
 // ret >>= 8;

 //if((king.ADPCMPlayAddress[ch] & 1))
 // ret >>= 4;
 ret >>= (king.ADPCMPlayAddress[ch] & 3) * 4;
 ret &= 0xF;

 king.ADPCMPlayAddress[ch]++;
 if((king.ADPCMPlayAddress[ch] >> 2) == king.ADPCMEndAddress[ch])
 {
  printf("End: %d\n", ch);
  king.ADPCMControl &= ~(1 << ch);
 }
 return(ret);
}

/* Initializes the KING/FXVCE emulation.

   There is nothing to set up at the moment: the emulated state lives in
   file-scope objects that start out zeroed.  Returns true to report success;
   previously control fell off the end of this non-void function, leaving the
   returned value undefined.
*/
bool KING_Init(void)
{
 return(true);
}

// Transparency helpers.
// TRANS_OR is the flag (bit 24) that marks a line-buffer pixel as transparent.
// TTRANS(v) evaluates to TRANS_OR when v is zero and to 0 otherwise.
#define TRANS_OR 0x01000000
#define TTRANS(_testvar) (!(_testvar) ? TRANS_OR : 0)



// Line-buffer width in pixels for the two picture modes: [0] is used with the
// 256-pixel display, [1] with the 320-pixel display.  The visible area is centered in it.
static const unsigned int ClockModeWidths[2] = { 288, 384 };

/* Renders the current scanline (fx_vce.frame_counter) of KING background n
   (0-3) into target.

   Each pixel written is:
     bits 31-28: layer number n
     bit  24   : TRANS_OR when the pixel is transparent
     bits 23-0 : YUV colour(Y << 16 | U << 8 | V)

   Pixels are produced 8 at a time starting at "first_end", which lies up to 7
   pixels left of the visible start, and continuing until "end" is passed, so
   target must have room on both sides of the visible area.

   When bit 3 of the layer's mode is clear, only modes 0 and 5 draw anything;
   when it is set, only modes 1-5 do.  For every other mode target is left
   untouched.
*/
static void DrawBG(uint32 *target, int n)
{
 uint16 bgmode = (king.bgmode >> (n * 4)) & 0xF;	// 4-bit mode of this layer
 uint32 XScroll = king.BGXScroll[n];
 uint32 YScroll = king.BGYScroll[n];
 uint32 bat_offset = king.BGBATAddr[n] * 1024;	// BAT base, in halfwords
 uint32 cg_offset = king.BGCGAddr[n] * 1024;	// CG base, in halfwords
 uint32 bat_and_cg_bank = (king.PageSetting & 0x100) ? 1 : 0;	// KRAM page holding both BAT and CG

 unsigned int width = (fx_vce.picture_mode & 0x08) ? 320 : 256;

 // Visible pixels occupy [start, end) of the line buffer.
 int start = (ClockModeWidths[(fx_vce.picture_mode & 0x08) ? 1 : 0] - width) / 2;
 int end = start + width;
 int bat_width, bat_width_mask, bat_width_shift;
 int bat_height, bat_height_mask, bat_height_shift;

 // Width: log2 of the size in pixels, clamped to 3-9; bat_width is the size in 8-pixel tiles.
 bat_width_shift = (king.BGSize[n] & 0xF0) >> 4;
 if(bat_width_shift < 3) bat_width_shift = 3;
 if(bat_width_shift > 9) bat_width_shift = 9;

 bat_width = (1 << bat_width_shift) >> 3;
 bat_width_mask = bat_width - 1;

 // Height: same encoding, taken from the low nibble.
 bat_height_shift = king.BGSize[n] & 0x0F;
 if(bat_height_shift < 3) bat_height_shift = 3;
 if(bat_height_shift > 9) bat_height_shift = 9;

 bat_height = (1 << bat_height_shift) >> 3;
 bat_height_mask = bat_height - 1;


 // Line inside the background; frame_counter 14 is the first line KING_RunFrame() draws.
 uint32 YOffset = YScroll + (fx_vce.frame_counter - 14);

 int first_end = start - (XScroll & 0x7);	// Shift left by the fine X scroll so tiles stay 8-pixel aligned.
 // NOTE(review): the tile row is shifted by bat_width_shift(log2 of the width in pixels) although
 // bat_x is counted in tiles(width >> 3) - confirm the intended BAT row stride.
 int bat_y = ((YOffset >> 3) & bat_height_mask) << bat_width_shift;
 int bat_x = (XScroll >> 3) & bat_width_mask;
 int ysmall = YOffset & 0x7;	// Line inside the tile

 // Palette offset byte of this layer: even layers use the low byte, odd layers the high byte.
 // It is doubled and kept to 9 bits before indexing the palette cache.
 uint32 palette_offset = fx_vce.palette_offset[1 + (n >> 1)] >> ((n & 1) ? 8 : 0);
 palette_offset <<= 1;
 palette_offset &= 0x1FF;
 uint32 *palette_ptr = &fx_vce.palette_table_cache[palette_offset];

 if(!(bgmode & 0x8))
 {
  // Access the CG directly.  Yay :/
  //printf("%08x %08x %d %02x %02x\n", bat_offset, cg_offset, bat_and_cg_bank, bat_width_mask, bat_height_mask); //40000
  switch(bgmode & 0x7)
  {
   // Mode 0: the whole line is transparent.
   case 0x00: for(int x = first_end; x < end; x++) target[x] = TRANS_OR | (n << 28); break;
   // Mode 5 without a BAT: 8 halfwords per tile column, 64 halfwords per line, 64 lines.
   // Each halfword pair holds two pixels: Y0 Y1 in the first halfword, U V in the second.
   // NOTE(review): the odd pixel of each pair tests the even pixel's Y for transparency - confirm.
   case 0x05:
	for(int x = first_end; x < end; x+=8)
	{
	uint16 *cgptr = &king.KRAM[bat_and_cg_bank][(cg_offset + (bat_x * 8) + (YOffset & 63) * 64) & 262143];
	target[x + 0] = TTRANS(cgptr[0] >> 8) | ((cgptr[0x0] & 0xFF00) << 8) | (cgptr[1] & 0xFF00) | (cgptr[1] & 0xFF) | (n << 28);
	target[x + 1] = TTRANS(cgptr[0] >> 8) | ((cgptr[0x0] & 0x00FF) << 16) | (cgptr[1] & 0xFF00) | (cgptr[1] & 0xFF) | (n << 28);

	target[x + 2] = TTRANS(cgptr[2] >> 8) | ((cgptr[0x2] & 0xFF00) << 8) | (cgptr[3] & 0xFF00) | (cgptr[3] & 0xFF) | (n << 28);
	target[x + 3] = TTRANS(cgptr[2] >> 8) | ((cgptr[0x2] & 0x00FF) << 16) | (cgptr[3] & 0xFF00) | (cgptr[3] & 0xFF) | (n << 28);

	target[x + 4] = TTRANS(cgptr[4] >> 8) | ((cgptr[0x4] & 0xFF00) << 8) | (cgptr[5] & 0xFF00) | (cgptr[5] & 0xFF) | (n << 28);
	target[x + 5] = TTRANS(cgptr[4] >> 8) | ((cgptr[0x4] & 0x00FF) << 16) | (cgptr[5] & 0xFF00) | (cgptr[5] & 0xFF) | (n << 28);

	target[x + 6] = TTRANS(cgptr[6] >> 8) | ((cgptr[0x6] & 0xFF00) << 8) | (cgptr[7] & 0xFF00) | (cgptr[7] & 0xFF) | (n << 28);
	target[x + 7] = TTRANS(cgptr[6] >> 8) | ((cgptr[0x6] & 0x00FF) << 16) | (cgptr[7] & 0xFF00) | (cgptr[7] & 0xFF) | (n << 28);

	bat_x = (bat_x + 1) & bat_width_mask;
	}
	break;
  }
 }
 else
 // BAT-driven modes: one BAT entry per 8-pixel tile selects the character data.
 for(int x = first_end; x < end; x+=8)
 {
  uint16 bat;
  uint32 batfoo = (bat_offset + (bat_x + bat_y)) & 262143;

  bat = king.KRAM[bat_and_cg_bank][batfoo];

  switch(bgmode & 0x7)
  {
    case 0x01: // 4 color
    {
      // BAT: top 4 bits = palette bank, low 12 bits = character number.
      // One halfword per tile line, 2 bits per pixel, leftmost pixel in the low bits.
      uint8 pbn = (bat >> 12) << 2;
      bat &= 0x0FFF;

      uint16 cg = king.KRAM[bat_and_cg_bank][(cg_offset + (bat * 8) + ysmall) & 262143];

      target[x + 0] = TTRANS((cg >> 0) & 0x3) | palette_ptr[pbn | ((cg >> 0) & 0x3)] | (n << 28);
      target[x + 1] = TTRANS((cg >> 2) & 0x3) | palette_ptr[pbn | ((cg >> 2) & 0x3)] | (n << 28);
      target[x + 2] = TTRANS((cg >> 4) & 0x3) | palette_ptr[pbn | ((cg >> 4) & 0x3)] | (n << 28);
      target[x + 3] = TTRANS((cg >> 6) & 0x3) | palette_ptr[pbn | ((cg >> 6) & 0x3)] | (n << 28);

      target[x + 4] = TTRANS((cg >> 8) & 0x3) | palette_ptr[pbn | ((cg >> 8) & 0x3)] | (n << 28);
      target[x + 5] = TTRANS((cg >> 10) & 0x3) | palette_ptr[pbn | ((cg >> 10) & 0x3)] | (n << 28);
      target[x + 6] = TTRANS((cg >> 12) & 0x3) | palette_ptr[pbn | ((cg >> 12) & 0x3)] | (n << 28);
      target[x + 7] = TTRANS((cg >> 14) & 0x3) | palette_ptr[pbn | ((cg >> 14) & 0x3)] | (n << 28);
     }
     break;
    case 0x02: // 16 color
     {
      // BAT: top 4 bits = palette bank, low 12 bits = character number.
      // Two halfwords per tile line, 4 bits per pixel.
      uint8 pbn = (bat >> 12) << 4;
      bat &= 0x0FFF;

      uint16 *cgptr = &king.KRAM[bat_and_cg_bank][(cg_offset + (bat * 16) + ysmall * 2) & 262143];

	target[x + 0] = TTRANS((cgptr[0] >> 0) & 0xF) | palette_ptr[pbn | ((cgptr[0] >> 0) & 0xF)] | (n << 28);
	target[x + 1] = TTRANS((cgptr[0] >> 4) & 0xF) | palette_ptr[pbn | ((cgptr[0] >> 4) & 0xF)] | (n << 28);
	target[x + 2] = TTRANS((cgptr[0] >> 8) & 0xF) | palette_ptr[pbn | ((cgptr[0] >> 8) & 0xF)] | (n << 28);
	target[x + 3] = TTRANS((cgptr[0] >> 12) & 0xF) | palette_ptr[pbn | ((cgptr[0] >> 12) & 0xF)] | (n << 28);
	target[x + 4] = TTRANS((cgptr[1] >> 0) & 0xF) | palette_ptr[pbn | ((cgptr[1] >> 0) & 0xF)] | (n << 28);
	target[x + 5] = TTRANS((cgptr[1] >> 4) & 0xF) | palette_ptr[pbn | ((cgptr[1] >> 4) & 0xF)] | (n << 28);
	target[x + 6] = TTRANS((cgptr[1] >> 8) & 0xF) | palette_ptr[pbn | ((cgptr[1] >> 8) & 0xF)] | (n << 28);
	target[x + 7] = TTRANS((cgptr[1] >> 12) & 0xF) | palette_ptr[pbn | ((cgptr[1] >> 12) & 0xF)] | (n << 28);
     }
     break;
    case 0x03: // 256 color
     {
      // The whole BAT entry is the character number; four halfwords per tile line, 8 bits per pixel.
      uint16 *cgptr = &king.KRAM[bat_and_cg_bank][(cg_offset + (bat * 32) + ysmall * 4) & 262143];

	target[x + 0] = TTRANS((cgptr[0] >> 0) & 0xFF) | palette_ptr[(cgptr[0] >> 0) & 0xFF] | (n << 28);
	target[x + 1] = TTRANS((cgptr[0] >> 8) & 0xFF) | palette_ptr[(cgptr[0] >> 8) & 0xFF] | (n << 28);
	target[x + 2] = TTRANS((cgptr[1] >> 0) & 0xFF) | palette_ptr[(cgptr[1] >> 0) & 0xFF] | (n << 28);
	target[x + 3] = TTRANS((cgptr[1] >> 8) & 0xFF) | palette_ptr[(cgptr[1] >> 8) & 0xFF] | (n << 28);
	target[x + 4] = TTRANS((cgptr[2] >> 0) & 0xFF) | palette_ptr[(cgptr[2] >> 0) & 0xFF] | (n << 28);
	target[x + 5] = TTRANS((cgptr[2] >> 8) & 0xFF) | palette_ptr[(cgptr[2] >> 8) & 0xFF] | (n << 28);
	target[x + 6] = TTRANS((cgptr[3] >> 0) & 0xFF) | palette_ptr[(cgptr[3] >> 0) & 0xFF] | (n << 28);
	target[x + 7] = TTRANS((cgptr[3] >> 8) & 0xFF) | palette_ptr[(cgptr[3] >> 8) & 0xFF] | (n << 28);
     }
     break;
    case 0x04: // 64K color, YUV, (bit 15)YYYYYYYY UUUUVVVV(bit 0)
     { 
      // One halfword per pixel, expanded the same way as a palette entry; transparent when Y is 0.
      uint16 *cgptr = &king.KRAM[bat_and_cg_bank][(cg_offset + (bat * 64) + ysmall * 8) & 262143];

	target[x + 0] = TTRANS(cgptr[0] >> 8) | ((cgptr[0] & 0xFF00) << 8) | ((cgptr[0] & 0xF0) << 8) | ((cgptr[0] & 0x0F) << 4) | (n << 28);
	target[x + 1] = TTRANS(cgptr[1] >> 8) | ((cgptr[1] & 0xFF00) << 8) | ((cgptr[1] & 0xF0) << 8) | ((cgptr[1] & 0x0F) << 4) | (n << 28);
	target[x + 2] = TTRANS(cgptr[2] >> 8) | ((cgptr[2] & 0xFF00) << 8) | ((cgptr[2] & 0xF0) << 8) | ((cgptr[2] & 0x0F) << 4) | (n << 28);
	target[x + 3] = TTRANS(cgptr[3] >> 8) | ((cgptr[3] & 0xFF00) << 8) | ((cgptr[3] & 0xF0) << 8) | ((cgptr[3] & 0x0F) << 4) | (n << 28);
	target[x + 4] = TTRANS(cgptr[4] >> 8) | ((cgptr[4] & 0xFF00) << 8) | ((cgptr[4] & 0xF0) << 8) | ((cgptr[4] & 0x0F) << 4) | (n << 28);
	target[x + 5] = TTRANS(cgptr[5] >> 8) | ((cgptr[5] & 0xFF00) << 8) | ((cgptr[5] & 0xF0) << 8) | ((cgptr[5] & 0x0F) << 4) | (n << 28);
	target[x + 6] = TTRANS(cgptr[6] >> 8) | ((cgptr[6] & 0xFF00) << 8) | ((cgptr[6] & 0xF0) << 8) | ((cgptr[6] & 0x0F) << 4) | (n << 28);
	target[x + 7] = TTRANS(cgptr[7] >> 8) | ((cgptr[7] & 0xFF00) << 8) | ((cgptr[7] & 0xF0) << 8) | ((cgptr[7] & 0x0F) << 4) | (n << 28);
     }
     break;
   case 0x05: // 16M color
     {
	// Same pixel packing as the BAT-less mode 5 above, addressed through the BAT entry.
	uint16 *cgptr = &king.KRAM[bat_and_cg_bank][(cg_offset + (bat * 64) + ysmall * 8) & 262143];

      target[x + 0] = TTRANS(cgptr[0] >> 8) | ((cgptr[0x0] & 0xFF00) << 8) | (cgptr[1] & 0xFF00) | (cgptr[1] & 0xFF) | (n << 28);
      target[x + 1] = TTRANS(cgptr[0] >> 8) | ((cgptr[0x0] & 0x00FF) << 16) | (cgptr[1] & 0xFF00) | (cgptr[1] & 0xFF) | (n << 28);
 
      target[x + 2] = TTRANS(cgptr[2] >> 8) | ((cgptr[0x2] & 0xFF00) << 8) | (cgptr[3] & 0xFF00) | (cgptr[3] & 0xFF) | (n << 28);
      target[x + 3] = TTRANS(cgptr[2] >> 8) | ((cgptr[0x2] & 0x00FF) << 16) | (cgptr[3] & 0xFF00) | (cgptr[3] & 0xFF) | (n << 28);

      target[x + 4] = TTRANS(cgptr[4] >> 8) | ((cgptr[0x4] & 0xFF00) << 8) | (cgptr[5] & 0xFF00) | (cgptr[5] & 0xFF) | (n << 28);
      target[x + 5] = TTRANS(cgptr[4] >> 8) | ((cgptr[0x4] & 0x00FF) << 16) | (cgptr[5] & 0xFF00) | (cgptr[5] & 0xFF) | (n << 28);

      target[x + 6] = TTRANS(cgptr[6] >> 8) | ((cgptr[0x6] & 0xFF00) << 8) | (cgptr[7] & 0xFF00) | (cgptr[7] & 0xFF) | (n << 28);
      target[x + 7] = TTRANS(cgptr[6] >> 8) | ((cgptr[0x6] & 0x00FF) << 16) | (cgptr[7] & 0xFF00) | (cgptr[7] & 0xFF) | (n << 28);

     }
     break;
  }
  // Next tile to the right, wrapping at the BAT width.
  bat_x = (bat_x + 1) & bat_width_mask;
 }
}

/* Converts a packed 24-bit YUV pixel (Y in bits 23-16, U in bits 15-8, V in
   bits 7-0, with U and V biased by 128) to the host pixel format described by
   FSettings.rshift/gshift/bshift.  Bits above 23 of the input are ignored.

   The channels are computed as signed integers and clamped to 0-255.  They
   used to be unsigned, which made the "< 0" clamps dead code and turned every
   negative intermediate result into an out-of-range floating-point to
   unsigned conversion (undefined behaviour; in practice saturating dark
   colours to 255 instead of 0).
*/
static uint32 INLINE YUV888_TO_RGB888(uint32 yuv)
{
 const int y = (int)((yuv >> 16) & 0xFF);
 const int u = (int)((yuv >> 8)  & 0xFF) - 128;
 const int v = (int)(yuv & 0xFF) - 128;

 int r = (int)(y + 1.402 * v);
 int g = (int)(y - 0.34414 * u - 0.71414 * v);
 int b = (int)(y + 1.772 * u);

 if(r > 255) r = 255;
 if(r < 0) r = 0;
 if(g > 255) g = 255;
 if(g < 0) g = 0;
 if(b > 255) b = 255;
 if(b < 0) b = 0;

 return(((uint32)r << FSettings.rshift) | ((uint32)g << FSettings.gshift) | ((uint32)b << FSettings.bshift));
}


/* Emulates one video frame of 263 scanlines.

   vdc_chips:  the two legacy VDC chips; both are asked for every scanline.
   pXBuf:      output surface in host pixel format; line N of the picture is
               written at pXBuf + (MDFNGameInfo->pitch >> 2) * N.
   LineWidths: receives the display rectangle used for every drawn line.
   skip:       forwarded to FXVDC_DoLine().

   Scanlines 14-261 are drawn.  For each of them the two VDC outputs are
   converted to tagged YUV pixels and merged, the four KING backgrounds are
   rendered and merged, and the two results are mixed using the FXVCE
   priority registers.  The CPU is run for 1365 cycles after every scanline.

   Line-buffer pixel format: bits 31-28 layer (0-3: KING BG0-BG3, 4: VDC BG,
   5: VDC SPR), bit 24 TRANS_OR, bits 23-0 YUV.

   Fixes relative to the previous implementation:
     - priority_remap[] has 6 entries but the VDC priorities were stored at
       indices 5 and 6 (out of bounds) while index 4, used for VDC BG pixels,
       was never initialized.  They are now stored at 4 (BG) and 5 (SPR).
     - The VDC sprite palette pointer used the same offset byte as the VDC BG
       one; it now uses the upper byte of palette_offset[0].
     - The loop that pads unused layer slots started one slot too late and
       copied from an unset slot.
     - The per-layer line buffers are pre-filled with transparent pixels,
       because DrawBG() leaves the buffer untouched for modes it does not
       draw.
*/
void KING_RunFrame(fx_vdc_t **vdc_chips, uint32 *pXBuf, MDFN_Rect *LineWidths, int skip)
{

 for(fx_vce.frame_counter = 0; fx_vce.frame_counter < 263; fx_vce.frame_counter++)
 {
  uint32 vdc_linebuffer[2][512];
  uint32 bg_linebuffer[512];
  unsigned int width = (fx_vce.picture_mode & 0x08) ? 320 : 256;
  int start = (ClockModeWidths[(fx_vce.picture_mode & 0x08) ? 1 : 0] - width) / 2;
  int end = start + width;

  uint32 *vdc_palette_ptr[2];

  // BG: lower byte of the first palette offset register.
  vdc_palette_ptr[0] = &fx_vce.palette_table_cache[(fx_vce.palette_offset[0] & 0xFF) << 1];
  // SPR: upper byte of the same register.
  vdc_palette_ptr[1] = &fx_vce.palette_table_cache[((fx_vce.palette_offset[0] >> 8) & 0xFF) << 1];

  FXVDC_DoLine(vdc_chips[0], fx_vce.frame_counter, vdc_linebuffer[0], skip);
  FXVDC_DoLine(vdc_chips[1], fx_vce.frame_counter, vdc_linebuffer[1], skip);

  if(fx_vce.frame_counter >= 14 && fx_vce.frame_counter < 262)
  {
   uint32 bg_linebuffers[4][512];
   uint32 *rearranged[4];
   int rearr = 0;

   MDFNGameInfo->DisplayRect.w = width;
   MDFNGameInfo->DisplayRect.x = start;
   LineWidths[fx_vce.frame_counter - 14] = MDFNGameInfo->DisplayRect;

   // Convert both VDC outputs to tagged YUV pixels and merge them into
   // vdc_linebuffer[0]; the second chip wins wherever it is opaque.
   // Bit 8 of a raw VDC pixel marks a sprite pixel, the low 8 bits are the
   // palette index, and a zero low nibble means transparent.
   for(int x = start; x < end; x++)
   {
    uint32 zort[2];

    zort[0] = vdc_linebuffer[0][x];
    zort[1] = vdc_linebuffer[1][x];

    zort[0] = vdc_palette_ptr[((zort[0] >> 8) & 1)][zort[0] & 0xFF] | TTRANS(zort[0] & 0xF) | ((zort[0] & 0x100) ? 5 << 28 : 4 << 28);
    zort[1] = vdc_palette_ptr[((zort[1] >> 8) & 1)][zort[1] & 0xFF] | TTRANS(zort[1] & 0xF) | ((zort[1] & 0x100) ? 5 << 28 : 4 << 28);

    if(!(zort[1] & TRANS_OR))
     zort[0] = zort[1];

    vdc_linebuffer[0][x] = zort[0];
   }

   // Render the four KING backgrounds.  The visible part of every buffer
   // starts out transparent so that a layer DrawBG() does not draw
   // contributes nothing instead of uninitialized data.
   for(int layer = 0; layer < 4; layer++)
   {
    for(int x = start; x < end; x++)
     bg_linebuffers[layer][x] = TRANS_OR | (layer << 28);

    DrawBG(bg_linebuffers[layer], layer);
   }

   /*
        4 = Foremost
        1 = Hindmost
        0 = Hidden
   */
   // TODO: king.priority(3 bits per layer, see above) is not applied yet;
   // every layer is shown, stacked in index order with BG3 foremost.
   for(int layer = 0; layer < 4; layer++)
    rearranged[rearr++] = bg_linebuffers[layer];

   // Fill out the rest with the final layer we had so we don't dereference bad pointers
   if(rearr)
    for(int slot = rearr; slot < 4; slot++)
     rearranged[slot] = rearranged[rearr - 1];

   if(!rearr) // No layers on.
    for(int x = start; x < end; x++)
    {
     bg_linebuffer[x] = TRANS_OR;
    }
   else
    for(int x = start; x < end; x++)
    {
     // Later slots are in front: each opaque pixel replaces what is behind it.
     uint32 replacement = rearranged[0][x];

     if(!(rearranged[1][x] & TRANS_OR))
      replacement = rearranged[1][x];
     if(!(rearranged[2][x] & TRANS_OR))
      replacement = rearranged[2][x];
     if(!(rearranged[3][x] & TRANS_OR))
      replacement = rearranged[3][x];

     bg_linebuffer[x] = replacement;
    }

   // Mix vdc_linebuffer[0] and bg_linebuffer.
   // priority_remap[] maps the layer number in bits 28-31 of a pixel to its
   // FXVCE priority:
   //  0 : BG0, 1: BG1, 2: BG2, 3: BG3
   //  4 : VDC BG, 5: VDC SPR
   uint32 priority_remap[6];

   for(int layer = 0; layer < 4; layer++)
    priority_remap[layer] = (fx_vce.priority[1] >> (layer * 4)) & 0xF;

   priority_remap[4] = (fx_vce.priority[0] & 0xF);
   priority_remap[5] = ((fx_vce.priority[0] >> 4) & 0xF);

   uint32 *target = pXBuf + (MDFNGameInfo->pitch >> 2) * (fx_vce.frame_counter - 14);
   for(int x = start; x < end; x++)
   {
    uint32 bg_pixel = bg_linebuffer[x];
    uint32 vdc_pixel = vdc_linebuffer[0][x];

    uint32 bg_prio = priority_remap[bg_pixel >> 28];
    uint32 vdc_prio = priority_remap[vdc_pixel >> 28];
    uint32 zeout = 0; // Used as-is when both priorities are equal.

    // The higher priority pixel wins unless it is transparent and the other
    // layer is enabled(priority != 0).
    if(bg_prio > vdc_prio)
    {
     if((bg_pixel & TRANS_OR) && vdc_prio)
      zeout = vdc_pixel;
     else
      zeout = bg_pixel;
    }
    else if(vdc_prio > bg_prio)
    {
     if((vdc_pixel & TRANS_OR) && bg_prio)
      zeout = bg_pixel;
     else
      zeout = vdc_pixel;
    }

    target[x] = YUV888_TO_RGB888(zeout);
   }
  }
  v810_run(1365);
 }
}

/* Pixel format hook.

   The whole body is compiled only when MOO is defined; otherwise this
   function does nothing and its arguments are ignored.

   NOTE(review): the disabled code refers to amask, amask_shift,
   systemColorMap32, bw_systemColorMap32 and FixPCache(), none of which are
   defined in this file - presumably left over from another module; confirm
   before enabling it.
*/
void KING_SetPixelFormat(int rshift, int gshift, int bshift)
{
 #ifdef MOO
 int x;
 int used[4] = {0, 0, 0, 0};

 // Mark the byte lanes taken by R, G and B; the remaining lane is used for amask.
 used[rshift >> 3] = 1;
 used[gshift >> 3] = 1;
 used[bshift >> 3] = 1;
 for(x = 0; x < 4; x++)
  if(!used[x])
  {
   amask = 1 << (x << 3);
   amask_shift = (x << 3);
  }
 // Build a 512-entry colour map from 9-bit indices: G in bits 8-6, R in bits 5-3, B in bits 2-0,
 // each 3-bit channel scaled by 36, plus a greyscale variant.
 for(x=0;x<512;x++)
 {
  int b = (x & 0x007);
  int r = (x & 0x038) >> 3;
  int g = (x & 0x1c0) >> 6;

  systemColorMap32[x] = ((r * 36) << rshift) + ((g * 36) << gshift) + ((b * 36) << bshift);

  int lum = (int)((r + g + b*.25) * 36 / 2.25);
  bw_systemColorMap32[x] = (lum << rshift) + (lum << gshift) + (lum << bshift);
 }

 // I know the temptation is there, but don't combine these two loops just
 // because they loop 512 times ;)
 for(x = 0; x < 512; x++)
  FixPCache(x);
#endif
}

