/* Mednafen - Multi-system Emulator
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

/* VDC and VCE emulation */

/*
"Tonight I hooked up my Turbo Duo(with no games or CDs in it)'s video output to my PC sound card, recorded it, 
and did a FFT and looked at the spectrum(around the line rate, 15-16KHz), and I also counted the number 
of samples between the ~60Hz peaks(just to verify that the math shown below is remotely accurate).

The spectrum peaked at 15734 Hz.  21477272.727272... / 3 / 15734 = 455.00(CPU cycles per scanline)"
*/

#include <math.h>
#include "pce.h"
#include "../video.h"
#include "vdc.h"
#include "huc.h"
#include "adpcm.h"

// Host-pixel lookup tables indexed by a 9-bit VCE colour word: one for normal
// colour output and one for black-and-white output.  Both are filled in by
// VDC_SetPixelFormat().
static uint32 systemColorMap32[512], bw_systemColorMap32[512];
static uint32 amask;    // Alpha channel mask; OR'd into cached "colour 0" entries and used by the sprite code as a transparency marker
static uint32 amask_shift; // Bit position of amask (set alongside it in VDC_SetPixelFormat())
static uint32 userle; // User layer enable; one bit per layer, flipped by VDC_ToggleLayer().
static bool unlimited_sprites; // When set, sprite evaluation keeps going past 16 entries per line.

// Non-zero makes VDC_SetPixelFormat() build reduced-precision colour maps
// (red/blue components shifted right by 2, green by 1).
int g_use16 = 0 ;

// Layer bits, presumably for use with userle -- TODO confirm against callers.
#define ULE_BG0		1
#define ULE_SPR0	2
#define ULE_BG1		4
#define ULE_SPR1	8

// State of the VCE (colour encoder): control bits, the 512-entry palette and
// a cache of the palette already converted to host pixel values.
typedef struct
{
	bool8 lc263;	// 263 line count if set, 262 if not
	bool8 bw;	// Black and White
	uint8 dot_clock; // Dot Clock(5, 7, or 10 MHz = 0, 1, 2)
	uint16 color_table[0x200];	// Raw 9-bit palette entries as written through VCE_Write()
	uint32 color_table_cache[0x200];	// Palette converted to host pixels; maintained by FixPCache()
	uint16 ctaddress;	// Current palette address; only the low 9 bits are used when indexing
} vce_t;

// The single VCE instance.
static vce_t vce;

/* Refreshes the host-pixel cache for one palette entry.
 *
 * entry: palette index (0..0x1FF) whose raw value in vce.color_table changed.
 *
 * Entries 0x000 and 0x100 are special: their colour is copied, with amask
 * OR'd in, into slot 0 of each of the 16 sub-palettes of the same 256-entry
 * half.  Any other entry whose low nibble is zero is left untouched, since
 * those slots mirror the entry handled above.
 */
static INLINE void FixPCache(int entry)
{
 uint32 *colormap;

 if(vce.bw)
  colormap = bw_systemColorMap32;
 else
  colormap = systemColorMap32;

 if((entry & 0xFF) == 0)
 {
  const int half = entry & 0x100;

  for(int pal = 0; pal < 16; pal++)
   vce.color_table_cache[half + (pal << 4)] = colormap[vce.color_table[half]] | amask;
 }

 if((entry & 0xF) != 0)
  vce.color_table_cache[entry] = colormap[vce.color_table[entry]];
}

// Complete state of one VDC chip: programmable registers, VRAM, the sprite
// attribute table, a decoded background tile cache and DMA bookkeeping.
typedef struct
{
	uint32 display_counter;

	int32 sat_dma_slcounter;

	uint8 select;	// Currently selected register index (written through port 0 in VDC_Write())
	uint16 MAWR;	// Memory Address Write Register
	uint16 MARR;	// Memory Address Read Register
	uint16 CR;	// Control Register
	uint16 RCR;	// Raster Compare Register
	uint16 BXR;	// Background X-Scroll Register
	uint16 BYR;	// Background Y-Scroll Register
	uint16 MWR;	// Memory Width Register

	uint16 HSR;	// Horizontal Sync Register
	uint16 HDR;	// Horizontal Display Register
	uint16 VSR;	// Source of the M_vdc_VSW / M_vdc_VDS fields
	uint16 VDR;	// Source of the M_vdc_VDW field

	uint16 VCR;	// Source of the M_vdc_VCR field
	uint16 DCR;	// DMA control; bits 0x02/0x04/0x08 are consulted by DoDMA()
	uint16 SOUR;	// VRAM-to-VRAM DMA source address
	uint16 DESR;	// VRAM-to-VRAM DMA destination address
	uint16 LENR;	// VRAM-to-VRAM DMA length counter
	uint16 SATB;	// Written through register 0x13; sets SATBPending

	uint8 sgx_priority;

	uint32 RCRCount;	// Line counter compared against sprite Y positions in the sprite code

	uint16 read_buffer;	// Word returned by reads of the data port
	uint8 write_latch;	// LSB of a pending VRAM write, held until the MSB is written
	uint8 status;	// VDCS_* status/interrupt bits

	uint16 SAT[0x100];	// Sprite attribute table: 64 sprites x 4 words
	uint16 VRAM[32768];
	uint8 bg_tile_cache[2048][8][8] ; //__attribute__ ((aligned (8))); // Tile, y, x
	
	uint16 DMAReadBuffer;	// Word read by the last DMA read step
	bool8 DMAReadWrite;	// 0: next DMA step is a read, 1: next step is a write
	bool8 DMARunning;
	bool8 SATBPending;
	bool8 burst_mode;

	uint32 BG_YOffset;	// Reloaded from BYR at start of display area?
	uint32 BG_XOffset;	// Reloaded from BXR at each scanline, methinks.
} vdc_t;

// Two chips are kept so that the second one can be addressed when sgfx is set.
static vdc_t vdc_chips[2];

// Chip that the register handlers and drawing routines currently operate on.
static vdc_t *vdc;

// State of the VPC registers that are reachable at offsets 0x8-0xE of the
// register window when sgfx is set (see VDC_Read() / VDC_Write()).
typedef struct
{
	uint8 priority[2];	// Registers 0x8 and 0x9
	uint16 winwidths[2];	// Registers 0xA/0xB and 0xC/0xD; 10 bits each
	uint8 st_mode;	// Register 0xE bit 0; selects which chip VDC_Write_ST() targets
} vpc_t;

static vpc_t vpc;

// Non-zero enables the second VDC and the VPC register decoding.
static int sgfx = 0;

// Some virtual vdc macros to make code simpler to read
// (all of them read through the current-chip pointer `vdc`).
#define M_vdc_HSW	(vdc->HSR & 0x1F)	// Horizontal Synchro Width
#define M_vdc_HDS	((vdc->HSR >> 8) & 0x7F) // Horizontal Display Start
#define M_vdc_HDW	(vdc->HDR & 0x7F)	// Horizontal Display Width
#define M_vdc_HDE	((vdc->HDR >> 8) & 0x7F) // Horizontal Display End

#define M_vdc_VSW	(vdc->VSR & 0x1F)	// Vertical synchro width
#define M_vdc_VDS	((vdc->VSR >> 8) & 0xFF) // Vertical Display Start
#define M_vdc_VDW	(vdc->VDR & 0x1FF)	// Vertical Display Width(Height? :b)
#define M_vdc_VCR	(vdc->VCR & 0xFF)

// Amount added to MAWR/MARR after a data-port access, indexed by CR bits 11-12.
static const unsigned int vram_inc_tab[4] = { 1, 32, 64, 128 };

// Bits of vdc_t::status.
#define VDCS_CR		0x01 // Sprite #0 collision interrupt occurred
#define VDCS_OR		0x02 // sprite overflow "" ""
#define VDCS_RR		0x04 // RCR             ""  ""
#define VDCS_DS		0x08 // VRAM to SAT DMA completion interrupt occurred
#define VDCS_DV		0x10 // VRAM to VRAM DMA completion interrupt occurred
#define VDCS_VD		0x20 // Vertical blank interrupt occurred
#define VDCS_BSY	0x40 // VDC is waiting for a CPU access slot during the active display area??

/* Re-decodes one row of a background tile after a VRAM word changed.
 *
 * A: VRAM word address that was written (callers pass it masked to 0x7FFF).
 *
 * The tile number is A >> 4 and the row is A & 7.  The row's two plane words
 * (at row and row + 8 inside the tile's 16-word block) are combined into
 * eight 4-bit pixel values and stored in bg_tile_cache with the leftmost
 * pixel at index 0.
 */
static INLINE void FixTileCache(uint16 A)
{
 const uint32 tile = (A >> 4);
 const uint32 row = (A & 0x7);
 const uint32 base = row + tile * 16;

 const uint32 planes_lo = vdc->VRAM[base];
 const uint32 planes_hi = vdc->VRAM[base + 8];
 uint8 *dest = vdc->bg_tile_cache[tile][row];

 for(int col = 7; col >= 0; col--)
 {
  const int bit = 7 - col;
  uint32 pixel;

  pixel  = (planes_lo >> bit) & 1;
  pixel |= ((planes_lo >> (bit + 8)) & 1) << 1;
  pixel |= ((planes_hi >> bit) & 1) << 2;
  pixel |= ((planes_hi >> (bit + 8)) & 1) << 3;

  dest[col] = pixel;
 }
}

/* Rebuilds the colour lookup tables for a new host pixel layout.
 *
 * rshift, gshift, bshift: bit positions of the red, green and blue
 * components in a host pixel.
 *
 * Fills systemColorMap32 and bw_systemColorMap32 for all 512 VCE colour
 * words, then refreshes the whole palette cache.  When g_use16 is non-zero
 * the components are reduced in precision (red/blue >> 2, green >> 1) before
 * being shifted into place.
 */
void VDC_SetPixelFormat(int rshift, int gshift, int bshift)
{
 int used[4] = {0, 0, 0, 0};

 used[rshift >> 3] = 1;
 used[gshift >> 3] = 1;
 used[bshift >> 3] = 1;

 // Whenever some byte lane is free, the alpha mask is placed at bit 24
 // (the per-lane position is intentionally not used).
 for(int lane = 0; lane < 4; lane++)
 {
  if(used[lane])
   continue;

  amask = 1 << 24 ; //(x << 3);
  amask_shift = 24 ; //(x << 3);
 }

 for(int c = 0; c < 512; c++)
 {
  const int b = (c & 0x007);
  const int r = (c & 0x038) >> 3;
  const int g = (c & 0x1c0) >> 6;
  const int lum = (int)((r + g + b*.25) * 36 / 2.25);

  if ( g_use16 )
  {
   systemColorMap32[c] = ( ((r * 36)>>2) << rshift) + (((g * 36)>>1) << gshift) + (((b * 36)>>2) << bshift);
   bw_systemColorMap32[c] = ((lum>>2) << rshift) + ((lum>>1) << gshift) + ((lum>>2) << bshift);
  }
  else
  {
   systemColorMap32[c] = ((r * 36) << rshift) + ((g * 36) << gshift) + ((b * 36) << bshift);
   bw_systemColorMap32[c] = (lum << rshift) + (lum << gshift) + (lum << bshift);
  }
 }

 // Must run after both maps are completely filled: FixPCache() reads them.
 for(int entry = 0; entry < 512; entry++)
  FixPCache(entry);
}

/* Handles a CPU read from the VCE register window.
 *
 * Register 4 returns the current palette entry (truncated to the handler's
 * return type).  Register 5 returns bit 8 of the entry in bit 0 with all
 * other bits set, and then advances the palette address.  Every other
 * register reads back as 0xFF.
 */
DECLFR(VCE_Read)
{
 if((A & 0x7) == 4)
  return(vce.color_table[vce.ctaddress & 0x1FF]);

 if((A & 0x7) == 5)
 {
  uint8 high = vce.color_table[vce.ctaddress & 0x1FF] >> 8;

  high = (high & 1) | 0xFE;
  vce.ctaddress++;
  return(high);
 }

 return(0xFF);
}

/* Handles a CPU write to the VCE register window.
 *
 * Register 0: control (bit 7 = black & white, bit 2 = 263-line mode,
 *             bits 0-1 = dot clock; bit 1 forces dot clock 2).
 * Register 2: low 8 bits of the palette address.
 * Register 3: bit 8 of the palette address.
 * Register 4: low 8 bits of the current palette entry.
 * Register 5: bit 8 of the current palette entry; then the palette address
 *             is advanced.
 *
 * Palette writes refresh the matching cache entry; a change of the black &
 * white bit refreshes the whole cache.
 */
DECLFW(VCE_Write)
{
 //printf("%04x %02x, %04x\n", A, V, HuCPU.PC);
 switch(A&0x7)
 {
  case 0:
  {
   // Store the flag as 0/1 so that it compares equal to the value tested
   // here on the next write.  Storing the raw 0x80 (as was done before)
   // makes the comparison mismatch on every write while B&W is enabled if
   // bool8 is a plain byte type, needlessly rebuilding all 512 entries.
   const int new_bw = (V & 0x80) >> 7;

   if(new_bw != vce.bw)
   {
    vce.bw = new_bw;
    for(int x = 0; x < 512; x++)
     FixPCache(x);
   }
   vce.lc263 = (V & 0x04);
   vce.dot_clock = V & 1;
   if(V & 2)
    vce.dot_clock = 2;
   break;
  }
  case 2: vce.ctaddress &= 0x100; vce.ctaddress |= V; break;
  case 3: vce.ctaddress &= 0x0FF; vce.ctaddress |= (V & 1) << 8; break;
  case 4: vce.color_table[vce.ctaddress & 0x1FF] &= 0x100;
          vce.color_table[vce.ctaddress & 0x1FF] |= V;
          FixPCache(vce.ctaddress & 0x1FF);
          break;
  case 5: vce.color_table[vce.ctaddress & 0x1FF] &= 0xFF;
          vce.color_table[vce.ctaddress & 0x1FF] |= (V & 1) << 8;
          FixPCache(vce.ctaddress & 0x1FF);
          vce.ctaddress++;
          break;
 }
}


bool VDC_ToggleLayer(int which)
{
 userle ^= 1 << which;
 return((userle >> which) & 1);
}

// REGSETP: replaces one byte of the 16-bit register _reg with _data; the
// upper byte when _msb is non-zero, the lower byte otherwise.
// NOTE(review): the arguments are expanded without parentheses and _reg is
// evaluated twice, so only simple lvalues/values should be passed.
#define REGSETP(_reg, _data, _msb) { _reg &= 0xFF << (_msb ? 0 : 8); _reg |= _data << (_msb ? 8 : 0); }
// REGGETP: yields the upper (_msb non-zero) or lower byte of _reg.
#define REGGETP(_reg, _msb) ((_reg >> (_msb ? 8 : 0)) & 0xFF)

/* Handles a CPU read from the VDC register window.
 *
 * With sgfx set, offsets 0x8-0xE return VPC registers, every other offset
 * with bit 3 set returns 0, and bit 4 of the offset selects which VDC chip
 * the rest of the access applies to.
 *
 * Port 0 returns the status byte, clears its low six bits and releases the
 * IRQ1 line (with sgfx set, only once neither chip has such bits pending).
 * Ports 2/3 return the low/high byte of the read buffer; reading the high
 * byte while register 2 is selected advances MARR and refills the buffer.
 * A read of port 1 returns 0x40 when HuCPU.isopread is set, otherwise 0.
 */
DECLFR(VDC_Read)
{
 const int msb = A & 1;
 uint8 ret = 0;

 if(sgfx)
 {
  A &= 0x1F;

  if(A == 0x8) return(vpc.priority[0]);
  if(A == 0x9) return(vpc.priority[1]);
  if(A == 0xA) return(vpc.winwidths[0]);
  if(A == 0xB) return(vpc.winwidths[0] >> 8);
  if(A == 0xC) return(vpc.winwidths[1]);
  if(A == 0xD) return(vpc.winwidths[1] >> 8);
  if(A & 0x8) return(0);	// Covers 0xE as well as the remaining 0x8-0xF / 0x18-0x1F offsets

  vdc = &vdc_chips[(A & 0x10) >> 4];
 }

 A &= 0x3;

 if(A == 0x0)
 {
  ret = vdc->status;
  vdc->status &= ~0x3F;

  // Clear VDC IRQ line; with two chips, only when both are idle.
  if(!sgfx || (!(vdc_chips[0].status & 0x3F) && !(vdc_chips[1].status & 0x3F)))
   HuC6280_IRQEnd(MDFN_IQIRQ1);
 }
 else if(A == 0x2 || A == 0x3)
 {
  ret = REGGETP(vdc->read_buffer, msb);

  // VRR - VRAM Read Register: the MSB read advances the read address.
  if(vdc->select == 0x2 && msb)
  {
   vdc->MARR += vram_inc_tab[(vdc->CR >> 11) & 0x3];
   vdc->read_buffer = vdc->VRAM[vdc->MARR & 0x7FFF];
  }
 }

 if(HuCPU.isopread && A == 0x1)
  ret = 0x40;

 return(ret);
}

/* Write handler that forwards to VDC_Write(); when sgfx is set and
 * vpc.st_mode is non-zero, bit 4 of the address is set first so that the
 * write is routed to the second VDC chip.
 */
DECLFW(VDC_Write_ST)
{
 if(sgfx && vpc.st_mode)
  A |= 0x10;

 VDC_Write(A, V);
}

/* Runs up to 455 steps of the current chip's VRAM-to-VRAM DMA.
 *
 * Steps alternate between reading a word from SOUR and writing it to DESR
 * (one step each).  After every write both addresses move by +1 or -1
 * depending on DCR bits 0x08 (source) and 0x04 (destination), and LENR is
 * decremented.  When LENR wraps to 0xFFFF the transfer ends: DMARunning is
 * cleared and, if DCR bit 0x02 is set, VDCS_DV is raised together with IRQ1.
 */
static void DoDMA(void)
{
 // Assuming one cycle for reads, one cycle for write, with DMA?
 for(int step = 0; step < 455; step++)
 {
  if(vdc->DMAReadWrite)
  {
   const int dest_delta = (vdc->DCR & 0x4) ? -1 : 1;
   const int src_delta = (vdc->DCR & 0x8) ? -1 : 1;

   vdc->VRAM[vdc->DESR & 0x7FFF] = vdc->DMAReadBuffer;
   FixTileCache(vdc->DESR & 0x7FFF);

   vdc->DESR += dest_delta;
   vdc->SOUR += src_delta;
   vdc->LENR--;

   if(vdc->LENR == 0xFFFF)  // DMA is done.
   {
    vdc->DMARunning = 0;
    if(vdc->DCR & 0x02)
    {
     vdc->status |= VDCS_DV;
     HuC6280_IRQBegin(MDFN_IQIRQ1);
    }
    break;	// Leaves DMAReadWrite untouched, exactly as the phase flag was.
   }
  }
  else
  {
   vdc->DMAReadBuffer = vdc->VRAM[vdc->SOUR & 0x7FFF];
  }

  vdc->DMAReadWrite ^= 1;
 }
}

/* Handles a CPU write to the VDC register window.
 *
 * With sgfx set, offsets 0x8-0xE update VPC registers, every offset with
 * bit 3 set is consumed here, and bit 4 of the offset selects which VDC chip
 * the rest of the access applies to.
 *
 * Port 0 selects a register; ports 2/3 write the low/high byte of the
 * selected register.  Notable side effects:
 *  - register 0x01 (MARR), MSB write: the read buffer is refilled from VRAM;
 *  - register 0x02 (VRAM data), MSB write: the latched LSB and this MSB are
 *    stored at MAWR (only when MAWR < 0x8000), the tile cache row is
 *    refreshed, and MAWR is advanced;
 *  - register 0x08 (BYR): BG_YOffset is reloaded on every byte write;
 *  - register 0x12 (LENR), MSB write: a DMA is started, and one line's worth
 *    is run immediately when in burst mode with the DMA IRQ disabled;
 *  - register 0x13 (SATB): marks a SAT transfer as pending.
 */
DECLFW(VDC_Write)
{
 int msb = A & 1;

 //printf("%04x, %02x\n", A, V);
 //int chip = A & 0x10;
 if(sgfx)
 {
  A &= 0x1F;
  switch(A)
  {
   case 0x8: vpc.priority[0] = V; break;
   case 0x9: vpc.priority[1] = V; break;
   case 0xA: vpc.winwidths[0] &= 0x300; vpc.winwidths[0] |= V; break;
   case 0xB: vpc.winwidths[0] &= 0x0FF; vpc.winwidths[0] |= (V & 3) << 8; break;
   case 0xC: vpc.winwidths[1] &= 0x300; vpc.winwidths[1] |= V; break;
   case 0xD: vpc.winwidths[1] &= 0x0FF; vpc.winwidths[1] |= (V & 3) << 8; break;
   case 0xE: vpc.st_mode = V & 1; break;
  }
  if(A & 0x8) return;

  vdc = &vdc_chips[(A & 0x10) >> 4];
  A &= 0x3;
 }
 else
  A &= 0x3;

 //printf("%04x: %02x\n", A, V);
 switch(A)
 {
  case 0x0: vdc->select = V & 0x1F; break;
  case 0x2:
  case 0x3:
   switch(vdc->select & 0x1F)
   {
    case 0x00: REGSETP(vdc->MAWR, V, msb); break;
    case 0x01: REGSETP(vdc->MARR, V, msb);
               if(msb)
               {
                // MARR is a full 16-bit value but VRAM holds only 0x8000
                // words; mask the index exactly as VDC_Read() does so that
                // addresses >= 0x8000 cannot read past the array.
                vdc->read_buffer = vdc->VRAM[vdc->MARR & 0x7FFF];
               }
               break;
    case 0x02: if(!msb) vdc->write_latch = V;
               else
               {
                if(vdc->MAWR < 0x8000)
                {
                 vdc->VRAM[vdc->MAWR & 0x7fff] = (V << 8) | vdc->write_latch;
                 FixTileCache(vdc->MAWR & 0x7FFF);
                }
                vdc->MAWR += vram_inc_tab[(vdc->CR >> 11) & 0x3];
               }
               break;
    case 0x05: REGSETP(vdc->CR, V, msb); break;
    case 0x06: REGSETP(vdc->RCR, V, msb); vdc->RCR &= 0x3FF; break;
    case 0x07: REGSETP(vdc->BXR, V, msb);  /* printf("BXR: %d\n", HuCPU.timestamp); */ break;
    case 0x08: REGSETP(vdc->BYR, V, msb);
               vdc->BG_YOffset = vdc->BYR; // Set it on LSB and MSB writes(only changing on MSB breaks Youkai Douchuuki)
               //printf("%04x\n", HuCPU.PC);
               break;
    case 0x09: REGSETP(vdc->MWR, V, msb); break;
    case 0x0a: REGSETP(vdc->HSR, V, msb); break;
    case 0x0b: REGSETP(vdc->HDR, V, msb); break;
    case 0x0c: REGSETP(vdc->VSR, V, msb); break;
    case 0x0d: REGSETP(vdc->VDR, V, msb); break;
    case 0x0e: REGSETP(vdc->VCR, V, msb); break;
    case 0x0f: REGSETP(vdc->DCR, V, msb); break;
    case 0x10: REGSETP(vdc->SOUR, V, msb); /*printf("SOUR: %04x\n", vdc->SOUR); */ break;
    case 0x11: REGSETP(vdc->DESR, V, msb); /*printf("DESR: %04x\n", vdc->DESR); */ break;
    case 0x12: REGSETP(vdc->LENR, V, msb); /*printf("LENR: %04x, %d\n", vdc->LENR, HuCPU.timestamp); */
               if(msb)
               {
                vdc->DMARunning = 1;
                vdc->DMAReadWrite = 0;
                if(vdc->burst_mode && !(vdc->DCR & 0x02))
                 DoDMA();	// Do one line's worth of DMA transfers
                		// because Cosmic Fantasy 4 is evil
                		// and uses timed writes to the DMA
                		// start register, rather than waiting until
                		// the machine says we're done,
                		// which would require cycle-accurate VDC emulation...like that's
                		// going to happen when I don't even have accurate values
                		// for HuC6280 instruction timings. :b
               }
               break;
    case 0x13: REGSETP(vdc->SATB, V, msb); vdc->SATBPending = 1; break;
//    default: printf("Oops 2: %04x %02x\n", vdc->select, V);break;
   }
   break;
 }
}

// Background attribute table (BAT) geometry, in tiles.
// Width (and its log2) are indexed by MWR bits 4-5, height by MWR bit 6.
static const unsigned int bat_width_tab[4] = { 32, 64, 128, 128 };
static const unsigned int bat_width_shift_tab[4] = { 5, 6, 7, 7 };
static const unsigned int bat_height_tab[2] = { 32, 64 };
// Output line width in pixels for each vce.dot_clock setting (0, 1, 2).
static const unsigned int ClockModeWidths[3] = { 288, 384, 576 };

/* Renders the background layer of the current scanline for the current chip.
 *
 * target:  32-bit line buffer, indexed by output pixel column.
 * enabled: when zero, the display region is filled with a fixed marker
 *          colour (with amask set) and nothing else is drawn.
 *
 * The display region is (HDW + 1) * 8 pixels wide, clamped to the line width
 * of the current dot clock and centred in it (plus a fixed shift for two
 * specific register setups).  If the background is switched off in CR
 * (bit 0x80 clear) the whole line is filled with a single palette colour.
 * Otherwise the partially visible first tile is drawn pixel by pixel, the
 * remaining tiles eight pixels at a time, and finally the areas left and
 * right of the display region are overwritten with palette entry 0x100.
 *
 * Side effect: vdc->BG_XOffset is advanced while drawing.
 */
static void DrawBG(uint32 *target, int enabled)
{
 unsigned int width = (M_vdc_HDW + 1) * 8;

 if(width > ClockModeWidths[vce.dot_clock])
  width = ClockModeWidths[vce.dot_clock];

 int start = (ClockModeWidths[vce.dot_clock] - width) / 2;
 int end = start + width;
 int bat_width = bat_width_tab[(vdc->MWR >> 4) & 3];
 int bat_width_mask = bat_width - 1;
 int bat_width_shift = bat_width_shift_tab[(vdc->MWR >> 4) & 3];
 int bat_height_mask = bat_height_tab[(vdc->MWR >> 6) & 1] - 1;


 // Pseudo-hack for Asuka 120%'s odd video timings
 if(vce.dot_clock == 1 && M_vdc_HDS == 5 && M_vdc_HDE == 6 && M_vdc_HDW == 43 && M_vdc_HSW == 2)
  start += 8;
 else if(vce.dot_clock == 0 && M_vdc_HDS == 2 && M_vdc_HDE == 3 && M_vdc_HDW == 33 && M_vdc_HSW == 2)
  start += 4;
 //printf("%d %d\n", vdc->BG_XOffset, vdc->BG_YOffset);

 // Layer hidden by the caller: fill with a marker colour flagged as transparent.
 if(!enabled)
 {
  for(int x = start; x < end; x++)
   target[x] = MK_COLOR(0x00,0xFE,0x00) | amask;
  return;
 }

 if(!(vdc->CR & 0x80)) // BG is disabled
 {
  uint32 color;

  // CR bit 0x40 selects between palette entry 0x100 and 0x000 as the fill colour.
  if(vdc->CR & 0x40)
   color = vce.color_table_cache[0x100];
  else
   color = vce.color_table_cache[0x000];

  MDFN_FastU32MemsetM8(target, color, ClockModeWidths[vce.dot_clock]);
  for(int x = start; x < end; x++)
   target[x] = color;
  return;
 }

 uint32 overscan_color = vce.color_table_cache[0x100];

 //if(enabled)
 {
  // Offset of the current tile row inside the BAT.
  int bat_y = ((vdc->BG_YOffset >> 3) & bat_height_mask) << bat_width_shift;
  // Column at which the first (possibly partial) tile ends.
  int first_end = start + 8 - (vdc->BG_XOffset & 7);

  // Clear the left overscan area
  MDFN_FastU32MemsetM8(target, overscan_color, 50); //(start + 1) &~1);

  // Partial first tile, one pixel at a time.
  for(int x = start; x < first_end; x++)
  {
   int bat_x = (vdc->BG_XOffset >> 3) & bat_width_mask;

   uint16 bat = vdc->VRAM[bat_x | bat_y];
   int palette_index = ((bat >> 12) & 0x0F) << 4;
   uint32 raw_pixel;

   raw_pixel = vdc->bg_tile_cache[bat & 0x7FF][vdc->BG_YOffset & 7][vdc->BG_XOffset & 0x7];
   target[x] = vce.color_table_cache[palette_index | raw_pixel];

   vdc->BG_XOffset++;
  }

  int bat_boom = (vdc->BG_XOffset >> 3) & bat_width_mask;
  int line_sub = vdc->BG_YOffset & 7;
  for(int x = first_end; x < end; x+=8) // This will draw past the right side of the buffer, but since our pitch is 1024, and max width is ~512, we're safe.  Also,
					// any overflow that is on the visible screen area will be hidden by the overscan color code below this code.
  {
   uint16 bat = vdc->VRAM[bat_boom | bat_y];
   uint32 *lut = &vce.color_table_cache[((bat >> 8) & 0xF0)];
   uint8 *pix_lut = vdc->bg_tile_cache[bat & 0x7FF][line_sub];

   // Little-endian hosts fetch the eight cached pixel bytes as one or two
   // wide loads; other hosts read them byte by byte.
   #ifdef LSB_FIRST
    #if SIZEOF_LONG == 8
    uint64 doh = *(uint64 *)pix_lut;

    (target + 0)[x] = lut[doh & 0xFF];
    doh >>= 8;
    (target + 1)[x] = lut[doh & 0xFF];
    doh >>= 8;
    (target + 2)[x] = lut[doh & 0xFF];
    doh >>= 8;
    (target + 3)[x] = lut[doh & 0xFF];
    doh >>= 8;
    (target + 4)[x] = lut[doh & 0xFF];
    doh >>= 8;
    (target + 5)[x] = lut[doh & 0xFF];
    doh >>= 8;
    (target + 6)[x] = lut[doh & 0xFF];
    doh >>= 8;
    (target + 7)[x] = lut[doh];
    #else
    uint32 doh = *(uint32 *)pix_lut;
    (target + 0)[x] = lut[doh & 0xFF];
    doh >>= 8;
    (target + 1)[x] = lut[doh & 0xFF];
    doh >>= 8;
    (target + 2)[x] = lut[doh & 0xFF];
    doh >>= 8;
    (target + 3)[x] = lut[doh];
    doh = *(uint32 *)(pix_lut + 4);
    (target + 4)[x] = lut[doh & 0xFF];
    doh >>= 8;
    (target + 5)[x] = lut[doh & 0xFF];
    doh >>= 8;
    (target + 6)[x] = lut[doh & 0xFF];
    doh >>= 8;
    (target + 7)[x] = lut[doh];
    #endif
   #else
   (target + 0)[x] = lut[pix_lut[0]];
   (target + 1)[x] = lut[pix_lut[1]];
   (target + 2)[x] = lut[pix_lut[2]];
   (target + 3)[x] = lut[pix_lut[3]];
   (target + 4)[x] = lut[pix_lut[4]];
   (target + 5)[x] = lut[pix_lut[5]];
   (target + 6)[x] = lut[pix_lut[6]];
   (target + 7)[x] = lut[pix_lut[7]];
   #endif
   bat_boom = (bat_boom + 1) & bat_width_mask;
   vdc->BG_XOffset++;
  }
  // Repaint both overscan borders, hiding any tile pixels drawn outside the display region.
  MDFN_FastU32MemsetM8(target, overscan_color, (start + 1) & ~1);
  {
   uint32 end_begin = ((end + 1) & ~ 1);
   MDFN_FastU32MemsetM8(target + end_begin, overscan_color, ClockModeWidths[vce.dot_clock] - end_begin);
  }
 }

 //else
 //{
 // uint32 overscan_color = vce.color_table_cache[0x100];
 // MDFN_FastU32MemsetM8(target, overscan_color, (end + 1) & ~1);
 //}
}

/* 16-bit-target variant of DrawBG(): renders the background layer of the
 * current scanline for the current chip.
 *
 * target:  16-bit line buffer, indexed by output pixel column.
 * enabled: when zero, the display region is filled with a fixed marker
 *          colour and nothing else is drawn.
 *
 * The control flow is the same as in DrawBG(): compute the centred display
 * region, handle the "layer hidden" and "BG disabled in CR" cases with plain
 * fills, otherwise draw the partial first tile, then whole tiles, then
 * repaint the overscan borders with palette entry 0x100.
 *
 * Side effect: vdc->BG_XOffset is advanced while drawing.
 *
 * NOTE(review): values taken from vce.color_table_cache (uint32) and the
 * `| amask` marker are stored into uint16 pixels here, so any bits above
 * bit 15 are dropped -- confirm this is intended for the 16-bit path.
 */
static void DrawBG16(uint16 *target, int enabled)
{
 unsigned int width = (M_vdc_HDW + 1) * 8;

 if(width > ClockModeWidths[vce.dot_clock])
  width = ClockModeWidths[vce.dot_clock];

 int start = (ClockModeWidths[vce.dot_clock] - width) / 2;
 int end = start + width;
 int bat_width = bat_width_tab[(vdc->MWR >> 4) & 3];
 int bat_width_mask = bat_width - 1;
 int bat_width_shift = bat_width_shift_tab[(vdc->MWR >> 4) & 3];
 int bat_height_mask = bat_height_tab[(vdc->MWR >> 6) & 1] - 1;


 // Pseudo-hack for Asuka 120%'s odd video timings
 if(vce.dot_clock == 1 && M_vdc_HDS == 5 && M_vdc_HDE == 6 && M_vdc_HDW == 43 && M_vdc_HSW == 2)
  start += 8;
 else if(vce.dot_clock == 0 && M_vdc_HDS == 2 && M_vdc_HDE == 3 && M_vdc_HDW == 33 && M_vdc_HSW == 2)
  start += 4;
 //printf("%d %d\n", vdc->BG_XOffset, vdc->BG_YOffset);

 // Layer hidden by the caller: fill the display region with a marker colour.
 if(!enabled)
 {
  for(int x = start; x < end; x++)
   target[x] = MK_COLOR16(0x00,0xFE,0x00) | amask;
  return;
 }

 if(!(vdc->CR & 0x80)) // BG is disabled
 {
  uint16 color;

  // CR bit 0x40 selects between palette entry 0x100 and 0x000 as the fill colour.
  if(vdc->CR & 0x40)
   color = vce.color_table_cache[0x100];
  else
   color = vce.color_table_cache[0x000];

  MDFN_FastU32MemsetM8_16(target, color, ClockModeWidths[vce.dot_clock]);
  for(int x = start; x < end; x++)
   target[x] = color;
  return;
 }

 uint16 overscan_color = vce.color_table_cache[0x100];

 //if(enabled)
 {
  // Offset of the current tile row inside the BAT.
  int bat_y = ((vdc->BG_YOffset >> 3) & bat_height_mask) << bat_width_shift;
  // Column at which the first (possibly partial) tile ends.
  int first_end = start + 8 - (vdc->BG_XOffset & 7);

  // Clear the left overscan area
  MDFN_FastU32MemsetM8_16(target, overscan_color, 50); //(start + 1) &~1);

  // Partial first tile, one pixel at a time.
  for(int x = start; x < first_end; x++)
  {
   int bat_x = (vdc->BG_XOffset >> 3) & bat_width_mask;

   uint16 bat = vdc->VRAM[bat_x | bat_y];
   int palette_index = ((bat >> 12) & 0x0F) << 4;
   uint32 raw_pixel;

   raw_pixel = vdc->bg_tile_cache[bat & 0x7FF][vdc->BG_YOffset & 7][vdc->BG_XOffset & 0x7];
   target[x] = vce.color_table_cache[palette_index | raw_pixel];

   vdc->BG_XOffset++;
  }

  int bat_boom = (vdc->BG_XOffset >> 3) & bat_width_mask;
  int line_sub = vdc->BG_YOffset & 7;
  for(int x = first_end; x < end; x+=8) // This will draw past the right side of the buffer, but since our pitch is 1024, and max width is ~512, we're safe.  Also,
					// any overflow that is on the visible screen area will be hidden by the overscan color code below this code.
  {
   uint16 bat = vdc->VRAM[bat_boom | bat_y];
   uint32 *lut = &vce.color_table_cache[((bat >> 8) & 0xF0)];
   uint8 *pix_lut = vdc->bg_tile_cache[bat & 0x7FF][line_sub];

   // Little-endian hosts fetch the eight cached pixel bytes as one or two
   // wide loads; other hosts read them byte by byte.
   #ifdef LSB_FIRST
    #if SIZEOF_LONG == 8
    uint64 doh = *(uint64 *)pix_lut;

    (target + 0)[x] = lut[doh & 0xFF];
    doh >>= 8;
    (target + 1)[x] = lut[doh & 0xFF];
    doh >>= 8;
    (target + 2)[x] = lut[doh & 0xFF];
    doh >>= 8;
    (target + 3)[x] = lut[doh & 0xFF];
    doh >>= 8;
    (target + 4)[x] = lut[doh & 0xFF];
    doh >>= 8;
    (target + 5)[x] = lut[doh & 0xFF];
    doh >>= 8;
    (target + 6)[x] = lut[doh & 0xFF];
    doh >>= 8;
    (target + 7)[x] = lut[doh];
    #else
    uint32 doh = *(uint32 *)pix_lut;
    (target + 0)[x] = lut[doh & 0xFF];
    doh >>= 8;
    (target + 1)[x] = lut[doh & 0xFF];
    doh >>= 8;
    (target + 2)[x] = lut[doh & 0xFF];
    doh >>= 8;
    (target + 3)[x] = lut[doh];
    doh = *(uint32 *)(pix_lut + 4);
    (target + 4)[x] = lut[doh & 0xFF];
    doh >>= 8;
    (target + 5)[x] = lut[doh & 0xFF];
    doh >>= 8;
    (target + 6)[x] = lut[doh & 0xFF];
    doh >>= 8;
    (target + 7)[x] = lut[doh];
    #endif
   #else
   (target + 0)[x] = lut[pix_lut[0]];
   (target + 1)[x] = lut[pix_lut[1]];
   (target + 2)[x] = lut[pix_lut[2]];
   (target + 3)[x] = lut[pix_lut[3]];
   (target + 4)[x] = lut[pix_lut[4]];
   (target + 5)[x] = lut[pix_lut[5]];
   (target + 6)[x] = lut[pix_lut[6]];
   (target + 7)[x] = lut[pix_lut[7]];
   #endif
   bat_boom = (bat_boom + 1) & bat_width_mask;
   vdc->BG_XOffset++;
  }
  // Repaint both overscan borders, hiding any tile pixels drawn outside the display region.
  MDFN_FastU32MemsetM8_16(target, overscan_color, (start + 1) & ~1);
  {
   uint32 end_begin = ((end + 1) & ~ 1);
   MDFN_FastU32MemsetM8_16(target + end_begin, overscan_color, ClockModeWidths[vce.dot_clock] - end_begin);
  }
 }

 //else
 //{
 // uint32 overscan_color = vce.color_table_cache[0x100];
 // MDFN_FastU32MemsetM8(target, overscan_color, (end + 1) & ~1);
 //}
}

// Sprite flag bits.  The first three are tested directly on the fourth SAT
// word of a sprite; SPRF_SPRITE0 lies above the 16-bit word and is added by
// the sprite code to mark entries that belong to SAT entry #0.
#define SPRF_PRIORITY	0x00080
#define SPRF_HFLIP	0x00800
#define SPRF_VFLIP	0x08000
#define SPRF_SPRITE0	0x10000

// Sprite height in lines, indexed by flag bits 12-13.
static const unsigned int sprite_height_tab[4] = { 16, 32, 64, 64 };
// Mask applied to the pattern number for each height setting, clearing the
// bits that are then replaced from the line offset inside the sprite.
static const unsigned int sprite_height_no_mask[4] = { ~0, ~2, ~6, ~6 };
// Sprite width in pixels, indexed by flag bit 8.
static const unsigned int sprite_width_tab[2] = { 16, 32 };

// One 16-pixel-wide sprite slice that is visible on the current scanline.
typedef struct
{
	uint32 x;	// Horizontal position taken from the SAT (10 bits; +16 for the second half of a 32-wide sprite)
	uint32 flags;	// SAT flag word, plus SPRF_SPRITE0 where applicable
	uint8 palette_index;	// (flags & 0xF) << 4
	uint16 pattern_data[4];	// The four bit-plane words of this slice's line
} SPRLE;

/* Evaluates and renders the sprites of the current scanline for the current
 * chip, then merges them into a 32-bit line buffer.
 *
 * target:  line buffer that already holds the background for this line.
 * enabled: when zero, sprites are still evaluated (so the overflow and
 *          collision status bits and IRQs still happen) but nothing is
 *          written to target.
 *
 * Phase 1 walks all 64 SAT entries and collects every sprite whose vertical
 * range contains vdc->RCRCount.  A 32-pixel-wide sprite is stored as two
 * 16-pixel entries.  When the 17th entry would be added, VDCS_OR is set and
 * IRQ1 raised if CR bit 0x02 is set, and collection stops unless
 * unlimited_sprites is set.
 *
 * Phase 2 renders the collected entries, last to first, into a private line
 * buffer pre-filled with amask (meaning "no sprite pixel here").  Every
 * written pixel carries (amask << 2), plus (amask << 1) when the sprite has
 * SPRF_PRIORITY.  While sprite #0 is rendered with CR bit 0x01 set, landing
 * on an already written pixel sets VDCS_CR and raises IRQ1.
 *
 * Phase 3 copies a sprite pixel to target when the target pixel has amask
 * set or the sprite pixel has the priority bit.
 */
static void DrawSprites(uint32 *target, int enabled)
{
 int active_sprites = 0;
 SPRLE SpriteList[64 * 2]; // (see unlimited_sprites option, *2 to accommodate 32-pixel-width sprites ) //16];
 uint32 sprite_line_buf[1024] ; //__attribute__ ((aligned (16)));

 // First, grab the up to 16 sprites.
 for(int i = 0; i < 64; i++)
 {
  int16 y = (vdc->SAT[i * 4 + 0] & 0x3FF) - 0x40;
  uint16 x = (vdc->SAT[i * 4 + 1] & 0x3FF);
  uint16 no = (vdc->SAT[i * 4 + 2] >> 1) & 0x1FF;	// Todo, cg mode bit
  uint16 flags = (vdc->SAT[i * 4 + 3]);

  uint32 palette_index = (flags & 0xF) << 4;
  uint32 height = sprite_height_tab[(flags >> 12) & 3];
  uint32 width = sprite_width_tab[(flags >> 8) & 1];

  if((int32)vdc->RCRCount >= y && (int32)vdc->RCRCount < (int32)(y + height))
  {
   bool second_half = 0;
   uint32 y_offset = vdc->RCRCount - y;
   if(y_offset > height) continue;

   // Re-entered once (via goto) for the right half of a 32-pixel-wide sprite.
   breepbreep:

   if(active_sprites == 16)
   {
    if(vdc->CR & 0x2)
    {
     vdc->status |= VDCS_OR;
     HuC6280_IRQBegin(MDFN_IQIRQ1);
     //puts("OR IRQ");
    }
    if(!unlimited_sprites)
     break;
   }


   {
    if(flags & SPRF_VFLIP)
     y_offset = height - 1 - y_offset;

    // Build the pattern number for this 16x16 cell: height-dependent bits
    // come from the line offset, bit 0 selects the left/right half.
    no &= sprite_height_no_mask[(flags >> 12) & 3];
    no |= (y_offset & 0x30) >> 3;
    if(width == 32) no &= ~1;
    if(second_half)
     no |= 1;

    SpriteList[active_sprites].flags = flags;

    if(flags & SPRF_HFLIP && width == 32)
     no ^= 1;
    //printf("Found: %d %d\n", vdc->RCRCount, x);
    SpriteList[active_sprites].x = x;
    SpriteList[active_sprites].palette_index = palette_index;


    // With MWR bits 2-3 equal to 01 only two of the four planes are
    // fetched; bit 0 of the third SAT word selects which pair.
    if((vdc->MWR & 0xC) == 4)
    {
     if(vdc->SAT[i * 4 + 2] & 1)
     {
      SpriteList[active_sprites].pattern_data[2] = 0; //vdc->VRAM[no * 64 + (y_offset & 15) ];
      SpriteList[active_sprites].pattern_data[3] = 0; //vdc->VRAM[no * 64 + (y_offset & 15) + 16];
      SpriteList[active_sprites].pattern_data[0] = vdc->VRAM[no * 64 + (y_offset & 15) + 32];
      SpriteList[active_sprites].pattern_data[1] = vdc->VRAM[no * 64 + (y_offset & 15) + 48];
     }
     else
     {
      SpriteList[active_sprites].pattern_data[0] = vdc->VRAM[no * 64 + (y_offset & 15) ];
      SpriteList[active_sprites].pattern_data[1] = vdc->VRAM[no * 64 + (y_offset & 15) + 16];
      SpriteList[active_sprites].pattern_data[2] = 0; //vdc->VRAM[no * 64 + (y_offset & 15) + 32];
      SpriteList[active_sprites].pattern_data[3] = 0; //vdc->VRAM[no * 64 + (y_offset & 15) + 48];
     }
    }
    else
    {
	//if(y_offset == 0)
        //printf("%d %d\n", vdc->RCRCount, no * 64);

     SpriteList[active_sprites].pattern_data[0] = vdc->VRAM[no * 64 + (y_offset & 15) ];
     SpriteList[active_sprites].pattern_data[1] = vdc->VRAM[no * 64 + (y_offset & 15) + 16];
     SpriteList[active_sprites].pattern_data[2] = vdc->VRAM[no * 64 + (y_offset & 15) + 32];
     SpriteList[active_sprites].pattern_data[3] = vdc->VRAM[no * 64 + (y_offset & 15) + 48];
    }
    SpriteList[active_sprites].flags |= i ? 0 : SPRF_SPRITE0;

    active_sprites++;

    if(width == 32 && !second_half)
    {
     second_half = 1;
     x += 16;
     y_offset = vdc->RCRCount - y;	// Fix the y offset so that sprites that are hflipped + vflipped display properly
     goto breepbreep;
    }
   }
  }
 }

 // Same display-region computation as in the background renderer.
 uint32 display_width = (M_vdc_HDW + 1) * 8;

 if(display_width > ClockModeWidths[vce.dot_clock])
  display_width = ClockModeWidths[vce.dot_clock];

 uint32 start = (ClockModeWidths[vce.dot_clock] - display_width) / 2;;

 if(vce.dot_clock == 1 && M_vdc_HDS == 5 && M_vdc_HDE == 6 && M_vdc_HDW == 43 && M_vdc_HSW == 2)
  start += 8;
 else if(vce.dot_clock == 0 && M_vdc_HDS == 2 && M_vdc_HDE == 3 && M_vdc_HDW == 33 && M_vdc_HSW == 2)
  start += 4;

 uint32 end = start + display_width;

 if(end > ClockModeWidths[vce.dot_clock])
 {
  end = ClockModeWidths[vce.dot_clock];
 }

 // Mark every column as "no sprite pixel".
 MDFN_FastU32MemsetM8(sprite_line_buf, amask, (end + 1) & ~1);

 // Render back to front so that entries found earlier end up on top.
 for(int i = (active_sprites - 1) ; i >= 0; i--)
 {
  int32 pos = SpriteList[i].x - 0x20 + start;
  uint32 prio_or = 0;

  if(SpriteList[i].flags & SPRF_PRIORITY) 
   prio_or = amask << 1;

  prio_or |= (amask << 2);	// For sprite #0 hit detection

  //printf("%u %u %u %u\n", SpriteList[i].x, start, max_x, display_width);

  // Sprite #0 with the collision interrupt enabled: same rendering as the
  // else-branch, plus the overlap test.
  if((SpriteList[i].flags & SPRF_SPRITE0) && (vdc->CR & 0x01))
  for(uint32 x = 0; x < 16; x++)
  {
   uint32 raw_pixel;
   uint32 pi = SpriteList[i].palette_index;
   uint32 rev_x = 15 - x;

   if(SpriteList[i].flags & SPRF_HFLIP)
    rev_x = x;

   raw_pixel = (SpriteList[i].pattern_data[0] >> rev_x)  & 1;
   raw_pixel |= ((SpriteList[i].pattern_data[1] >> rev_x) & 1) << 1;
   raw_pixel |= ((SpriteList[i].pattern_data[2] >> rev_x) & 1) << 2;
   raw_pixel |= ((SpriteList[i].pattern_data[3] >> rev_x) & 1) << 3;

   if(raw_pixel)
   {
    pi |= 0x100;
    uint32 tx = pos + x;

    if(tx >= end) // Covers negative and overflowing the right side.
     continue;

    if(sprite_line_buf[tx] & (amask << 2))
    {
     vdc->status |= VDCS_CR;
     //puts("CR IRQ");
     HuC6280_IRQBegin(MDFN_IQIRQ1);
    }
    sprite_line_buf[tx] = vce.color_table_cache[pi | raw_pixel] | prio_or;
   }
  }
  else
  for(uint32 x = 0; x < 16; x++)
  {
   uint32 raw_pixel;
   uint32 pi = SpriteList[i].palette_index;
   uint32 rev_x = 15 - x;

   if(SpriteList[i].flags & SPRF_HFLIP)
    rev_x = x;

   raw_pixel = (SpriteList[i].pattern_data[0] >> rev_x)  & 1;
   raw_pixel |= ((SpriteList[i].pattern_data[1] >> rev_x) & 1) << 1;
   raw_pixel |= ((SpriteList[i].pattern_data[2] >> rev_x) & 1) << 2;
   raw_pixel |= ((SpriteList[i].pattern_data[3] >> rev_x) & 1) << 3;

   if(raw_pixel)
   {
    pi |= 0x100;
    uint32 tx = pos + x;
    if(tx >= end) // Covers negative and overflowing the right side.
     continue;
    sprite_line_buf[tx] = vce.color_table_cache[pi | raw_pixel] | prio_or;
   }
  }
 }

 // Merge: a sprite pixel wins over a transparent background pixel, or over
 // any background pixel when it carries the priority bit.
 if(enabled)
  for(unsigned int x = start; x < end; x++)
  {
   if(!(sprite_line_buf[x] & amask))
   {
    if((target[x] & amask) || (sprite_line_buf[x] & (amask << 1)))
     target[x] = sprite_line_buf[x];
   }
  }
}


/* 16-bit-target variant of DrawSprites(): evaluates and renders the sprites
 * of the current scanline for the current chip, then merges them into a
 * 16-bit line buffer.
 *
 * target:  line buffer that already holds the background for this line.
 * enabled: when zero, sprites are still evaluated (so the overflow and
 *          collision status bits and IRQs still happen) but nothing is
 *          written to target.
 *
 * Phase 1 collects, from the 64 SAT entries, every sprite whose vertical
 * range contains vdc->RCRCount (32-wide sprites become two entries); when a
 * 17th entry would be added, VDCS_OR/IRQ1 are raised if CR bit 0x02 is set
 * and collection stops unless unlimited_sprites is set.
 * Phase 2 renders the entries, last to first, into a private 32-bit line
 * buffer pre-filled with amask, tagging pixels with (amask << 2) and, for
 * SPRF_PRIORITY sprites, (amask << 1); sprite #0 overlap raises VDCS_CR/IRQ1
 * when CR bit 0x01 is set.
 * Phase 3 copies sprite pixels to target.
 *
 * NOTE(review): target pixels are uint16 here, so `target[x] & amask` can
 * only be non-zero if amask has a bit below bit 16, while
 * VDC_SetPixelFormat() assigns 1 << 24; likewise the store truncates the
 * 32-bit sprite pixel.  Confirm the intended behaviour of the 16-bit path.
 */
static void DrawSprites16(uint16 *target, int enabled)
{
 int active_sprites = 0;
 SPRLE SpriteList[64 * 2]; // (see unlimited_sprites option, *2 to accommodate 32-pixel-width sprites ) //16];
 uint32 sprite_line_buf[1024] ; //__attribute__ ((aligned (16)));

 // First, grab the up to 16 sprites.
 for(int i = 0; i < 64; i++)
 {
  int16 y = (vdc->SAT[i * 4 + 0] & 0x3FF) - 0x40;
  uint16 x = (vdc->SAT[i * 4 + 1] & 0x3FF);
  uint16 no = (vdc->SAT[i * 4 + 2] >> 1) & 0x1FF;	// Todo, cg mode bit
  uint16 flags = (vdc->SAT[i * 4 + 3]);

  uint32 palette_index = (flags & 0xF) << 4;
  uint32 height = sprite_height_tab[(flags >> 12) & 3];
  uint32 width = sprite_width_tab[(flags >> 8) & 1];

  if((int32)vdc->RCRCount >= y && (int32)vdc->RCRCount < (int32)(y + height))
  {
   bool second_half = 0;
   uint32 y_offset = vdc->RCRCount - y;
   if(y_offset > height) continue;

   // Re-entered once (via goto) for the right half of a 32-pixel-wide sprite.
   breepbreep:

   if(active_sprites == 16)
   {
    if(vdc->CR & 0x2)
    {
     vdc->status |= VDCS_OR;
     HuC6280_IRQBegin(MDFN_IQIRQ1);
     //puts("OR IRQ");
    }
    if(!unlimited_sprites)
     break;
   }


   {
    if(flags & SPRF_VFLIP)
     y_offset = height - 1 - y_offset;

    // Build the pattern number for this 16x16 cell: height-dependent bits
    // come from the line offset, bit 0 selects the left/right half.
    no &= sprite_height_no_mask[(flags >> 12) & 3];
    no |= (y_offset & 0x30) >> 3;
    if(width == 32) no &= ~1;
    if(second_half)
     no |= 1;

    SpriteList[active_sprites].flags = flags;

    if(flags & SPRF_HFLIP && width == 32)
     no ^= 1;
    //printf("Found: %d %d\n", vdc->RCRCount, x);
    SpriteList[active_sprites].x = x;
    SpriteList[active_sprites].palette_index = palette_index;


    // With MWR bits 2-3 equal to 01 only two of the four planes are
    // fetched; bit 0 of the third SAT word selects which pair.
    if((vdc->MWR & 0xC) == 4)
    {
     if(vdc->SAT[i * 4 + 2] & 1)
     {
      SpriteList[active_sprites].pattern_data[2] = 0; //vdc->VRAM[no * 64 + (y_offset & 15) ];
      SpriteList[active_sprites].pattern_data[3] = 0; //vdc->VRAM[no * 64 + (y_offset & 15) + 16];
      SpriteList[active_sprites].pattern_data[0] = vdc->VRAM[no * 64 + (y_offset & 15) + 32];
      SpriteList[active_sprites].pattern_data[1] = vdc->VRAM[no * 64 + (y_offset & 15) + 48];
     }
     else
     {
      SpriteList[active_sprites].pattern_data[0] = vdc->VRAM[no * 64 + (y_offset & 15) ];
      SpriteList[active_sprites].pattern_data[1] = vdc->VRAM[no * 64 + (y_offset & 15) + 16];
      SpriteList[active_sprites].pattern_data[2] = 0; //vdc->VRAM[no * 64 + (y_offset & 15) + 32];
      SpriteList[active_sprites].pattern_data[3] = 0; //vdc->VRAM[no * 64 + (y_offset & 15) + 48];
     }
    }
    else
    {
	//if(y_offset == 0)
        //printf("%d %d\n", vdc->RCRCount, no * 64);

     SpriteList[active_sprites].pattern_data[0] = vdc->VRAM[no * 64 + (y_offset & 15) ];
     SpriteList[active_sprites].pattern_data[1] = vdc->VRAM[no * 64 + (y_offset & 15) + 16];
     SpriteList[active_sprites].pattern_data[2] = vdc->VRAM[no * 64 + (y_offset & 15) + 32];
     SpriteList[active_sprites].pattern_data[3] = vdc->VRAM[no * 64 + (y_offset & 15) + 48];
    }
    SpriteList[active_sprites].flags |= i ? 0 : SPRF_SPRITE0;

    active_sprites++;

    if(width == 32 && !second_half)
    {
     second_half = 1;
     x += 16;
     y_offset = vdc->RCRCount - y;	// Fix the y offset so that sprites that are hflipped + vflipped display properly
     goto breepbreep;
    }
   }
  }
 }

 // Same display-region computation as in the background renderer.
 uint32 display_width = (M_vdc_HDW + 1) * 8;

 if(display_width > ClockModeWidths[vce.dot_clock])
  display_width = ClockModeWidths[vce.dot_clock];

 uint32 start = (ClockModeWidths[vce.dot_clock] - display_width) / 2;;

 if(vce.dot_clock == 1 && M_vdc_HDS == 5 && M_vdc_HDE == 6 && M_vdc_HDW == 43 && M_vdc_HSW == 2)
  start += 8;
 else if(vce.dot_clock == 0 && M_vdc_HDS == 2 && M_vdc_HDE == 3 && M_vdc_HDW == 33 && M_vdc_HSW == 2)
  start += 4;

 uint32 end = start + display_width;

 if(end > ClockModeWidths[vce.dot_clock])
 {
  end = ClockModeWidths[vce.dot_clock];
 }

 // Mark every column as "no sprite pixel".
 MDFN_FastU32MemsetM8(sprite_line_buf, amask, (end + 1) & ~1);

 // Render back to front so that entries found earlier end up on top.
 for(int i = (active_sprites - 1) ; i >= 0; i--)
 {
  int32 pos = SpriteList[i].x - 0x20 + start;
  uint32 prio_or = 0;

  if(SpriteList[i].flags & SPRF_PRIORITY) 
   prio_or = amask << 1;

  prio_or |= (amask << 2);	// For sprite #0 hit detection

  //printf("%u %u %u %u\n", SpriteList[i].x, start, max_x, display_width);

  // Sprite #0 with the collision interrupt enabled: same rendering as the
  // else-branch, plus the overlap test.
  if((SpriteList[i].flags & SPRF_SPRITE0) && (vdc->CR & 0x01))
  for(uint32 x = 0; x < 16; x++)
  {
   uint32 raw_pixel;
   uint32 pi = SpriteList[i].palette_index;
   uint32 rev_x = 15 - x;

   if(SpriteList[i].flags & SPRF_HFLIP)
    rev_x = x;

   raw_pixel = (SpriteList[i].pattern_data[0] >> rev_x)  & 1;
   raw_pixel |= ((SpriteList[i].pattern_data[1] >> rev_x) & 1) << 1;
   raw_pixel |= ((SpriteList[i].pattern_data[2] >> rev_x) & 1) << 2;
   raw_pixel |= ((SpriteList[i].pattern_data[3] >> rev_x) & 1) << 3;

   if(raw_pixel)
   {
    pi |= 0x100;
    uint32 tx = pos + x;

    if(tx >= end) // Covers negative and overflowing the right side.
     continue;

    if(sprite_line_buf[tx] & (amask << 2))
    {
     vdc->status |= VDCS_CR;
     //puts("CR IRQ");
     HuC6280_IRQBegin(MDFN_IQIRQ1);
    }
    sprite_line_buf[tx] = vce.color_table_cache[pi | raw_pixel] | prio_or;
   }
  }
  else
  for(uint32 x = 0; x < 16; x++)
  {
   uint32 raw_pixel;
   uint32 pi = SpriteList[i].palette_index;
   uint32 rev_x = 15 - x;

   if(SpriteList[i].flags & SPRF_HFLIP)
    rev_x = x;

   raw_pixel = (SpriteList[i].pattern_data[0] >> rev_x)  & 1;
   raw_pixel |= ((SpriteList[i].pattern_data[1] >> rev_x) & 1) << 1;
   raw_pixel |= ((SpriteList[i].pattern_data[2] >> rev_x) & 1) << 2;
   raw_pixel |= ((SpriteList[i].pattern_data[3] >> rev_x) & 1) << 3;

   if(raw_pixel)
   {
    pi |= 0x100;
    uint32 tx = pos + x;
    if(tx >= end) // Covers negative and overflowing the right side.
     continue;
    sprite_line_buf[tx] = vce.color_table_cache[pi | raw_pixel] | prio_or;
   }
  }
 }

 // Merge sprite pixels into the target line (see NOTE(review) above
 // regarding the amask test on 16-bit pixels).
 if(enabled)
  for(unsigned int x = start; x < end; x++)
  {
   if(!(sprite_line_buf[x] & amask))
   {
    if((target[x] & amask) || (sprite_line_buf[x] & (amask << 1)))
     target[x] = sprite_line_buf[x];
   }
  }
}

// Global integer, initialised to 0.  Nothing in the code visible around here
// reads or writes it.
// NOTE(review): presumably a scanline lead-in timing tweak consumed elsewhere --
// confirm it is still needed.
int vdc_leadin_hack = 0;

// Defined elsewhere.  VDC_RunFrame() below rewrites its DisplayRect
// (x, y, w, h) while rendering.
extern MDFNGI EmulatedPCE;
/*
 * Emulates one complete video frame, one scanline per loop iteration.
 *
 * The loop always runs 263 iterations.  Each iteration:
 *   1. derives line_leadin1, the number of cycles handed to HuC6280_Run()
 *      before the line is rendered, from the horizontal timing macros and
 *      the VCE dot clock;
 *   2. per chip: does frame-start setup, calls DoDMA() outside the active
 *      display when DMARunning is set, performs the SAT copy and flags the
 *      vertical-blank event, and raises the raster-compare IRQ;
 *   3. runs the CPU for the lead-in;
 *   4. per chip: renders the line (burst-mode fill, background + sprites,
 *      or overscan fill);
 *   5. raises the vertical-blank IRQ if requested and runs the CPU for the
 *      remainder of the 455-cycle line;
 *   6. per chip: advances the raster/display counters and the SAT-DMA
 *      countdown;
 *   7. when sgfx is set and the frame is not skipped, merges the two per-chip
 *      line buffers into pXBuf;
 *   8. calls ADPCM_Update() when PCE_IsCD is set.
 *
 * pXBuf       Output pixels.  Rows are addressed with a pitch of 1024 uint32
 *             values; scanline N is written to row N - 14, and only
 *             scanlines 14 .. 255 are written.
 * LineWidths  Receives a copy of EmulatedPCE.DisplayRect for each of those
 *             242 output rows.
 * skip        When nonzero, background drawing, the burst-mode fill and the
 *             SuperGrafx merge are skipped, and DrawSprites() is called with
 *             its enable argument forced to 0.
 *
 * On return EmulatedPCE.DisplayRect.x / .w reflect the highest dot clock
 * value seen during the frame.
 */
void VDC_RunFrame(uint32 *pXBuf, MDFN_Rect *LineWidths, int skip)
{
 vdc = &vdc_chips[0];

 // Vertical timing values are sampled once here, right after `vdc` has been
 // pointed at chip 0, and are then used for every chip and every scanline.
 unsigned int VDS = M_vdc_VDS;
 unsigned int VSW = M_vdc_VSW;
 unsigned int VDW = M_vdc_VDW;
 unsigned int VCR = M_vdc_VCR;

 // The vertical part of the reported display rectangle is fixed.
 EmulatedPCE.DisplayRect.y = 4;
 EmulatedPCE.DisplayRect.h = 232;

 int total_chips = 1;
 if(sgfx)
  total_chips = 2;

 int max_dc = 0;	// Highest vce.dot_clock value observed on any line of this frame.
 int frame_counter;	// Current scanline index, 0 .. 262.

//#define M_vdc_HSW       (vdc->HSR & 0x1F)       // Horizontal Synchro Width
//#define M_vdc_HDS       ((vdc->HSR >> 8) & 0x7F) // Horizontal Display Start
//#define M_vdc_HDW       (vdc->HDR & 0x7F)       // Horizontal Display Width
//#define M_vdc_HDE       ((vdc->HDR >> 8) & 0x7F) // Horizontal Display End

 //printf("%d %d %d %d %d\n", M_vdc_HDS, M_vdc_HDE, M_vdc_HDW, M_vdc_HSW, vce.dot_clock);
// printf("%d\n", vce.lc263);
 // Always 263 iterations; the vce.lc263-dependent count is commented out.
 for(frame_counter = 0; frame_counter < 263; ) //(vce.lc263 ? 263 : 262); )
 {
  // Only referenced by the commented-out expression on the cyc_tot line below.
  static const unsigned int ClockPixelWidths[3] = { 341, 455, 682 };
  // One scratch scanline per chip; used as the render target only when sgfx is set.
  uint32 line_buffer[2][1024];	// For super grafx emulation
  int need_vbi = 0;	// Set when any chip reaches its vertical-blank line this scanline.
  int line_leadin1 = 0;	// Cycles passed to HuC6280_Run() before this line is rendered.

  // Lead-in computation: HDS + (HDW + 1) + HDE, rounded (+2, bit 0 cleared) to
  // an even value, minus the displayed width (HDW + 1).  The result times 8,
  // minus 2, scaled by the dot clock, is the lead-in cycle count.
  // NOTE(review): the commented-out macro definitions above suggest the
  // M_vdc_H* macros read through the global `vdc` pointer.  On the first
  // scanline that is chip 0; on later scanlines it is whichever chip the
  // counter-update loop at the bottom of the previous iteration left it on
  // (chip 1 when sgfx is set).  Confirm this is intended.
  int magical = M_vdc_HDS + (M_vdc_HDW + 1) + M_vdc_HDE;
  magical = (magical + 2) & ~1;
  magical -= M_vdc_HDW + 1;
  int cyc_tot = magical * 8; //ClockPixelWidths[vce.dot_clock] - magical * 8;
  cyc_tot-=2;
  // Rescale by dot clock: x4/3 for mode 0, unchanged for mode 1, x2/3 for mode 2.
  switch(vce.dot_clock)
  {
   case 0: cyc_tot = 4 * cyc_tot / 3; break;
   case 1: break;
   case 2: cyc_tot = 2 * cyc_tot / 3; break;
  }

  // Clamped at zero only; there is no upper clamp against the 455-cycle line.
  if(cyc_tot < 0) cyc_tot = 0;
  line_leadin1 = cyc_tot;

//  if(frame_counter == 0)
//   printf("%d %d %d %d %d %d\n", M_vdc_HDS, M_vdc_HDE, M_vdc_HDW, M_vdc_HSW, vce.dot_clock, cyc_tot);

//   printf("%d %d\n", M_vdc_HSW, line_leadin1);
  // Per-line display rectangle: full width for the current dot clock, then
  // os_hide pixels trimmed from each side.
  EmulatedPCE.DisplayRect.w = ClockModeWidths[vce.dot_clock];
  if(max_dc < vce.dot_clock)
   max_dc = vce.dot_clock;

  int os_hide;

  switch(vce.dot_clock)
  {
   default:
   case 0: os_hide = 16; break;
   case 1: os_hide = 21; break;
   case 2: os_hide = 32; break;
  }

  EmulatedPCE.DisplayRect.x = os_hide;
  EmulatedPCE.DisplayRect.w -= os_hide * 2;

  //printf("Line: %d %d\n", frame_counter, HuCPU.timestamp);
  // Pass 1 (before the CPU lead-in): per-chip bookkeeping and interrupt sources.
  for(int chip = 0; chip < total_chips; chip++)
  {
   vdc = &vdc_chips[chip];
   if(frame_counter == 0)
   {
    // Start of frame: restart the display counter and latch burst mode,
    // which is on when both CR bits in mask 0xC0 are clear.
    vdc->display_counter = 0;
    vdc->burst_mode = !(vdc->CR & 0xC0);
   }
   int have_free_time = 1;	// Cleared only on active display lines outside burst mode.
   if(vdc->burst_mode)
   {
    // First line of the display window: restart the raster compare counter.
    if(vdc->display_counter == (VDS + VSW))
     vdc->RCRCount = 0;
   }
   else if(vdc->display_counter >= (VDS + VSW) && vdc->display_counter < (VDS + VSW + VDW + 1))
   {
    // Inside the active display window.
    have_free_time = 0;
    if(vdc->display_counter == (VDS + VSW))
     vdc->RCRCount = 0;
   }
   else	// Hmm, overscan...
   {

   }
   if(have_free_time) // We're outside of the active display area.  Weehee
   {
    if(vdc->DMARunning)
     DoDMA();
   }

   // Vertical blank is reached on the line just after the display window, or
   // on line 261 if the window would extend beyond it.
   if(vdc->display_counter == (VDS + VSW + VDW + 1) || ((VDS + VSW + VDW + 1) > 261 && vdc->display_counter == 261) ) // VBlank interrupt.  Doom doom.
   {
    need_vbi = 1;
    // Sprite attribute table transfer: done when one is pending or DCR bit
    // 0x10 is set.
    if(vdc->SATBPending || (vdc->DCR & 0x10))
    {
     vdc->SATBPending = 0;
     // Countdown in scanlines; when it reaches zero (see pass 3 below) the
     // VDCS_DS status bit and IRQ are raised if DCR bit 0x01 is set.
     vdc->sat_dma_slcounter = 2;

     if(vdc->SATB < 0x8000)
     {
      // Copy up to 256 words, shortened so the read stops at VRAM index 0x8000.
      uint32 len = 256;
      if(vdc->SATB > 0x7F00)
       len = 0x8000 - vdc->SATB;
      memcpy(vdc->SAT, &vdc->VRAM[vdc->SATB], len * sizeof(uint16));
     }
    }
   }
   // Raster compare: fires when the line counter equals RCR - 0x40 and CR bit
   // 0x04 is set.
   if((int)vdc->RCRCount == ((int)vdc->RCR - 0x40) && (vdc->CR & 0x04))
   {
    //printf("RCR Interrupt: %d\n", vdc->display_counter);
    vdc->status |= VDCS_RR;
    HuC6280_IRQBegin(MDFN_IQIRQ1); 
   }
  }

  // Run the CPU for the lead-in portion of the scanline.
  HuC6280_Run(line_leadin1);

  // Pass 2: render this scanline for each chip.
  for(int chip = 0; chip < total_chips; chip++)
  {
   uint32 *target_ptr;
   vdc = &vdc_chips[chip];

   // With sgfx set each chip renders into its own scratch line (merged further
   // down); otherwise rendering goes straight to the output row.  For
   // frame_counter < 14 the direct pointer lies before pXBuf, but every use
   // below is guarded by the 14 .. 255 range check.
   if(sgfx)
    target_ptr = line_buffer[chip];
   else
    target_ptr = pXBuf + (frame_counter - 14) * 1024;

   if(frame_counter >= 14 && frame_counter < (14 + 242))
    LineWidths[frame_counter - 14] = EmulatedPCE.DisplayRect;

   if(vdc->burst_mode)
   {
    // Burst mode: fill the line with colour table entry 0x100, with amask set.
    if(frame_counter >= 14 && frame_counter < (14 + 242))
    {
     uint32 color;

     if(!skip)
     {
      if(vce.bw)
       color = bw_systemColorMap32[vce.color_table[0x100]] | amask;
      else
       color = systemColorMap32[vce.color_table[0x100]] | amask;
      MDFN_FastU32MemsetM8(target_ptr, color, 576); //512);
     }
    }
   }
   else if(vdc->display_counter >= (VDS + VSW) && vdc->display_counter < (VDS + VSW + VDW + 1))
   {
    // Active display: reload the background Y offset from BYR on the first
    // line of the window, otherwise advance it by one; the X offset is
    // reloaded from BXR on every line.
    if(vdc->display_counter == (VDS + VSW))
     vdc->BG_YOffset = vdc->BYR;
    else
     vdc->BG_YOffset++;
    vdc->BG_XOffset = vdc->BXR;
    if(frame_counter >= 14 && frame_counter < (14 + 242))
    {
     if(!skip)
      DrawBG(target_ptr, userle & (chip ? ULE_BG1 : ULE_BG0));
     // Sprites are processed when CR bit 0x40 is set, even on skipped frames;
     // in that case the enable argument is 0.
     if(vdc->CR & 0x40)
      DrawSprites(target_ptr, skip ? 0 : (userle & (chip ? ULE_SPR1 : ULE_SPR0)));
    }
    // Draw screen, joy.
   }
   else // Hmm, overscan...
   {
    // Outside the display window: same fill as burst mode.
    // NOTE(review): unlike the burst-mode branch above, this fill is not
    // guarded by `skip` -- confirm whether that difference is intentional.
    if(frame_counter >= 14 && frame_counter < (14 + 242))
    {
     uint32 color;
     if(vce.bw)
      color = bw_systemColorMap32[vce.color_table[0x100]] | amask;
     else
      color = systemColorMap32[vce.color_table[0x100]] | amask;
     MDFN_FastU32MemsetM8(target_ptr, color, 576); //512);
    }
   }
  }

  // The vertical-blank IRQ is always taken from chip 0's CR/status, although
  // need_vbi may have been set by either chip in pass 1.
  vdc = &vdc_chips[0];
  if((vdc->CR & 0x08) && need_vbi)
  {
   // Set the status flag, run the CPU for 2 cycles, and raise the IRQ only if
   // the flag is still set afterwards.
   int tc = 2; //0 - HuCPU.count + 1;
   vdc->status |= VDCS_VD;
   HuC6280_Run(tc);
   if(vdc->status & VDCS_VD)
   {
    HuC6280_IRQBegin(MDFN_IQIRQ1);
   }
   // Remainder of the 455-cycle scanline.
   HuC6280_Run(455 - line_leadin1 - tc);
  }
  else
   HuC6280_Run(455 - line_leadin1);

  // Pass 3: end-of-line counter updates for each chip.
  for(int chip = 0; chip < total_chips; chip++)
  {
   vdc = &vdc_chips[chip];
   vdc->RCRCount++;

   //vdc->BG_YOffset = (vdc->BG_YOffset + 1);
   vdc->display_counter++;

   // SAT transfer countdown started in pass 1; on expiry raise VDCS_DS and
   // the IRQ if DCR bit 0x01 is set.
   if(vdc->sat_dma_slcounter)
   {
    vdc->sat_dma_slcounter--;
    if(!vdc->sat_dma_slcounter)
    {
     if(vdc->DCR & 0x01)
     {
      vdc->status |= VDCS_DS;
      HuC6280_IRQBegin(MDFN_IQIRQ1);
     }
    }
   }

   // Wrap the display counter after VDS + VSW + VDW + VCR + 3 lines.
   if(vdc->display_counter == (VDS + VSW + VDW + VCR + 3))
   {
    vdc->display_counter = 0;
   }
  }

  // SuperGrafx: combine the two per-chip scratch lines into the output row.
  if(sgfx && !skip)
  if(frame_counter >= 14 && frame_counter < (14 + 242))
  {
   int start = EmulatedPCE.DisplayRect.x;
   // Two specific horizontal register configurations get an extra start offset.
   if(vce.dot_clock == 1 && M_vdc_HDS == 5 && M_vdc_HDE == 6 && M_vdc_HDW == 43 && M_vdc_HSW == 2)
    start += 8;
   else if(vce.dot_clock == 0 && M_vdc_HDS == 2 && M_vdc_HDE == 3 && M_vdc_HDW == 33 && M_vdc_HSW == 2)
    start += 4;

   int end = start + EmulatedPCE.DisplayRect.w;

   uint32 *main_target = pXBuf + (frame_counter - 14) * 1024;

   // Indexed by in_window (bit 0: inside window 0, bit 1: inside window 1):
   // which vpc.priority byte to use and which nibble of it.
   static const int prio_select[4] = { 1, 1, 0, 0 };
   static const int prio_shift[4] = { 4, 0, 4, 0 };

   for(int x = start; x < end; x++)
   {
    // Each window spans from `start` for (winwidths[n] - 0x40) pixels.
    int in_window = 0;
    if(x >= start && x < (start + vpc.winwidths[0] - 0x40))
     in_window |= 1;
    if(x >= start && x < (start + vpc.winwidths[1] - 0x40))
     in_window |= 2;

    // 4-bit setting: bit 0 enables chip 0's pixel, bit 1 enables chip 1's
    // pixel, bits 2-3 select the priority mode handled by the switch below.
    uint8 pb = (vpc.priority[prio_select[in_window]] >> prio_shift[in_window]) & 0xF;

    uint32 vdc2_pixel, vdc1_pixel;

    // A disabled chip contributes colour cache entry 0.
    vdc2_pixel = vdc1_pixel = vce.color_table_cache[0];

    if(pb & 1)
     vdc1_pixel = line_buffer[0][x];
    if(pb & 2)
     vdc2_pixel = line_buffer[1][x];

/* Dai MakaiMura uses setting 1, and expects VDC #2 sprites in front of VDC #1 background, but
   behind VDC #1's sprites.
 */
    // (amask << 2) is set on pixels written by the sprite renderer.  Setting
    // amask on vdc1_pixel makes the final select below pick vdc2_pixel.
    switch(pb >> 2)
    {
     case 1:
		if((vdc2_pixel & (amask << 2)) && !(vdc1_pixel & (amask << 2)))
			vdc1_pixel |= amask;				
		break;
     case 2:
		if((vdc1_pixel & (amask << 2)) && !(vdc2_pixel & (amask << 2)) && !(vdc2_pixel & amask))
			vdc1_pixel |= amask;
		break;
    }
    // Chip 0's pixel wins unless it carries the amask bit.
    main_target[x] = (vdc1_pixel & amask) ? vdc2_pixel : vdc1_pixel;
   }
  }
  if(PCE_IsCD)
  {
   ADPCM_Update();
  }
  frame_counter++;
 } // big frame loop!

  // Final display rectangle for the frame: based on the highest dot clock
  // value seen on any line, trimmed by os_hide on both sides.
  //printf("%d\n", max_dc);
  EmulatedPCE.DisplayRect.w = ClockModeWidths[max_dc];
  int os_hide = 0;

  switch(max_dc)
  {
   default:
   case 0: os_hide = 16; break;
   case 1: os_hide = 21; break;
   case 2: os_hide = 32; break;
  }


  EmulatedPCE.DisplayRect.x = os_hide;
  EmulatedPCE.DisplayRect.w -= os_hide * 2;

  //printf("%d %d %d %d %d\n", M_vdc_HDS, M_vdc_HDE, M_vdc_HDW, M_vdc_HSW, vce.dot_clock);
}

/*
 * Puts the registers touched here into their reset values:
 *  - the read buffer of both chips becomes 0xFFFF,
 *  - both VPC priority bytes become 0x11,
 *  - chip 0's HSR, HDR, VSR, VDR and VCR become 0xFF.
 * Chip 1's timing registers are not written here.
 */
void VDC_Reset(void)
{
 vdc_chips[1].read_buffer = 0xFFFF;
 vdc_chips[0].read_buffer = 0xFFFF;

 vpc.priority[1] = 0x11;
 vpc.priority[0] = 0x11;

 // Needed for Body Conquest 2 -_-
 vdc_chips[0].VCR = 0xFF;
 vdc_chips[0].VDR = 0xFF;
 vdc_chips[0].VSR = 0xFF;
 vdc_chips[0].HDR = 0xFF;
 vdc_chips[0].HSR = 0xFF;
}

/*
 * Power-on initialisation: zero-fills the whole vdc_chips storage and then
 * applies the reset values via VDC_Reset().
 */
void VDC_Power(void)
{
 memset(vdc_chips, 0, sizeof vdc_chips);

 VDC_Reset();
}

/*
 * One-time module setup.
 *
 * sgx  Stored into the file-level `sgfx` flag; when nonzero the rest of this
 *      file drives two chips instead of one.
 *
 * Also reads the "pce.nospritelimit" setting into unlimited_sprites and turns
 * every user layer-enable bit on.
 */
void VDC_Init(int sgx)
{
 const bool no_sprite_limit = MDFN_GetSettingB("pce.nospritelimit");

 unlimited_sprites = no_sprite_limit;
 userle = ~0;	// All layer-enable bits set.
 sgfx = sgx;
}

/*
 * Saves or loads the video state through MDFNSS_StateAction().
 *
 * Sections are processed in this order: "VCE", then "VPC" (only when sgfx is
 * set), then "VDC0" and, when sgfx is set, "VDC1".  After a load, the tile
 * cache (all 32768 entries) and the palette cache (all 512 entries) are
 * rebuilt for each chip.
 *
 * sm, load, data_only  Passed through unchanged to MDFNSS_StateAction().
 *
 * Returns the bitwise AND of every MDFNSS_StateAction() result.
 * The global `vdc` pointer is left on the last chip processed.
 */
int VDC_StateAction(StateMem *sm, int load, int data_only)
{
 SFORMAT VCE_StateRegs[] =
 {
  SFVARN(vce.lc263, "lc263"),
  SFVARN(vce.bw, "bw"),
  SFVARN(vce.dot_clock, "dot clock"),
  SFVARN(vce.ctaddress, "ctaddress"),
  SFARRAY16N(vce.color_table, 0x200, "color_table"),
  SFEND
 };

 int ret = MDFNSS_StateAction(sm, load, data_only, VCE_StateRegs, "VCE");

 const int chip_count = sgfx ? 2 : 1;

 // The VPC only takes part in the state when two chips are emulated.
 if(sgfx)
 {
  SFORMAT VPC_StateRegs[] =
  {
   SFVARN(vpc.st_mode, "st_mode"),
   SFARRAYN(vpc.priority, 2, "priority"),
   SFARRAY16N(vpc.winwidths, 2, "winwidths"),
   SFEND
  };

  ret &= MDFNSS_StateAction(sm, load, data_only, VPC_StateRegs, "VPC");
 }

 for(int chip = 0; chip < chip_count; chip++)
 {
  // The descriptor table below refers to the chip through the global pointer.
  vdc = &vdc_chips[chip];

  SFORMAT VDC_StateRegs[] =
  {
   SFVARN(vdc->display_counter, "display_counter"),
   SFVARN(vdc->sat_dma_slcounter, "sat_dma_slcounter"),

   SFVARN(vdc->sgx_priority, "sgx_priority"),
   SFVARN(vdc->select, "select"),
   SFVARN(vdc->MAWR, "MAWR"),
   SFVARN(vdc->MARR, "MARR"),
   SFVARN(vdc->CR, "CR"),
   SFVARN(vdc->RCR, "RCR"),
   SFVARN(vdc->BXR, "BXR"),
   SFVARN(vdc->BYR, "BYR"),
   SFVARN(vdc->MWR, "MWR"),

   SFVARN(vdc->HSR, "HSR"),
   SFVARN(vdc->HDR, "HDR"),
   SFVARN(vdc->VSR, "VSR"),
   SFVARN(vdc->VDR, "VDR"),

   SFVARN(vdc->VCR, "VCR"),
   SFVARN(vdc->DCR, "DCR"),
   SFVARN(vdc->SOUR, "SOUR"),
   SFVARN(vdc->DESR, "DESR"),
   SFVARN(vdc->LENR, "LENR"),
   SFVARN(vdc->SATB, "SATB"),

   SFVARN(vdc->RCRCount, "RCRCount"),

   SFVARN(vdc->read_buffer, "read_buffer"),
   SFVARN(vdc->write_latch, "write_latch"),
   SFVARN(vdc->status, "status"),
   SFARRAY16N(vdc->SAT, 0x100, "SAT"),

   SFARRAY16N(vdc->VRAM, 32768, "VRAM"),
   SFVARN(vdc->DMAReadBuffer, "DMAReadBuffer"),
   SFVARN(vdc->DMAReadWrite, "DMAReadWrite"),
   SFVARN(vdc->DMARunning, "DMARunning"),
   SFVARN(vdc->SATBPending, "SATBPending"),
   SFVARN(vdc->burst_mode, "burst_mode"),

   SFVARN(vdc->BG_YOffset, "BG_YOffset"),
   SFVARN(vdc->BG_XOffset, "BG_XOffset"),
   SFEND
  };

  ret &= MDFNSS_StateAction(sm, load, data_only, VDC_StateRegs, chip ? "VDC1" : "VDC0");

  // Nothing further to do when saving.
  if(!load)
   continue;

  // Freshly loaded data: rebuild the derived caches.
  for(int tile = 0; tile < 32768; tile++)
  {
   FixTileCache(tile);
  }
  for(int entry = 0; entry < 512; entry++)
  {
   FixPCache(entry);
  }
 }

 return ret;
}

