PCem

changeset 104:41134e597e51

S3D optimisations - between 15% and 30% faster on my machine.
author TomW
date Wed Jun 04 19:45:12 2014 +0100
parents 354491040ce1
children eb624a751863
files src/ibm.h src/pc.rc src/vid_s3_virge.c src/win.c
diffstat 4 files changed, 341 insertions(+), 174 deletions(-) [+]
line diff
     1.1 --- a/src/ibm.h	Mon May 26 18:13:23 2014 +0100
     1.2 +++ b/src/ibm.h	Wed Jun 04 19:45:12 2014 +0100
     1.3 @@ -455,3 +455,6 @@
     1.4  
     1.5  
     1.6  uint8_t *vramp;
     1.7 +
     1.8 +uint64_t timer_read();
     1.9 +extern uint64_t timer_freq;
     2.1 --- a/src/pc.rc	Mon May 26 18:13:23 2014 +0100
     2.2 +++ b/src/pc.rc	Wed Jun 04 19:45:12 2014 +0100
     2.3 @@ -155,7 +155,7 @@
     2.4      LTEXT           "", IDC_TEXT1, 7, 38, 136, 12
     2.5  END
     2.6  
     2.7 -StatusDlg DIALOGEX 0,0,186,186
     2.8 +StatusDlg DIALOGEX 0,0,186,186+20
     2.9  STYLE DS_SETFONT | DS_MODALFRAME | DS_FIXEDSYS | WS_POPUP | WS_CAPTION | WS_SYSMENU
    2.10  CAPTION "Status"
    2.11  FONT 8, "MS Sans Serif"
     3.1 --- a/src/vid_s3_virge.c	Mon May 26 18:13:23 2014 +0100
     3.2 +++ b/src/vid_s3_virge.c	Wed Jun 04 19:45:12 2014 +0100
     3.3 @@ -11,6 +11,10 @@
     3.4  #include "vid_svga.h"
     3.5  #include "vid_svga_render.h"
     3.6  
     3.7 +static uint64_t virge_time = 0;
     3.8 +static uint64_t status_time = 0;
     3.9 +static int reg_writes = 0;
    3.10 +
    3.11  typedef struct virge_t
    3.12  {
    3.13          mem_mapping_t   linear_mapping;
    3.14 @@ -559,6 +563,7 @@
    3.15  
    3.16  static uint8_t s3_virge_mmio_read(uint32_t addr, void *p)
    3.17  {
    3.18 +        reg_writes++;
    3.19  //        pclog("New MMIO readb %08X\n", addr);
    3.20          switch (addr & 0xffff)
    3.21          {
    3.22 @@ -580,6 +585,7 @@
    3.23  }
    3.24  static uint16_t s3_virge_mmio_read_w(uint32_t addr, void *p)
    3.25  {
    3.26 +        reg_writes++;
    3.27  //        pclog("New MMIO readw %08X\n", addr);
    3.28          switch (addr & 0xfffe)
    3.29          {
    3.30 @@ -592,6 +598,7 @@
    3.31  {
    3.32          virge_t *virge = (virge_t *)p;
    3.33          uint32_t ret = 0xffffffff;
    3.34 +        reg_writes++;
    3.35  //        pclog("New MMIO readl %08X %04X(%08X):%08X  ", addr, CS, cs, pc);
    3.36          switch (addr & 0xfffc)
    3.37          {
    3.38 @@ -723,7 +730,7 @@
    3.39          svga_t *svga = &virge->svga;
    3.40          
    3.41  //        pclog("New MMIO writeb %08X %02X %04x(%08x):%08x\n", addr, val, CS, cs, pc);
    3.42 -       
    3.43 +        reg_writes++;       
    3.44          if ((addr & 0xfffc) < 0x8000)
    3.45                  s3_virge_bitblt(virge, 8, val);
    3.46          else switch (addr & 0xffff)
    3.47 @@ -749,6 +756,7 @@
    3.48  static void s3_virge_mmio_write_w(uint32_t addr, uint16_t val, void *p)
    3.49  {
    3.50          virge_t *virge = (virge_t *)p;
    3.51 +        reg_writes++;
    3.52  //        pclog("New MMIO writew %08X %04X %04x(%08x):%08x\n", addr, val, CS, cs, pc);
    3.53          if ((addr & 0xfffc) < 0x8000)
    3.54          {
    3.55 @@ -769,6 +777,7 @@
    3.56  {
    3.57          virge_t *virge = (virge_t *)p;
    3.58          svga_t *svga = &virge->svga;
    3.59 +        reg_writes++;
    3.60  //        if ((addr & 0xfffc) >= 0x8000)
    3.61  //                pclog("New MMIO writel %08X %08X %04x(%08x):%08x\n", addr, val, CS, cs, pc);
    3.62  
    3.63 @@ -1533,6 +1542,11 @@
    3.64  
    3.65  #define RGB24(r, g, b) ((b) | ((g) << 8) | ((r) << 16))
    3.66  
    3.67 +typedef struct rgba_t
    3.68 +{
    3.69 +        int r, g, b, a;
    3.70 +} rgba_t;
    3.71 +
    3.72  typedef struct s3d_state_t
    3.73  {
    3.74          int32_t r, g, b, a, u, v, d, w;
    3.75 @@ -1552,6 +1566,8 @@
    3.76          
    3.77          int32_t x1, x2;
    3.78          int y;
    3.79 +        
    3.80 +        rgba_t dest_rgba;
    3.81  } s3d_state_t;
    3.82  
    3.83  typedef struct s3d_texture_state_t
    3.84 @@ -1562,62 +1578,96 @@
    3.85          int32_t u, v;
    3.86  } s3d_texture_state_t;
    3.87  
    3.88 -static void (*tex_read)(s3d_state_t *state, s3d_texture_state_t *texture_state, int *r_out, int *g_out, int *b_out, int *a_out);
    3.89 -static void (*tex_sample)(s3d_state_t *state, int *r_out, int *g_out, int *b_out, int *a_out);
    3.90 -static void (*dest_pixel)(s3d_state_t *state, int *r_out, int *g_out, int *b_out, int *a_out);
    3.91 +static void (*tex_read)(s3d_state_t *state, s3d_texture_state_t *texture_state, rgba_t *out);
    3.92 +static void (*tex_sample)(s3d_state_t *state);
    3.93 +static void (*dest_pixel)(s3d_state_t *state);
    3.94  
    3.95  #define MAX(a, b) ((a) > (b) ? (a) : (b))
    3.96  #define MIN(a, b) ((a) < (b) ? (a) : (b))
    3.97  
    3.98  static int _x, _y;
    3.99  
   3.100 -static void tex_ARGB1555(s3d_state_t *state, s3d_texture_state_t *texture_state, int *r_out, int *g_out, int *b_out, int *a_out)
   3.101 +static void tex_ARGB1555(s3d_state_t *state, s3d_texture_state_t *texture_state, rgba_t *out)
   3.102  {
   3.103          int offset = ((texture_state->u & 0x7fc0000) >> texture_state->texture_shift) +
   3.104                       (((texture_state->v & 0x7fc0000) >> texture_state->texture_shift) << texture_state->level);
   3.105          uint16_t val = state->texture[texture_state->level][offset];
   3.106  
   3.107 -        if (((texture_state->u | texture_state->v) & 0xf8000000) == 0xf8000000 && !(state->cmd_set & CMD_SET_TWE))
   3.108 -                val = state->tex_bdr_clr;
   3.109 -
   3.110 -        *r_out = ((val & 0x7c00) >> 7) | ((val & 0x7000) >> 12);
   3.111 -        *g_out = ((val & 0x03e0) >> 2) | ((val & 0x0380) >> 7);
   3.112 -        *b_out = ((val & 0x001f) << 3) | ((val & 0x001c) >> 2);
   3.113 -        *a_out = (val & 0x8000) ? 0xff : 0;
   3.114 +        out->r = ((val & 0x7c00) >> 7) | ((val & 0x7000) >> 12);
   3.115 +        out->g = ((val & 0x03e0) >> 2) | ((val & 0x0380) >> 7);
   3.116 +        out->b = ((val & 0x001f) << 3) | ((val & 0x001c) >> 2);
   3.117 +        out->a = (val & 0x8000) ? 0xff : 0;
   3.118  }
   3.119  
   3.120 -static void tex_ARGB4444(s3d_state_t *state, s3d_texture_state_t *texture_state, int *r_out, int *g_out, int *b_out, int *a_out)
   3.121 +static void tex_ARGB1555_nowrap(s3d_state_t *state, s3d_texture_state_t *texture_state, rgba_t *out)
   3.122  {
   3.123          int offset = ((texture_state->u & 0x7fc0000) >> texture_state->texture_shift) +
   3.124                       (((texture_state->v & 0x7fc0000) >> texture_state->texture_shift) << texture_state->level);
   3.125          uint16_t val = state->texture[texture_state->level][offset];
   3.126  
   3.127 -        if (((texture_state->u | texture_state->v) & 0xf8000000) == 0xf8000000 && !(state->cmd_set & CMD_SET_TWE))
   3.128 +        if (((texture_state->u | texture_state->v) & 0xf8000000) == 0xf8000000)
   3.129                  val = state->tex_bdr_clr;
   3.130  
   3.131 -        *r_out = ((val & 0x0f00) >> 4) | ((val & 0x0f00) >> 8);
   3.132 -        *g_out = (val & 0x00f0) | ((val & 0x00f0) >> 4);
   3.133 -        *b_out = ((val & 0x000f) << 4) | (val & 0x000f);
   3.134 -        *a_out = ((val & 0xf000) >> 8) | ((val & 0xf000) >> 12);
   3.135 -
   3.136 +        out->r = ((val & 0x7c00) >> 7) | ((val & 0x7000) >> 12);
   3.137 +        out->g = ((val & 0x03e0) >> 2) | ((val & 0x0380) >> 7);
   3.138 +        out->b = ((val & 0x001f) << 3) | ((val & 0x001c) >> 2);
   3.139 +        out->a = (val & 0x8000) ? 0xff : 0;
   3.140  }
   3.141  
   3.142 -static void tex_ARGB8888(s3d_state_t *state, s3d_texture_state_t *texture_state, int *r_out, int *g_out, int *b_out, int *a_out)
   3.143 +static void tex_ARGB4444(s3d_state_t *state, s3d_texture_state_t *texture_state, rgba_t *out)
   3.144 +{
   3.145 +        int offset = ((texture_state->u & 0x7fc0000) >> texture_state->texture_shift) +
   3.146 +                     (((texture_state->v & 0x7fc0000) >> texture_state->texture_shift) << texture_state->level);
   3.147 +        uint16_t val = state->texture[texture_state->level][offset];
   3.148 +
   3.149 +        out->r = ((val & 0x0f00) >> 4) | ((val & 0x0f00) >> 8);
   3.150 +        out->g = (val & 0x00f0) | ((val & 0x00f0) >> 4);
   3.151 +        out->b = ((val & 0x000f) << 4) | (val & 0x000f);
   3.152 +        out->a = ((val & 0xf000) >> 8) | ((val & 0xf000) >> 12);
   3.153 +}
   3.154 +
   3.155 +static void tex_ARGB4444_nowrap(s3d_state_t *state, s3d_texture_state_t *texture_state, rgba_t *out)
   3.156 +{
   3.157 +        int offset = ((texture_state->u & 0x7fc0000) >> texture_state->texture_shift) +
   3.158 +                     (((texture_state->v & 0x7fc0000) >> texture_state->texture_shift) << texture_state->level);
   3.159 +        uint16_t val = state->texture[texture_state->level][offset];
   3.160 +
   3.161 +        if (((texture_state->u | texture_state->v) & 0xf8000000) == 0xf8000000)
   3.162 +                val = state->tex_bdr_clr;
   3.163 +
   3.164 +        out->r = ((val & 0x0f00) >> 4) | ((val & 0x0f00) >> 8);
   3.165 +        out->g = (val & 0x00f0) | ((val & 0x00f0) >> 4);
   3.166 +        out->b = ((val & 0x000f) << 4) | (val & 0x000f);
   3.167 +        out->a = ((val & 0xf000) >> 8) | ((val & 0xf000) >> 12);
   3.168 +}
   3.169 +
   3.170 +static void tex_ARGB8888(s3d_state_t *state, s3d_texture_state_t *texture_state, rgba_t *out)
   3.171  {
   3.172          int offset = ((texture_state->u & 0x7fc0000) >> texture_state->texture_shift) +
   3.173                       (((texture_state->v & 0x7fc0000) >> texture_state->texture_shift) << texture_state->level);
   3.174          uint32_t val = ((uint32_t *)state->texture[texture_state->level])[offset];
   3.175  
   3.176 -        if (((texture_state->u | texture_state->v) & 0xf8000000) == 0xf8000000 && !(state->cmd_set & CMD_SET_TWE))
   3.177 +        out->r = (val >> 16) & 0xff;
   3.178 +        out->g = (val >> 8)  & 0xff;
   3.179 +        out->b =  val        & 0xff;
   3.180 +        out->a = (val >> 24) & 0xff;
   3.181 +}
   3.182 +static void tex_ARGB8888_nowrap(s3d_state_t *state, s3d_texture_state_t *texture_state, rgba_t *out)
   3.183 +{
   3.184 +        int offset = ((texture_state->u & 0x7fc0000) >> texture_state->texture_shift) +
   3.185 +                     (((texture_state->v & 0x7fc0000) >> texture_state->texture_shift) << texture_state->level);
   3.186 +        uint32_t val = ((uint32_t *)state->texture[texture_state->level])[offset];
   3.187 +
   3.188 +        if (((texture_state->u | texture_state->v) & 0xf8000000) == 0xf8000000)
   3.189                  val = state->tex_bdr_clr;
   3.190  
   3.191 -        *r_out = (val >> 16) & 0xff;
   3.192 -        *g_out = (val >> 8)  & 0xff;
   3.193 -        *b_out =  val        & 0xff;
   3.194 -        *a_out = (val >> 24) & 0xff;
   3.195 +        out->r = (val >> 16) & 0xff;
   3.196 +        out->g = (val >> 8)  & 0xff;
   3.197 +        out->b =  val        & 0xff;
   3.198 +        out->a = (val >> 24) & 0xff;
   3.199  }
   3.200  
   3.201 -static void tex_sample_normal(s3d_state_t *state, int *r_out, int *g_out, int *b_out, int *a_out)
   3.202 +static void tex_sample_normal(s3d_state_t *state)
   3.203  {
   3.204          s3d_texture_state_t texture_state;
   3.205          
   3.206 @@ -1626,14 +1676,14 @@
   3.207          texture_state.u = state->u + state->tbu;
   3.208          texture_state.v = state->v + state->tbv;
   3.209  
   3.210 -        tex_read(state, &texture_state, r_out, g_out, b_out, a_out);
   3.211 +        tex_read(state, &texture_state, &state->dest_rgba);
   3.212  }
   3.213  
   3.214 -static void tex_sample_normal_filter(s3d_state_t *state, int *r_out, int *g_out, int *b_out, int *a_out)
   3.215 +static void tex_sample_normal_filter(s3d_state_t *state)
   3.216  {
   3.217          s3d_texture_state_t texture_state;
   3.218          int tex_offset;
   3.219 -        int r[4], g[4], b[4], a[4];
   3.220 +        rgba_t tex_samples[4];
   3.221          int du, dv;
   3.222          int d[4];
   3.223  
   3.224 @@ -1643,34 +1693,34 @@
   3.225  
   3.226          texture_state.u = state->u + state->tbu;
   3.227          texture_state.v = state->v + state->tbv;
   3.228 -        tex_read(state, &texture_state, &r[0], &g[0], &b[0], &a[0]);
   3.229 +        tex_read(state, &texture_state, &tex_samples[0]);
   3.230          du = (texture_state.u >> (texture_state.texture_shift - 8)) & 0xff;
   3.231          dv = (texture_state.v >> (texture_state.texture_shift - 8)) & 0xff;
   3.232  
   3.233          texture_state.u = state->u + state->tbu + tex_offset;
   3.234          texture_state.v = state->v + state->tbv;
   3.235 -        tex_read(state, &texture_state, &r[1], &g[1], &b[1], &a[1]);
   3.236 +        tex_read(state, &texture_state, &tex_samples[1]);
   3.237  
   3.238          texture_state.u = state->u + state->tbu;
   3.239          texture_state.v = state->v + state->tbv + tex_offset;
   3.240 -        tex_read(state, &texture_state, &r[2], &g[2], &b[2], &a[2]);
   3.241 +        tex_read(state, &texture_state, &tex_samples[2]);
   3.242  
   3.243          texture_state.u = state->u + state->tbu + tex_offset;
   3.244          texture_state.v = state->v + state->tbv + tex_offset;
   3.245 -        tex_read(state, &texture_state, &r[3], &g[3], &b[3], &a[3]);
   3.246 +        tex_read(state, &texture_state, &tex_samples[3]);
   3.247          
   3.248          d[0] = (256 - du) * (256 - dv);
   3.249          d[1] =  du * (256 - dv);
   3.250          d[2] = (256 - du) * dv;
   3.251          d[3] = du * dv;
   3.252          
   3.253 -        *r_out = (r[0] * d[0] + r[1] * d[1] + r[2] * d[2] + r[3] * d[3]) >> 16;
   3.254 -        *g_out = (g[0] * d[0] + g[1] * d[1] + g[2] * d[2] + g[3] * d[3]) >> 16;
   3.255 -        *b_out = (b[0] * d[0] + b[1] * d[1] + b[2] * d[2] + b[3] * d[3]) >> 16;
   3.256 -        *a_out = (a[0] * d[0] + a[1] * d[1] + a[2] * d[2] + a[3] * d[3]) >> 16;
   3.257 +        state->dest_rgba.r = (tex_samples[0].r * d[0] + tex_samples[1].r * d[1] + tex_samples[2].r * d[2] + tex_samples[3].r * d[3]) >> 16;
   3.258 +        state->dest_rgba.g = (tex_samples[0].g * d[0] + tex_samples[1].g * d[1] + tex_samples[2].g * d[2] + tex_samples[3].g * d[3]) >> 16;
   3.259 +        state->dest_rgba.b = (tex_samples[0].b * d[0] + tex_samples[1].b * d[1] + tex_samples[2].b * d[2] + tex_samples[3].b * d[3]) >> 16;
   3.260 +        state->dest_rgba.a = (tex_samples[0].a * d[0] + tex_samples[1].a * d[1] + tex_samples[2].a * d[2] + tex_samples[3].a * d[3]) >> 16;
   3.261  }
   3.262  
   3.263 -static void tex_sample_mipmap(s3d_state_t *state, int *r_out, int *g_out, int *b_out, int *a_out)
   3.264 +static void tex_sample_mipmap(s3d_state_t *state)
   3.265  {
   3.266          s3d_texture_state_t texture_state;
   3.267  
   3.268 @@ -1679,14 +1729,14 @@
   3.269          texture_state.u = state->u + state->tbu;
   3.270          texture_state.v = state->v + state->tbv;
   3.271  
   3.272 -        tex_read(state, &texture_state, r_out, g_out, b_out, a_out);
   3.273 +        tex_read(state, &texture_state, &state->dest_rgba);
   3.274  }
   3.275  
   3.276 -static void tex_sample_mipmap_filter(s3d_state_t *state, int *r_out, int *g_out, int *b_out, int *a_out)
   3.277 +static void tex_sample_mipmap_filter(s3d_state_t *state)
   3.278  {
   3.279          s3d_texture_state_t texture_state;
   3.280          int tex_offset;
   3.281 -        int r[4], g[4], b[4], a[4];
   3.282 +        rgba_t tex_samples[4];
   3.283          int du, dv;
   3.284          int d[4];
   3.285  
   3.286 @@ -1696,34 +1746,34 @@
   3.287          
   3.288          texture_state.u = state->u + state->tbu;
   3.289          texture_state.v = state->v + state->tbv;
   3.290 -        tex_read(state, &texture_state, &r[0], &g[0], &b[0], &a[0]);
   3.291 +        tex_read(state, &texture_state, &tex_samples[0]);
   3.292          du = (texture_state.u >> (texture_state.texture_shift - 8)) & 0xff;
   3.293          dv = (texture_state.v >> (texture_state.texture_shift - 8)) & 0xff;
   3.294  
   3.295          texture_state.u = state->u + state->tbu + tex_offset;
   3.296          texture_state.v = state->v + state->tbv;
   3.297 -        tex_read(state, &texture_state, &r[1], &g[1], &b[1], &a[1]);
   3.298 +        tex_read(state, &texture_state, &tex_samples[1]);
   3.299  
   3.300          texture_state.u = state->u + state->tbu;
   3.301          texture_state.v = state->v + state->tbv + tex_offset;
   3.302 -        tex_read(state, &texture_state, &r[2], &g[2], &b[2], &a[2]);
   3.303 +        tex_read(state, &texture_state, &tex_samples[2]);
   3.304  
   3.305          texture_state.u = state->u + state->tbu + tex_offset;
   3.306          texture_state.v = state->v + state->tbv + tex_offset;
   3.307 -        tex_read(state, &texture_state, &r[3], &g[3], &b[3], &a[3]);
   3.308 +        tex_read(state, &texture_state, &tex_samples[3]);
   3.309  
   3.310          d[0] = (256 - du) * (256 - dv);
   3.311          d[1] =  du * (256 - dv);
   3.312          d[2] = (256 - du) * dv;
   3.313          d[3] = du * dv;
   3.314          
   3.315 -        *r_out = (r[0] * d[0] + r[1] * d[1] + r[2] * d[2] + r[3] * d[3]) >> 16;
   3.316 -        *g_out = (g[0] * d[0] + g[1] * d[1] + g[2] * d[2] + g[3] * d[3]) >> 16;
   3.317 -        *b_out = (b[0] * d[0] + b[1] * d[1] + b[2] * d[2] + b[3] * d[3]) >> 16;
   3.318 -        *a_out = (a[0] * d[0] + a[1] * d[1] + a[2] * d[2] + a[3] * d[3]) >> 16;
   3.319 +        state->dest_rgba.r = (tex_samples[0].r * d[0] + tex_samples[1].r * d[1] + tex_samples[2].r * d[2] + tex_samples[3].r * d[3]) >> 16;
   3.320 +        state->dest_rgba.g = (tex_samples[0].g * d[0] + tex_samples[1].g * d[1] + tex_samples[2].g * d[2] + tex_samples[3].g * d[3]) >> 16;
   3.321 +        state->dest_rgba.b = (tex_samples[0].b * d[0] + tex_samples[1].b * d[1] + tex_samples[2].b * d[2] + tex_samples[3].b * d[3]) >> 16;
   3.322 +        state->dest_rgba.a = (tex_samples[0].a * d[0] + tex_samples[1].a * d[1] + tex_samples[2].a * d[2] + tex_samples[3].a * d[3]) >> 16;
   3.323  }
   3.324  
   3.325 -static void tex_sample_persp_normal(s3d_state_t *state, int *r_out, int *g_out, int *b_out, int *a_out)
   3.326 +static void tex_sample_persp_normal(s3d_state_t *state)
   3.327  {
   3.328          s3d_texture_state_t texture_state;
   3.329          int32_t w = 0;
   3.330 @@ -1736,15 +1786,15 @@
   3.331          texture_state.u = (int32_t)(((int64_t)state->u * (int64_t)w) >> (12 + state->max_d)) + state->tbu;
   3.332          texture_state.v = (int32_t)(((int64_t)state->v * (int64_t)w) >> (12 + state->max_d)) + state->tbv;
   3.333  
   3.334 -        tex_read(state, &texture_state, r_out, g_out, b_out, a_out);
   3.335 +        tex_read(state, &texture_state, &state->dest_rgba);
   3.336  }
   3.337  
   3.338 -static void tex_sample_persp_normal_filter(s3d_state_t *state, int *r_out, int *g_out, int *b_out, int *a_out)
   3.339 +static void tex_sample_persp_normal_filter(s3d_state_t *state)
   3.340  {
   3.341          s3d_texture_state_t texture_state;
   3.342          int32_t w = 0, u, v;
   3.343          int tex_offset;
   3.344 -        int r[4], g[4], b[4], a[4];
   3.345 +        rgba_t tex_samples[4];
   3.346          int du, dv;
   3.347          int d[4];
   3.348  
   3.349 @@ -1760,34 +1810,34 @@
   3.350          
   3.351          texture_state.u = u;
   3.352          texture_state.v = v;
   3.353 -        tex_read(state, &texture_state, &r[0], &g[0], &b[0], &a[0]);
   3.354 +        tex_read(state, &texture_state, &tex_samples[0]);
   3.355          du = (u >> (texture_state.texture_shift - 8)) & 0xff;
   3.356          dv = (v >> (texture_state.texture_shift - 8)) & 0xff;
   3.357  
   3.358          texture_state.u = u + tex_offset;
   3.359          texture_state.v = v;
   3.360 -        tex_read(state, &texture_state, &r[1], &g[1], &b[1], &a[1]);
   3.361 +        tex_read(state, &texture_state, &tex_samples[1]);
   3.362  
   3.363          texture_state.u = u;
   3.364          texture_state.v = v + tex_offset;
   3.365 -        tex_read(state, &texture_state, &r[2], &g[2], &b[2], &a[2]);
   3.366 +        tex_read(state, &texture_state, &tex_samples[2]);
   3.367  
   3.368          texture_state.u = u + tex_offset;
   3.369          texture_state.v = v + tex_offset;
   3.370 -        tex_read(state, &texture_state, &r[3], &g[3], &b[3], &a[3]);
   3.371 +        tex_read(state, &texture_state, &tex_samples[3]);
   3.372  
   3.373          d[0] = (256 - du) * (256 - dv);
   3.374          d[1] =  du * (256 - dv);
   3.375          d[2] = (256 - du) * dv;
   3.376          d[3] = du * dv;
   3.377          
   3.378 -        *r_out = (r[0] * d[0] + r[1] * d[1] + r[2] * d[2] + r[3] * d[3]) >> 16;
   3.379 -        *g_out = (g[0] * d[0] + g[1] * d[1] + g[2] * d[2] + g[3] * d[3]) >> 16;
   3.380 -        *b_out = (b[0] * d[0] + b[1] * d[1] + b[2] * d[2] + b[3] * d[3]) >> 16;
   3.381 -        *a_out = (a[0] * d[0] + a[1] * d[1] + a[2] * d[2] + a[3] * d[3]) >> 16;
   3.382 +        state->dest_rgba.r = (tex_samples[0].r * d[0] + tex_samples[1].r * d[1] + tex_samples[2].r * d[2] + tex_samples[3].r * d[3]) >> 16;
   3.383 +        state->dest_rgba.g = (tex_samples[0].g * d[0] + tex_samples[1].g * d[1] + tex_samples[2].g * d[2] + tex_samples[3].g * d[3]) >> 16;
   3.384 +        state->dest_rgba.b = (tex_samples[0].b * d[0] + tex_samples[1].b * d[1] + tex_samples[2].b * d[2] + tex_samples[3].b * d[3]) >> 16;
   3.385 +        state->dest_rgba.a = (tex_samples[0].a * d[0] + tex_samples[1].a * d[1] + tex_samples[2].a * d[2] + tex_samples[3].a * d[3]) >> 16;
   3.386  }
   3.387  
   3.388 -static void tex_sample_persp_normal_375(s3d_state_t *state, int *r_out, int *g_out, int *b_out, int *a_out)
   3.389 +static void tex_sample_persp_normal_375(s3d_state_t *state)
   3.390  {
   3.391          s3d_texture_state_t texture_state;
   3.392          int32_t w = 0;
   3.393 @@ -1800,15 +1850,15 @@
   3.394          texture_state.u = (int32_t)(((int64_t)state->u * (int64_t)w) >> (8 + state->max_d)) + state->tbu;
   3.395          texture_state.v = (int32_t)(((int64_t)state->v * (int64_t)w) >> (8 + state->max_d)) + state->tbv;
   3.396  
   3.397 -        tex_read(state, &texture_state, r_out, g_out, b_out, a_out);
   3.398 +        tex_read(state, &texture_state, &state->dest_rgba);
   3.399  }
   3.400  
   3.401 -static void tex_sample_persp_normal_filter_375(s3d_state_t *state, int *r_out, int *g_out, int *b_out, int *a_out)
   3.402 +static void tex_sample_persp_normal_filter_375(s3d_state_t *state)
   3.403  {
   3.404          s3d_texture_state_t texture_state;
   3.405          int32_t w = 0, u, v;
   3.406          int tex_offset;
   3.407 -        int r[4], g[4], b[4], a[4];
   3.408 +        rgba_t tex_samples[4];
   3.409          int du, dv;
   3.410          int d[4];
   3.411  
   3.412 @@ -1824,35 +1874,35 @@
   3.413  
   3.414          texture_state.u = u;
   3.415          texture_state.v = v;
   3.416 -        tex_read(state, &texture_state, &r[0], &g[0], &b[0], &a[0]);
   3.417 +        tex_read(state, &texture_state, &tex_samples[0]);
   3.418          du = (u >> (texture_state.texture_shift - 8)) & 0xff;
   3.419          dv = (v >> (texture_state.texture_shift - 8)) & 0xff;
   3.420  
   3.421          texture_state.u = u + tex_offset;
   3.422          texture_state.v = v;
   3.423 -        tex_read(state, &texture_state, &r[1], &g[1], &b[1], &a[1]);
   3.424 +        tex_read(state, &texture_state, &tex_samples[1]);
   3.425  
   3.426          texture_state.u = u;
   3.427          texture_state.v = v + tex_offset;
   3.428 -        tex_read(state, &texture_state, &r[2], &g[2], &b[2], &a[2]);
   3.429 +        tex_read(state, &texture_state, &tex_samples[2]);
   3.430  
   3.431          texture_state.u = u + tex_offset;
   3.432          texture_state.v = v + tex_offset;
   3.433 -        tex_read(state, &texture_state, &r[3], &g[3], &b[3], &a[3]);
   3.434 +        tex_read(state, &texture_state, &tex_samples[3]);
   3.435  
   3.436          d[0] = (256 - du) * (256 - dv);
   3.437          d[1] =  du * (256 - dv);
   3.438          d[2] = (256 - du) * dv;
   3.439          d[3] = du * dv;
   3.440          
   3.441 -        *r_out = (r[0] * d[0] + r[1] * d[1] + r[2] * d[2] + r[3] * d[3]) >> 16;
   3.442 -        *g_out = (g[0] * d[0] + g[1] * d[1] + g[2] * d[2] + g[3] * d[3]) >> 16;
   3.443 -        *b_out = (b[0] * d[0] + b[1] * d[1] + b[2] * d[2] + b[3] * d[3]) >> 16;
   3.444 -        *a_out = (a[0] * d[0] + a[1] * d[1] + a[2] * d[2] + a[3] * d[3]) >> 16;
   3.445 +        state->dest_rgba.r = (tex_samples[0].r * d[0] + tex_samples[1].r * d[1] + tex_samples[2].r * d[2] + tex_samples[3].r * d[3]) >> 16;
   3.446 +        state->dest_rgba.g = (tex_samples[0].g * d[0] + tex_samples[1].g * d[1] + tex_samples[2].g * d[2] + tex_samples[3].g * d[3]) >> 16;
   3.447 +        state->dest_rgba.b = (tex_samples[0].b * d[0] + tex_samples[1].b * d[1] + tex_samples[2].b * d[2] + tex_samples[3].b * d[3]) >> 16;
   3.448 +        state->dest_rgba.a = (tex_samples[0].a * d[0] + tex_samples[1].a * d[1] + tex_samples[2].a * d[2] + tex_samples[3].a * d[3]) >> 16;
   3.449  }
   3.450  
   3.451  
   3.452 -static void tex_sample_persp_mipmap(s3d_state_t *state, int *r_out, int *g_out, int *b_out, int *a_out)
   3.453 +static void tex_sample_persp_mipmap(s3d_state_t *state)
   3.454  {
   3.455          s3d_texture_state_t texture_state;
   3.456          int32_t w = 0;
   3.457 @@ -1865,15 +1915,15 @@
   3.458          texture_state.u = (int32_t)(((int64_t)state->u * (int64_t)w) >> (12 + state->max_d)) + state->tbu;
   3.459          texture_state.v = (int32_t)(((int64_t)state->v * (int64_t)w) >> (12 + state->max_d)) + state->tbv;
   3.460  
   3.461 -        tex_read(state, &texture_state, r_out, g_out, b_out, a_out);
   3.462 +        tex_read(state, &texture_state, &state->dest_rgba);
   3.463  }
   3.464  
   3.465 -static void tex_sample_persp_mipmap_filter(s3d_state_t *state, int *r_out, int *g_out, int *b_out, int *a_out)
   3.466 +static void tex_sample_persp_mipmap_filter(s3d_state_t *state)
   3.467  {
   3.468          s3d_texture_state_t texture_state;
   3.469          int32_t w = 0, u, v;
   3.470          int tex_offset;
   3.471 -        int r[4], g[4], b[4], a[4];
   3.472 +        rgba_t tex_samples[4];
   3.473          int du, dv;
   3.474          int d[4];
   3.475  
   3.476 @@ -1889,34 +1939,34 @@
   3.477  
   3.478          texture_state.u = u;
   3.479          texture_state.v = v;
   3.480 -        tex_read(state, &texture_state, &r[0], &g[0], &b[0], &a[0]);
   3.481 +        tex_read(state, &texture_state, &tex_samples[0]);
   3.482          du = (u >> (texture_state.texture_shift - 8)) & 0xff;
   3.483          dv = (v >> (texture_state.texture_shift - 8)) & 0xff;
   3.484  
   3.485          texture_state.u = u + tex_offset;
   3.486          texture_state.v = v;
   3.487 -        tex_read(state, &texture_state, &r[1], &g[1], &b[1], &a[1]);
   3.488 +        tex_read(state, &texture_state, &tex_samples[1]);
   3.489  
   3.490          texture_state.u = u;
   3.491          texture_state.v = v + tex_offset;
   3.492 -        tex_read(state, &texture_state, &r[2], &g[2], &b[2], &a[2]);
   3.493 +        tex_read(state, &texture_state, &tex_samples[2]);
   3.494  
   3.495          texture_state.u = u + tex_offset;
   3.496          texture_state.v = v + tex_offset;
   3.497 -        tex_read(state, &texture_state, &r[3], &g[3], &b[3], &a[3]);
   3.498 +        tex_read(state, &texture_state, &tex_samples[3]);
   3.499  
   3.500          d[0] = (256 - du) * (256 - dv);
   3.501          d[1] =  du * (256 - dv);
   3.502          d[2] = (256 - du) * dv;
   3.503          d[3] = du * dv;
   3.504          
   3.505 -        *r_out = (r[0] * d[0] + r[1] * d[1] + r[2] * d[2] + r[3] * d[3]) >> 16;
   3.506 -        *g_out = (g[0] * d[0] + g[1] * d[1] + g[2] * d[2] + g[3] * d[3]) >> 16;
   3.507 -        *b_out = (b[0] * d[0] + b[1] * d[1] + b[2] * d[2] + b[3] * d[3]) >> 16;
   3.508 -        *a_out = (a[0] * d[0] + a[1] * d[1] + a[2] * d[2] + a[3] * d[3]) >> 16;
   3.509 +        state->dest_rgba.r = (tex_samples[0].r * d[0] + tex_samples[1].r * d[1] + tex_samples[2].r * d[2] + tex_samples[3].r * d[3]) >> 16;
   3.510 +        state->dest_rgba.g = (tex_samples[0].g * d[0] + tex_samples[1].g * d[1] + tex_samples[2].g * d[2] + tex_samples[3].g * d[3]) >> 16;
   3.511 +        state->dest_rgba.b = (tex_samples[0].b * d[0] + tex_samples[1].b * d[1] + tex_samples[2].b * d[2] + tex_samples[3].b * d[3]) >> 16;
   3.512 +        state->dest_rgba.a = (tex_samples[0].a * d[0] + tex_samples[1].a * d[1] + tex_samples[2].a * d[2] + tex_samples[3].a * d[3]) >> 16;
   3.513  }
   3.514  
   3.515 -static void tex_sample_persp_mipmap_375(s3d_state_t *state, int *r_out, int *g_out, int *b_out, int *a_out)
   3.516 +static void tex_sample_persp_mipmap_375(s3d_state_t *state)
   3.517  {
   3.518          s3d_texture_state_t texture_state;
   3.519          int32_t w = 0;
   3.520 @@ -1929,15 +1979,15 @@
   3.521          texture_state.u = (int32_t)(((int64_t)state->u * (int64_t)w) >> (8 + state->max_d)) + state->tbu;
   3.522          texture_state.v = (int32_t)(((int64_t)state->v * (int64_t)w) >> (8 + state->max_d)) + state->tbv;
   3.523  
   3.524 -        tex_read(state, &texture_state, r_out, g_out, b_out, a_out);
   3.525 +        tex_read(state, &texture_state, &state->dest_rgba);
   3.526  }
   3.527  
   3.528 -static void tex_sample_persp_mipmap_filter_375(s3d_state_t *state, int *r_out, int *g_out, int *b_out, int *a_out)
   3.529 +static void tex_sample_persp_mipmap_filter_375(s3d_state_t *state)
   3.530  {
   3.531          s3d_texture_state_t texture_state;
   3.532          int32_t w = 0, u, v;
   3.533          int tex_offset;
   3.534 -        int r[4], g[4], b[4], a[4];
   3.535 +        rgba_t tex_samples[4];
   3.536          int du, dv;
   3.537          int d[4];
   3.538  
   3.539 @@ -1953,55 +2003,51 @@
   3.540          
   3.541          texture_state.u = u;
   3.542          texture_state.v = v;
   3.543 -        tex_read(state, &texture_state, &r[0], &g[0], &b[0], &a[0]);
   3.544 +        tex_read(state, &texture_state, &tex_samples[0]);
   3.545          du = (u >> (texture_state.texture_shift - 8)) & 0xff;
   3.546          dv = (v >> (texture_state.texture_shift - 8)) & 0xff;
   3.547  
   3.548          texture_state.u = u + tex_offset;
   3.549          texture_state.v = v;
   3.550 -        tex_read(state, &texture_state, &r[1], &g[1], &b[1], &a[1]);
   3.551 +        tex_read(state, &texture_state, &tex_samples[1]);
   3.552  
   3.553          texture_state.u = u;
   3.554          texture_state.v = v + tex_offset;
   3.555 -        tex_read(state, &texture_state, &r[2], &g[2], &b[2], &a[2]);
   3.556 +        tex_read(state, &texture_state, &tex_samples[2]);
   3.557  
   3.558          texture_state.u = u + tex_offset;
   3.559          texture_state.v = v + tex_offset;
   3.560 -        tex_read(state, &texture_state, &r[3], &g[3], &b[3], &a[3]);
   3.561 +        tex_read(state, &texture_state, &tex_samples[3]);
   3.562  
   3.563          d[0] = (256 - du) * (256 - dv);
   3.564          d[1] =  du * (256 - dv);
   3.565          d[2] = (256 - du) * dv;
   3.566          d[3] = du * dv;
   3.567          
   3.568 -        *r_out = (r[0] * d[0] + r[1] * d[1] + r[2] * d[2] + r[3] * d[3]) >> 16;
   3.569 -        *g_out = (g[0] * d[0] + g[1] * d[1] + g[2] * d[2] + g[3] * d[3]) >> 16;
   3.570 -        *b_out = (b[0] * d[0] + b[1] * d[1] + b[2] * d[2] + b[3] * d[3]) >> 16;
   3.571 -        *a_out = (a[0] * d[0] + a[1] * d[1] + a[2] * d[2] + a[3] * d[3]) >> 16;
   3.572 +        state->dest_rgba.r = (tex_samples[0].r * d[0] + tex_samples[1].r * d[1] + tex_samples[2].r * d[2] + tex_samples[3].r * d[3]) >> 16;
   3.573 +        state->dest_rgba.g = (tex_samples[0].g * d[0] + tex_samples[1].g * d[1] + tex_samples[2].g * d[2] + tex_samples[3].g * d[3]) >> 16;
   3.574 +        state->dest_rgba.b = (tex_samples[0].b * d[0] + tex_samples[1].b * d[1] + tex_samples[2].b * d[2] + tex_samples[3].b * d[3]) >> 16;
   3.575 +        state->dest_rgba.a = (tex_samples[0].a * d[0] + tex_samples[1].a * d[1] + tex_samples[2].a * d[2] + tex_samples[3].a * d[3]) >> 16;
   3.576  }
   3.577  
   3.578  
   3.579 -#define CLAMP_RGBA(r, g, b, a) do       \
   3.580 -        {                               \
   3.581 -                if ((r) < 0)            \
   3.582 -                        r = 0;          \
   3.583 -                if ((r) > 0xff)         \
   3.584 -                        r = 0xff;       \
   3.585 -                if ((g) < 0)            \
   3.586 -                        g = 0;          \
   3.587 -                if ((g) > 0xff)         \
   3.588 -                        g = 0xff;       \
   3.589 -                if ((b) < 0)            \
   3.590 -                        b = 0;          \
   3.591 -                if ((b) > 0xff)         \
   3.592 -                        b = 0xff;       \
   3.593 -                if ((a) < 0)            \
   3.594 -                        a = 0;          \
   3.595 -                if ((a) > 0xff)         \
   3.596 -                        a = 0xff;       \
   3.597 +#define CLAMP(x) do                                     \
   3.598 +        {                                               \
   3.599 +                if ((x) & ~0xff)                        \
   3.600 +                        x = ((x) < 0) ? 0 : 0xff;       \
   3.601          }                               \
   3.602          while (0)
   3.603  
   3.604 +#define CLAMP_RGBA(r, g, b, a)        \
   3.605 +                if ((r) & ~0xff)                        \
   3.606 +                        r = ((r) < 0) ? 0 : 0xff;       \
   3.607 +                if ((g) & ~0xff)                        \
   3.608 +                        g = ((g) < 0) ? 0 : 0xff;       \
   3.609 +                if ((b) & ~0xff)                        \
   3.610 +                        b = ((b) < 0) ? 0 : 0xff;       \
   3.611 +                if ((a) & ~0xff)                        \
   3.612 +                        a = ((a) < 0) ? 0 : 0xff;
   3.613 +        
   3.614  #define CLAMP_RGB(r, g, b) do           \
   3.615          {                               \
   3.616                  if ((r) < 0)            \
   3.617 @@ -2019,67 +2065,64 @@
   3.618          }                               \
   3.619          while (0)
   3.620  
   3.621 -static void dest_pixel_gouraud_shaded_triangle(s3d_state_t *state, int *r_out, int *g_out, int *b_out, int *a_out)
   3.622 +static void dest_pixel_gouraud_shaded_triangle(s3d_state_t *state)
   3.623  {
   3.624 -        *r_out = state->r >> 7;
   3.625 -        *g_out = state->g >> 7;
   3.626 -        *b_out = state->b >> 7;
   3.627 -        *a_out = state->a >> 7;
   3.628 -        CLAMP_RGBA(*r_out, *g_out, *b_out, *a_out);
   3.629 +        state->dest_rgba.r = state->r >> 7;
   3.630 +        CLAMP(state->dest_rgba.r);
   3.631 +
   3.632 +        state->dest_rgba.g = state->g >> 7;
   3.633 +        CLAMP(state->dest_rgba.g);
   3.634 +
   3.635 +        state->dest_rgba.b = state->b >> 7;
   3.636 +        CLAMP(state->dest_rgba.b);
   3.637 +
   3.638 +        state->dest_rgba.a = state->a >> 7;
   3.639 +        CLAMP(state->dest_rgba.a);
   3.640  }
   3.641  
   3.642 -static void dest_pixel_unlit_texture_triangle(s3d_state_t *state, int *r_out, int *g_out, int *b_out, int *a_out)
   3.643 +static void dest_pixel_unlit_texture_triangle(s3d_state_t *state)
   3.644  {
   3.645 -        tex_sample(state, r_out, g_out, b_out, a_out);
   3.646 +        tex_sample(state);
   3.647  
   3.648          if (state->cmd_set & CMD_SET_ABC_SRC)
   3.649 -                *a_out = state->a >> 7;
   3.650 +                state->dest_rgba.a = state->a >> 7;
   3.651  }
   3.652  
   3.653 -static void dest_pixel_lit_texture_decal(s3d_state_t *state, int *r_out, int *g_out, int *b_out, int *a_out)
   3.654 +static void dest_pixel_lit_texture_decal(s3d_state_t *state)
   3.655  {
   3.656 -        tex_sample(state, r_out, g_out, b_out, a_out);
   3.657 +        tex_sample(state);
   3.658  
   3.659          if (state->cmd_set & CMD_SET_ABC_SRC)
   3.660 -                *a_out = state->a >> 7;
   3.661 +                state->dest_rgba.a = state->a >> 7;
   3.662  }
   3.663  
   3.664 -static void dest_pixel_lit_texture_reflection(s3d_state_t *state, int *r_out, int *g_out, int *b_out, int *a_out)
   3.665 +static void dest_pixel_lit_texture_reflection(s3d_state_t *state)
   3.666  {
   3.667 -        int tex_r, tex_g, tex_b, tex_a;
   3.668 -        
   3.669 -        tex_sample(state, &tex_r, &tex_g, &tex_b, &tex_a);
   3.670 +        tex_sample(state);
   3.671  
   3.672 -        *r_out = state->r >> 7;
   3.673 -        *g_out = state->g >> 7;
   3.674 -        *b_out = state->b >> 7;
   3.675 -        *a_out = state->a >> 7;
   3.676 -        CLAMP_RGBA(*r_out, *g_out, *b_out, *a_out);
   3.677 +        state->dest_rgba.r += (state->r >> 7);
   3.678 +        state->dest_rgba.g += (state->g >> 7);
   3.679 +        state->dest_rgba.b += (state->b >> 7);
   3.680 +        if (state->cmd_set & CMD_SET_ABC_SRC)
   3.681 +                state->dest_rgba.a += (state->a >> 7);
   3.682  
   3.683 -        *(r_out) += tex_r;
   3.684 -        *(g_out) += tex_g;
   3.685 -        *(b_out) += tex_b;
   3.686 -
   3.687 -        CLAMP_RGB(*r_out, *g_out, *b_out);
   3.688 -
   3.689 -        if (!(state->cmd_set & CMD_SET_ABC_SRC))
   3.690 -                *a_out = tex_a;
   3.691 +        CLAMP_RGBA(state->dest_rgba.r, state->dest_rgba.g, state->dest_rgba.b, state->dest_rgba.a);
   3.692  }
   3.693  
   3.694 -static void dest_pixel_lit_texture_modulate(s3d_state_t *state, int *r_out, int *g_out, int *b_out, int *a_out)
   3.695 +static void dest_pixel_lit_texture_modulate(s3d_state_t *state)
   3.696  {
   3.697          int r = state->r >> 7, g = state->g >> 7, b = state->b >> 7, a = state->a >> 7;
   3.698          
   3.699 -        tex_sample(state, r_out, g_out, b_out, a_out);
   3.700 +        tex_sample(state);
   3.701          
   3.702          CLAMP_RGBA(r, g, b, a);
   3.703          
   3.704 -        *r_out = ((*r_out) * r) >> 8;
   3.705 -        *g_out = ((*g_out) * g) >> 8;
   3.706 -        *b_out = ((*b_out) * b) >> 8;
   3.707 +        state->dest_rgba.r = ((state->dest_rgba.r) * r) >> 8;
   3.708 +        state->dest_rgba.g = ((state->dest_rgba.g) * g) >> 8;
   3.709 +        state->dest_rgba.b = ((state->dest_rgba.b) * b) >> 8;
   3.710  
   3.711          if (state->cmd_set & CMD_SET_ABC_SRC)
   3.712 -                *a_out = a;               
   3.713 +                state->dest_rgba.a = a;
   3.714  }
   3.715  
   3.716  static void tri(virge_t *virge, s3d_state_t *state, int yc, int32_t dx1, int32_t dx2)
   3.717 @@ -2094,9 +2137,40 @@
   3.718          
   3.719          int bpp = 1;
   3.720          
   3.721 -        uint32_t dest_offset = virge->s3d.dest_base + (state->y * virge->s3d.dest_str);
   3.722 -        uint32_t z_offset = virge->s3d.z_base + (state->y * virge->s3d.z_str);
   3.723 -                
   3.724 +        uint32_t dest_offset, z_offset;
   3.725 +
   3.726 +        if (virge->s3d.cmd_set & CMD_SET_HC)
   3.727 +        {
   3.728 +                if (state->y < virge->s3d.clip_t)
   3.729 +                        return;
   3.730 +                if (state->y > virge->s3d.clip_b)
   3.731 +                {
   3.732 +                        int diff_y = state->y - virge->s3d.clip_b;
   3.733 +                        
   3.734 +                        if (diff_y > y_count)
   3.735 +                                diff_y = y_count;
   3.736 +                        
   3.737 +                        state->base_u += (virge->s3d.TdUdY * diff_y);
   3.738 +                        state->base_v += (virge->s3d.TdVdY * diff_y);
   3.739 +                        state->base_z += (virge->s3d.TdZdY * diff_y);
   3.740 +                        state->base_r += (virge->s3d.TdRdY * diff_y);
   3.741 +                        state->base_g += (virge->s3d.TdGdY * diff_y);
   3.742 +                        state->base_b += (virge->s3d.TdBdY * diff_y);
   3.743 +                        state->base_a += (virge->s3d.TdAdY * diff_y);
   3.744 +                        state->base_d += (virge->s3d.TdDdY * diff_y);
   3.745 +                        state->base_w += (virge->s3d.TdWdY * diff_y);
   3.746 +                        state->x1 += (dx1 * diff_y);
   3.747 +                        state->x2 += (dx2 * diff_y);
   3.748 +                        state->y -= diff_y;
   3.749 +                        dest_offset -= virge->s3d.dest_str;
   3.750 +                        z_offset -= virge->s3d.z_str;
   3.751 +                        y_count -= diff_y;
   3.752 +                }
   3.753 +        }
   3.754 +
   3.755 +        dest_offset = virge->s3d.dest_base + (state->y * virge->s3d.dest_str);
   3.756 +        z_offset = virge->s3d.z_base + (state->y * virge->s3d.z_str);
   3.757 +        
   3.758          for (; y_count > 0; y_count--)
   3.759          {
   3.760                  int x = state->x1 >> 20;
   3.761 @@ -2104,7 +2178,10 @@
   3.762                  uint32_t z = state->base_z;
   3.763                  if (x != xe && (x_dir > 0 && x < xe) || (x_dir < 0 && x > xe))
   3.764                  {
   3.765 +                        uint32_t dest_addr, z_addr;
   3.766                          int dx = (x_dir > 0) ? 8 - ((state->x1 >> 16) & 0xf) : ((state->x1 >> 16) & 0xf) - 8;
   3.767 +                        int x_offset = x_dir << bpp;
   3.768 +                        
   3.769                          state->r = state->base_r + ((virge->s3d.TdRdX * dx) >> 4);
   3.770                          state->g = state->base_g + ((virge->s3d.TdGdX * dx) >> 4);
   3.771                          state->b = state->base_b + ((virge->s3d.TdBdX * dx) >> 4);
   3.772 @@ -2116,10 +2193,67 @@
   3.773                          z += ((virge->s3d.TdZdX * dx) >> 4);
   3.774  //                        pclog("Draw Y=%i X=%i to XE=%i  %i   %08x %08x %08x %08x  %08x %08x %08x %08x  %i %08x\n", state->y, x, xe, dx, state->x1, state->x2, dx1, virge->s3d.TdWdX, state->u, state->v, virge->s3d.TdUdX, virge->s3d.TdUdY, dx, (virge->s3d.TdUdX * dx) >> 4);
   3.775  
   3.776 +                        if (virge->s3d.cmd_set & CMD_SET_HC)
   3.777 +                        {
   3.778 +                                if (x_dir > 0)
   3.779 +                                {
   3.780 +                                        if (x > virge->s3d.clip_r)
   3.781 +                                                goto tri_skip_line;
   3.782 +                                        if (xe < virge->s3d.clip_l)
   3.783 +                                                goto tri_skip_line;
   3.784 +                                        if (xe > virge->s3d.clip_r)
   3.785 +                                                xe = virge->s3d.clip_r;
   3.786 +                                        if (x < virge->s3d.clip_l)
   3.787 +                                        {
   3.788 +                                                int diff_x = virge->s3d.clip_l - x;
   3.789 +                                                
   3.790 +                                                z += (virge->s3d.TdZdX * diff_x);
   3.791 +                                                state->u += (virge->s3d.TdUdX * diff_x);
   3.792 +                                                state->v += (virge->s3d.TdVdX * diff_x);
   3.793 +                                                state->r += (virge->s3d.TdRdX * diff_x);
   3.794 +                                                state->g += (virge->s3d.TdGdX * diff_x);
   3.795 +                                                state->b += (virge->s3d.TdBdX * diff_x);
   3.796 +                                                state->a += (virge->s3d.TdAdX * diff_x);
   3.797 +                                                state->d += (virge->s3d.TdDdX * diff_x);
   3.798 +                                                state->w += (virge->s3d.TdWdX * diff_x);
   3.799 +                                                
   3.800 +                                                x = virge->s3d.clip_l;
   3.801 +                                        }
   3.802 +                                }
   3.803 +                                else
   3.804 +                                {
   3.805 +                                        if (x < virge->s3d.clip_l)
   3.806 +                                                goto tri_skip_line;
   3.807 +                                        if (xe > virge->s3d.clip_r)
   3.808 +                                                goto tri_skip_line;
   3.809 +                                        if (xe < virge->s3d.clip_l)
   3.810 +                                                xe = virge->s3d.clip_l;
   3.811 +                                        if (x > virge->s3d.clip_r)
   3.812 +                                        {
   3.813 +                                                int diff_x = x - virge->s3d.clip_r;
   3.814 +                                                
   3.815 +                                                z += (virge->s3d.TdZdX * diff_x);
   3.816 +                                                state->u += (virge->s3d.TdUdX * diff_x);
   3.817 +                                                state->v += (virge->s3d.TdVdX * diff_x);
   3.818 +                                                state->r += (virge->s3d.TdRdX * diff_x);
   3.819 +                                                state->g += (virge->s3d.TdGdX * diff_x);
   3.820 +                                                state->b += (virge->s3d.TdBdX * diff_x);
   3.821 +                                                state->a += (virge->s3d.TdAdX * diff_x);
   3.822 +                                                state->d += (virge->s3d.TdDdX * diff_x);
   3.823 +                                                state->w += (virge->s3d.TdWdX * diff_x);
   3.824 +                                                
   3.825 +                                                x = virge->s3d.clip_r;
   3.826 +                                        }
   3.827 +                                }
   3.828 +                        }
   3.829 +
   3.830 +                        virge->svga.changedvram[(dest_offset & 0x3fffff) >> 12] = changeframecount;
   3.831 +
   3.832 +                        dest_addr = dest_offset + (x << bpp);
   3.833 +                        z_addr = z_offset + (x << bpp);
   3.834 +
   3.835                          for (; x != ((xe + x_dir) & 0xfff); x = (x + x_dir) & 0xfff)
   3.836                          {
   3.837 -                                uint32_t dest_addr = dest_offset + (x << bpp);
   3.838 -                                uint32_t z_addr = z_offset + (x << bpp);
   3.839                                  int update = 1;
   3.840                                  int16_t src_z;
   3.841                                  _x = x; _y = state->y;
   3.842 @@ -2129,14 +2263,12 @@
   3.843                                          src_z = Z_READ(z_addr);
   3.844                                          Z_CLIP(src_z, z >> 16);
   3.845                                  }
   3.846 -                                CLIP(x, state->y);
   3.847  
   3.848                                  if (update)
   3.849                                  {
   3.850 -                                        int dest_r, dest_g, dest_b, dest_a;
   3.851                                          uint32_t dest_col;
   3.852  
   3.853 -                                        dest_pixel(state, &dest_r, &dest_g, &dest_b, &dest_a);
   3.854 +                                        dest_pixel(state);
   3.855  
   3.856                                          if (virge->s3d.cmd_set & CMD_SET_ABC_ENABLE)
   3.857                                          {
   3.858 @@ -2158,9 +2290,9 @@
   3.859                                                          break;
   3.860                                                  }
   3.861  
   3.862 -                                                dest_r = ((dest_r * dest_a) + (src_r * (255 - dest_a))) / 255;
   3.863 -                                                dest_g = ((dest_g * dest_a) + (src_g * (255 - dest_a))) / 255;
   3.864 -                                                dest_b = ((dest_b * dest_a) + (src_b * (255 - dest_a))) / 255;
   3.865 +                                                state->dest_rgba.r = ((state->dest_rgba.r * state->dest_rgba.a) + (src_r * (255 - state->dest_rgba.a))) / 255;
   3.866 +                                                state->dest_rgba.g = ((state->dest_rgba.g * state->dest_rgba.a) + (src_g * (255 - state->dest_rgba.a))) / 255;
   3.867 +                                                state->dest_rgba.b = ((state->dest_rgba.b * state->dest_rgba.a) + (src_b * (255 - state->dest_rgba.a))) / 255;
   3.868                                          }
   3.869  
   3.870                                          switch (bpp)
   3.871 @@ -2169,14 +2301,12 @@
   3.872                                                  /*Not implemented yet*/
   3.873                                                  break;
   3.874                                                  case 1: /*16 bpp*/
   3.875 -                                                dest_col = RGB15(dest_r, dest_g, dest_b);
   3.876 -                                                *(uint16_t *)&vram[dest_addr & 0x3fffff] = dest_col;
   3.877 -                                                virge->svga.changedvram[(dest_addr & 0x3fffff) >> 12] = changeframecount;
   3.878 +                                                dest_col = RGB15(state->dest_rgba.r, state->dest_rgba.g, state->dest_rgba.b);
   3.879 +                                                *(uint16_t *)&vram[dest_addr] = dest_col;
   3.880                                                  break;
   3.881                                                  case 2: /*24 bpp*/
   3.882 -                                                dest_col = RGB24(dest_r, dest_g, dest_b);
   3.883 -                                                *(uint32_t *)&vram[dest_addr & 0x3fffff] = dest_col;
   3.884 -                                                virge->svga.changedvram[(dest_addr & 0x3fffff) >> 12] = changeframecount;
   3.885 +                                                dest_col = RGB24(state->dest_rgba.r, state->dest_rgba.g, state->dest_rgba.b);
   3.886 +                                                *(uint32_t *)&vram[dest_addr] = dest_col;
   3.887                                                  break;
   3.888                                          }
   3.889  
   3.890 @@ -2193,9 +2323,12 @@
   3.891                                  state->a += virge->s3d.TdAdX;
   3.892                                  state->d += virge->s3d.TdDdX;
   3.893                                  state->w += virge->s3d.TdWdX;
   3.894 +                                dest_addr += x_offset;
   3.895 +                                z_addr += x_offset;
   3.896                                  virge->pixel_count++;
   3.897                          }
   3.898                  }
   3.899 +tri_skip_line:
   3.900                  state->x1 += dx1;
   3.901                  state->x2 += dx2;
   3.902                  state->base_u += virge->s3d.TdUdY;
   3.903 @@ -2231,7 +2364,10 @@
   3.904  
   3.905          uint32_t tex_base;
   3.906          int c;
   3.907 -        
   3.908 +
   3.909 +        uint64_t start_time = timer_read();
   3.910 +        uint64_t end_time;
   3.911 +
   3.912          state.tbu = virge->s3d.tbu << 11;
   3.913          state.tbv = virge->s3d.tbv << 11;
   3.914          
   3.915 @@ -2347,19 +2483,20 @@
   3.916          switch ((virge->s3d.cmd_set >> 5) & 7)
   3.917          {
   3.918                  case 0:
   3.919 -                tex_read = tex_ARGB8888;
   3.920 +                tex_read = (virge->s3d.cmd_set & CMD_SET_TWE) ? tex_ARGB8888 : tex_ARGB8888_nowrap;
   3.921                  break;
   3.922                  case 1:
   3.923 -                tex_read = tex_ARGB4444;
   3.924 +                tex_read = (virge->s3d.cmd_set & CMD_SET_TWE) ? tex_ARGB4444 : tex_ARGB4444_nowrap;
   3.925  //                pclog("tex_ARGB4444\n");
   3.926                  break;
   3.927                  case 2:
   3.928 -                tex_read = tex_ARGB1555;
   3.929 +                tex_read = (virge->s3d.cmd_set & CMD_SET_TWE) ? tex_ARGB1555 : tex_ARGB1555_nowrap;
   3.930  //                pclog("tex_ARGB1555 %i\n", (virge->s3d.cmd_set >> 5) & 7);
   3.931                  break;
   3.932                  default:
   3.933                  pclog("bad texture type %i\n", (virge->s3d.cmd_set >> 5) & 7);
   3.934 -                tex_read = tex_ARGB1555;
   3.935 +                tex_read = (virge->s3d.cmd_set & CMD_SET_TWE) ? tex_ARGB1555 : tex_ARGB1555_nowrap;
   3.936 +                break;
   3.937          }
   3.938          
   3.939  //        pclog("Triangle %i %i,%i to %i,%i  %08x\n", y, x1 >> 20, y, virge->s3d.txend01 >> 20, y - (virge->s3d.ty01 + virge->s3d.ty12), state.cmd_set);
   3.940 @@ -2372,6 +2509,10 @@
   3.941          tri(virge, &state, virge->s3d.ty12, virge->s3d.TdXdY02, virge->s3d.TdXdY12);
   3.942  
   3.943          virge->tri_count++;
   3.944 +
   3.945 +        end_time = timer_read();
   3.946 +        
   3.947 +        virge_time += end_time - start_time;
   3.948  }
   3.949  
   3.950  
   3.951 @@ -2693,11 +2834,20 @@
   3.952          int cur_len;
   3.953          char temps[256];
   3.954  
   3.955 +        uint64_t new_time = timer_read();
   3.956 +        uint64_t status_diff = new_time - status_time;
   3.957 +        status_time = new_time;
   3.958 +
   3.959 +        if (!status_diff)
   3.960 +                status_diff = 1;
   3.961 +
   3.962          cur_len = svga_add_status_info(s, cur_len, &virge->svga);
   3.963 -        sprintf(temps, "%f Mpixels/sec\n%f ktris/sec\n", (double)virge->pixel_count/1000000.0, (double)virge->tri_count/1000.0);
   3.964 +        sprintf(temps, "%f Mpixels/sec\n%f ktris/sec\n%f%% CPU\n%f%% CPU (real)\n%d writes", (double)virge->pixel_count/1000000.0, (double)virge->tri_count/1000.0, ((double)virge_time * 100.0) / timer_freq, ((double)virge_time * 100.0) / status_diff, reg_writes);
   3.965          strncat(s, temps, cur_len);
   3.966          cur_len -= strlen(temps);
   3.967          virge->pixel_count = virge->tri_count = 0;
   3.968 +        virge_time = 0;
   3.969 +        reg_writes = 0;
   3.970          
   3.971          return max_len - cur_len;
   3.972  }
     4.1 --- a/src/win.c	Mon May 26 18:13:23 2014 +0100
     4.2 +++ b/src/win.c	Wed Jun 04 19:45:12 2014 +0100
     4.3 @@ -31,6 +31,8 @@
     4.4  #include "win-d3d-fs.h"
     4.5  //#include "win-opengl.h"
     4.6  
     4.7 +uint64_t timer_freq;
     4.8 +
     4.9  static struct
    4.10  {
    4.11          void (*init)(HWND h);
    4.12 @@ -264,6 +266,13 @@
    4.13          SetWindowText(ghwnd, s);
    4.14  }
    4.15  
    4.16 +uint64_t timer_read()
    4.17 +{
    4.18 +        LARGE_INTEGER qpc_time;
    4.19 +        QueryPerformanceCounter(&qpc_time);
    4.20 +        return qpc_time.QuadPart;
    4.21 +}
    4.22 +
    4.23  int WINAPI WinMain (HINSTANCE hThisInstance,
    4.24                      HINSTANCE hPrevInstance,
    4.25                      LPSTR lpszArgument,
    4.26 @@ -274,6 +283,7 @@
    4.27          MSG messages;            /* Here messages to the application are saved */
    4.28          WNDCLASSEX wincl;        /* Data structure for the windowclass */
    4.29          int c, d;
    4.30 +        LARGE_INTEGER qpc_freq;
    4.31  
    4.32          hinstance=hThisInstance;
    4.33          /* The Window structure */
    4.34 @@ -436,6 +446,10 @@
    4.35          install_int_ex(vsyncint,BPS_TO_TIMER(100));
    4.36          
    4.37          updatewindowsize(640, 480);
    4.38 +
    4.39 +        QueryPerformanceFrequency(&qpc_freq);
    4.40 +        timer_freq = qpc_freq.QuadPart;
    4.41 +
    4.42  //        focus=1;
    4.43  //        setrefresh(100);
    4.44