14 files changed, 631 insertions, 238 deletions
diff --git a/src/citra_qt/debugger/graphics.cpp b/src/citra_qt/debugger/graphics.cpp
index 6ff4c290d..9633d367e 100644
--- a/src/citra_qt/debugger/graphics.cpp
+++ b/src/citra_qt/debugger/graphics.cpp
@@ -72,7 +72,7 @@ void GPUCommandStreamItemModel::OnGXCommandFinishedInternal(int total_command_co
 
 GPUCommandStreamWidget::GPUCommandStreamWidget(QWidget* parent) : QDockWidget(tr("Graphics Debugger"), parent)
 {
-    // TODO: set objectName!
+    setObjectName("GraphicsDebugger");
 
     GPUCommandStreamItemModel* command_model = new GPUCommandStreamItemModel(this);
     g_debugger.RegisterObserver(command_model);
diff --git a/src/citra_qt/debugger/graphics_framebuffer.cpp b/src/citra_qt/debugger/graphics_framebuffer.cpp
index dd41c3880..a9e9de652 100644
--- a/src/citra_qt/debugger/graphics_framebuffer.cpp
+++ b/src/citra_qt/debugger/graphics_framebuffer.cpp
@@ -10,6 +10,7 @@
 #include <QPushButton>
 #include <QSpinBox>
 
+#include "video_core/color.h"
 #include "video_core/pica.h"
 
 #include "graphics_framebuffer.hxx"
@@ -202,7 +203,8 @@ void GraphicsFramebufferWidget::OnUpdate()
         framebuffer_address = framebuffer.GetColorBufferPhysicalAddress();
         framebuffer_width = framebuffer.GetWidth();
         framebuffer_height = framebuffer.GetHeight();
-        framebuffer_format = static_cast<Format>(framebuffer.color_format);
+        // TODO: It's unknown how this format is actually specified
+        framebuffer_format = Format::RGBA8;
 
         break;
     }
@@ -258,10 +260,10 @@ void GraphicsFramebufferWidget::OnUpdate()
         for (unsigned y = 0; y < framebuffer_height; ++y) {
             for (unsigned x = 0; x < framebuffer_width; ++x) {
                 u16 value = *(u16*)(((u8*)color_buffer) + x * 2 + y * framebuffer_width * 2);
-                u8 r = (value >> 11) & 0x1F;
-                u8 g = (value >> 6) & 0x1F;
-                u8 b = (value >> 1) & 0x1F;
-                u8 a = value & 1;
+                u8 r = Color::Convert5To8((value >> 11) & 0x1F);
+                u8 g = Color::Convert5To8((value >> 6) & 0x1F);
+                u8 b = Color::Convert5To8((value >> 1) & 0x1F);
+                u8 a = Color::Convert1To8(value & 1);
 
                 decoded_image.setPixel(x, y, qRgba(r, g, b, 255/*a*/));
             }
diff --git a/src/core/arm/dyncom/arm_dyncom_interpreter.cpp b/src/core/arm/dyncom/arm_dyncom_interpreter.cpp
index c61ae0053..f2ea0e9dd 100644
--- a/src/core/arm/dyncom/arm_dyncom_interpreter.cpp
+++ b/src/core/arm/dyncom/arm_dyncom_interpreter.cpp
@@ -2171,29 +2171,45 @@ ARM_INST_PTR INTERPRETER_TRANSLATE(rsc)(unsigned int inst, int index)
     }
     return inst_base;
 }
-ARM_INST_PTR INTERPRETER_TRANSLATE(sadd8)(unsigned int inst, int index)    { UNIMPLEMENTED_INSTRUCTION("SADD8"); }
-ARM_INST_PTR INTERPRETER_TRANSLATE(sadd16)(unsigned int inst, int index)
+ARM_INST_PTR INTERPRETER_TRANSLATE(sadd8)(unsigned int inst, int index)
 {
     arm_inst* const inst_base = (arm_inst*)AllocBuffer(sizeof(arm_inst) + sizeof(generic_arm_inst));
     generic_arm_inst* const inst_cream = (generic_arm_inst*)inst_base->component;
-    
+
     inst_base->cond     = BITS(inst, 28, 31);
     inst_base->idx      = index;
     inst_base->br       = NON_BRANCH;
     inst_base->load_r15 = 0;
-    
+
     inst_cream->Rm  = BITS(inst, 0, 3);
     inst_cream->Rn  = BITS(inst, 16, 19);
     inst_cream->Rd  = BITS(inst, 12, 15);
     inst_cream->op1 = BITS(inst, 20, 21);
     inst_cream->op2 = BITS(inst, 5, 7);
-    
+
     return inst_base;
 }
+ARM_INST_PTR INTERPRETER_TRANSLATE(sadd16)(unsigned int inst, int index)
+{
+    return INTERPRETER_TRANSLATE(sadd8)(inst, index);
+}
 ARM_INST_PTR INTERPRETER_TRANSLATE(saddsubx)(unsigned int inst, int index)
 {
-    return INTERPRETER_TRANSLATE(sadd16)(inst, index);
+    return INTERPRETER_TRANSLATE(sadd8)(inst, index);
 }
+ARM_INST_PTR INTERPRETER_TRANSLATE(ssub8)(unsigned int inst, int index)
+{
+    return INTERPRETER_TRANSLATE(sadd8)(inst, index);
+}
+ARM_INST_PTR INTERPRETER_TRANSLATE(ssub16)(unsigned int inst, int index)
+{
+    return INTERPRETER_TRANSLATE(sadd8)(inst, index);
+}
+ARM_INST_PTR INTERPRETER_TRANSLATE(ssubaddx)(unsigned int inst, int index)
+{
+    return INTERPRETER_TRANSLATE(sadd8)(inst, index);
+}
+
 ARM_INST_PTR INTERPRETER_TRANSLATE(sbc)(unsigned int inst, int index)
 {
     arm_inst *inst_base = (arm_inst *)AllocBuffer(sizeof(arm_inst) + sizeof(sbc_inst));
@@ -2236,13 +2252,48 @@ ARM_INST_PTR INTERPRETER_TRANSLATE(sel)(unsigned int inst, int index)
 
     return inst_base;
 }
+
 ARM_INST_PTR INTERPRETER_TRANSLATE(setend)(unsigned int inst, int index)    { UNIMPLEMENTED_INSTRUCTION("SETEND"); }
-ARM_INST_PTR INTERPRETER_TRANSLATE(shadd16)(unsigned int inst, int index)   { UNIMPLEMENTED_INSTRUCTION("SHADD16"); }
-ARM_INST_PTR INTERPRETER_TRANSLATE(shadd8)(unsigned int inst, int index)    { UNIMPLEMENTED_INSTRUCTION("SHADD8"); }
-ARM_INST_PTR INTERPRETER_TRANSLATE(shaddsubx)(unsigned int inst, int index) { UNIMPLEMENTED_INSTRUCTION("SHADDSUBX"); }
-ARM_INST_PTR INTERPRETER_TRANSLATE(shsub16)(unsigned int inst, int index)   { UNIMPLEMENTED_INSTRUCTION("SHSUB16"); }
-ARM_INST_PTR INTERPRETER_TRANSLATE(shsub8)(unsigned int inst, int index)    { UNIMPLEMENTED_INSTRUCTION("SHSUB8"); }
-ARM_INST_PTR INTERPRETER_TRANSLATE(shsubaddx)(unsigned int inst, int index) { UNIMPLEMENTED_INSTRUCTION("SHSUBADDX"); }
+
+ARM_INST_PTR INTERPRETER_TRANSLATE(shadd8)(unsigned int inst, int index)
+{
+    arm_inst* const inst_base = (arm_inst*)AllocBuffer(sizeof(arm_inst) + sizeof(generic_arm_inst));
+    generic_arm_inst* const inst_cream = (generic_arm_inst*)inst_base->component;
+
+    inst_base->cond     = BITS(inst, 28, 31);
+    inst_base->idx      = index;
+    inst_base->br       = NON_BRANCH;
+    inst_base->load_r15 = 0;
+
+    inst_cream->op1 = BITS(inst, 20, 21);
+    inst_cream->op2 = BITS(inst, 5, 7);
+    inst_cream->Rm  = BITS(inst, 0, 3);
+    inst_cream->Rn  = BITS(inst, 16, 19);
+    inst_cream->Rd  = BITS(inst, 12, 15);
+
+    return inst_base;
+}
+ARM_INST_PTR INTERPRETER_TRANSLATE(shadd16)(unsigned int inst, int index)
+{
+    return INTERPRETER_TRANSLATE(shadd8)(inst, index);
+}
+ARM_INST_PTR INTERPRETER_TRANSLATE(shaddsubx)(unsigned int inst, int index)
+{
+    return INTERPRETER_TRANSLATE(shadd8)(inst, index);
+}
+ARM_INST_PTR INTERPRETER_TRANSLATE(shsub8)(unsigned int inst, int index)
+{
+    return INTERPRETER_TRANSLATE(shadd8)(inst, index);
+}
+ARM_INST_PTR INTERPRETER_TRANSLATE(shsub16)(unsigned int inst, int index)
+{
+    return INTERPRETER_TRANSLATE(shadd8)(inst, index);
+}
+ARM_INST_PTR INTERPRETER_TRANSLATE(shsubaddx)(unsigned int inst, int index)
+{
+    return INTERPRETER_TRANSLATE(shadd8)(inst, index);
+}
+
 ARM_INST_PTR INTERPRETER_TRANSLATE(smla)(unsigned int inst, int index)
 {
     arm_inst *inst_base = (arm_inst *)AllocBuffer(sizeof(arm_inst) + sizeof(smla_inst));
@@ -2408,15 +2459,7 @@ ARM_INST_PTR INTERPRETER_TRANSLATE(ssat16)(unsigned int inst, int index)
 
     return inst_base;
 }
-ARM_INST_PTR INTERPRETER_TRANSLATE(ssub8)(unsigned int inst, int index)    { UNIMPLEMENTED_INSTRUCTION("SSUB8"); }
-ARM_INST_PTR INTERPRETER_TRANSLATE(ssub16)(unsigned int inst, int index)
-{
-    return INTERPRETER_TRANSLATE(sadd16)(inst, index);
-}
-ARM_INST_PTR INTERPRETER_TRANSLATE(ssubaddx)(unsigned int inst, int index)
-{
-    return INTERPRETER_TRANSLATE(sadd16)(inst, index);
-}
+
 ARM_INST_PTR INTERPRETER_TRANSLATE(stc)(unsigned int inst, int index)
 {
     arm_inst *inst_base = (arm_inst *)AllocBuffer(sizeof(arm_inst) + sizeof(stc_inst));
@@ -5039,6 +5082,7 @@ unsigned InterpreterMainLoop(ARMul_State* state) {
     }
 
     SADD8_INST:
+    SSUB8_INST:
     SADD16_INST:
     SADDSUBX_INST:
     SSUBADDX_INST:
@@ -5046,52 +5090,96 @@ unsigned InterpreterMainLoop(ARMul_State* state) {
     {
         if (inst_base->cond == 0xE || CondPassed(cpu, inst_base->cond)) {
             generic_arm_inst* const inst_cream = (generic_arm_inst*)inst_base->component;
+            const u8 op2 = inst_cream->op2;
 
-            const s16 rn_lo = (RN & 0xFFFF);
-            const s16 rn_hi = ((RN >> 16) & 0xFFFF);
-            const s16 rm_lo = (RM & 0xFFFF);
-            const s16 rm_hi = ((RM >> 16) & 0xFFFF);
+            if (op2 == 0x00 || op2 == 0x01 || op2 == 0x02 || op2 == 0x03) {
+                const s16 rn_lo = (RN & 0xFFFF);
+                const s16 rn_hi = ((RN >> 16) & 0xFFFF);
+                const s16 rm_lo = (RM & 0xFFFF);
+                const s16 rm_hi = ((RM >> 16) & 0xFFFF);
 
-            s32 lo_result = 0;
-            s32 hi_result = 0;
+                s32 lo_result = 0;
+                s32 hi_result = 0;
 
-            // SADD16
-            if (inst_cream->op2 == 0x00) {
-                lo_result = (rn_lo + rm_lo);
-                hi_result = (rn_hi + rm_hi);
-            }
-            // SASX
-            else if (inst_cream->op2 == 0x01) {
-                lo_result = (rn_lo - rm_hi);
-                hi_result = (rn_hi + rm_lo);
-            }
-            // SSAX
-            else if (inst_cream->op2 == 0x02) {
-                lo_result = (rn_lo + rm_hi);
-                hi_result = (rn_hi - rm_lo);
-            }
-            // SSUB16
-            else if (inst_cream->op2 == 0x03) {
-                lo_result = (rn_lo - rm_lo);
-                hi_result = (rn_hi - rm_hi);
-            }
+                // SADD16
+                if (inst_cream->op2 == 0x00) {
+                    lo_result = (rn_lo + rm_lo);
+                    hi_result = (rn_hi + rm_hi);
+                }
+                // SASX
+                else if (op2 == 0x01) {
+                    lo_result = (rn_lo - rm_hi);
+                    hi_result = (rn_hi + rm_lo);
+                }
+                // SSAX
+                else if (op2 == 0x02) {
+                    lo_result = (rn_lo + rm_hi);
+                    hi_result = (rn_hi - rm_lo);
+                }
+                // SSUB16
+                else if (op2 == 0x03) {
+                    lo_result = (rn_lo - rm_lo);
+                    hi_result = (rn_hi - rm_hi);
+                }
 
-            RD = (lo_result & 0xFFFF) | ((hi_result & 0xFFFF) << 16);
+                RD = (lo_result & 0xFFFF) | ((hi_result & 0xFFFF) << 16);
 
-            if (lo_result >= 0) {
-                cpu->Cpsr |= (1 << 16);
-                cpu->Cpsr |= (1 << 17);
-            } else {
-                cpu->Cpsr &= ~(1 << 16);
-                cpu->Cpsr &= ~(1 << 17);
+                if (lo_result >= 0) {
+                    cpu->Cpsr |= (1 << 16);
+                    cpu->Cpsr |= (1 << 17);
+                } else {
+                    cpu->Cpsr &= ~(1 << 16);
+                    cpu->Cpsr &= ~(1 << 17);
+                }
+
+                if (hi_result >= 0) {
+                    cpu->Cpsr |= (1 << 18);
+                    cpu->Cpsr |= (1 << 19);
+                } else {
+                    cpu->Cpsr &= ~(1 << 18);
+                    cpu->Cpsr &= ~(1 << 19);
+                }
             }
+            else if (op2 == 0x04 || op2 == 0x07) {
+                s32 lo_val1, lo_val2;
+                s32 hi_val1, hi_val2;
 
-            if (hi_result >= 0) {
-                cpu->Cpsr |= (1 << 18);
-                cpu->Cpsr |= (1 << 19);
-            } else {
-                cpu->Cpsr &= ~(1 << 18);
-                cpu->Cpsr &= ~(1 << 19);
+                // SADD8
+                if (op2 == 0x04) {
+                    lo_val1 = (s32)(s8)(RN & 0xFF) + (s32)(s8)(RM & 0xFF);
+                    lo_val2 = (s32)(s8)((RN >> 8) & 0xFF)  + (s32)(s8)((RM >> 8) & 0xFF);
+                    hi_val1 = (s32)(s8)((RN >> 16) & 0xFF) + (s32)(s8)((RM >> 16) & 0xFF);
+                    hi_val2 = (s32)(s8)((RN >> 24) & 0xFF) + (s32)(s8)((RM >> 24) & 0xFF);
+                }
+                // SSUB8
+                else {
+                    lo_val1 = (s32)(s8)(RN & 0xFF) - (s32)(s8)(RM & 0xFF);
+                    lo_val2 = (s32)(s8)((RN >> 8) & 0xFF) - (s32)(s8)((RM >> 8) & 0xFF);
+                    hi_val1 = (s32)(s8)((RN >> 16) & 0xFF) - (s32)(s8)((RM >> 16) & 0xFF);
+                    hi_val2 = (s32)(s8)((RN >> 24) & 0xFF) - (s32)(s8)((RM >> 24) & 0xFF);
+                }
+
+                RD =  ((lo_val1 & 0xFF) | ((lo_val2 & 0xFF) << 8) | ((hi_val1 & 0xFF) << 16) | ((hi_val2 & 0xFF) << 24));
+
+                if (lo_val1 >= 0)
+                    cpu->Cpsr |= (1 << 16);
+                else
+                    cpu->Cpsr &= ~(1 << 16);
+
+                if (lo_val2 >= 0)
+                    cpu->Cpsr |= (1 << 17);
+                else
+                    cpu->Cpsr &= ~(1 << 17);
+
+                if (hi_val1 >= 0)
+                    cpu->Cpsr |= (1 << 18);
+                else
+                    cpu->Cpsr &= ~(1 << 18);
+
+                if (hi_val2 >= 0)
+                    cpu->Cpsr |= (1 << 19);
+                else
+                    cpu->Cpsr &= ~(1 << 19);
             }
         }
 
@@ -5176,12 +5264,79 @@ unsigned InterpreterMainLoop(ARMul_State* state) {
     }
 
     SETEND_INST:
-    SHADD16_INST:
+
     SHADD8_INST:
+    SHADD16_INST:
     SHADDSUBX_INST:
-    SHSUB16_INST:
     SHSUB8_INST:
+    SHSUB16_INST:
     SHSUBADDX_INST:
+    {
+        if (inst_base->cond == 0xE || CondPassed(cpu, inst_base->cond)) {
+            generic_arm_inst* const inst_cream = (generic_arm_inst*)inst_base->component;
+
+            const u8 op2 = inst_cream->op2;
+            const u32 rm_val = RM;
+            const u32 rn_val = RN;
+
+            if (op2 == 0x00 || op2 == 0x01 || op2 == 0x02 || op2 == 0x03) {
+                s32 lo_result = 0;
+                s32 hi_result = 0;
+
+                // SHADD16
+                if (op2 == 0x00) {
+                    lo_result = ((s16)(rn_val & 0xFFFF) + (s16)(rm_val & 0xFFFF)) >> 1;
+                    hi_result = ((s16)((rn_val >> 16) & 0xFFFF) + (s16)((rm_val >> 16) & 0xFFFF)) >> 1;
+                }
+                // SHASX
+                else if (op2 == 0x01) {
+                    lo_result = ((s16)(rn_val & 0xFFFF) - (s16)((rm_val >> 16) & 0xFFFF)) >> 1;
+                    hi_result = ((s16)((rn_val >> 16) & 0xFFFF) + (s16)(rm_val & 0xFFFF)) >> 1;
+                }
+                // SHSAX
+                else if (op2 == 0x02) {
+                    lo_result = ((s16)(rn_val & 0xFFFF) + (s16)((rm_val >> 16) & 0xFFFF)) >> 1;
+                    hi_result = ((s16)((rn_val >> 16) & 0xFFFF) - (s16)(rm_val & 0xFFFF)) >> 1;
+                }
+                // SHSUB16
+                else if (op2 == 0x03) {
+                    lo_result = ((s16)(rn_val & 0xFFFF) - (s16)(rm_val & 0xFFFF)) >> 1;
+                    hi_result = ((s16)((rn_val >> 16) & 0xFFFF) - (s16)((rm_val >> 16) & 0xFFFF)) >> 1;
+                }
+
+                RD = ((lo_result & 0xFFFF) | ((hi_result & 0xFFFF) << 16));
+            }
+            else if (op2 == 0x04 || op2 == 0x07) {
+                s16 lo_val1, lo_val2;
+                s16 hi_val1, hi_val2;
+
+                // SHADD8
+                if (op2 == 0x04) {
+                    lo_val1 = ((s8)(rn_val & 0xFF) + (s8)(rm_val & 0xFF)) >> 1;
+                    lo_val2 = ((s8)((rn_val >> 8) & 0xFF) + (s8)((rm_val >> 8) & 0xFF)) >> 1;
+
+                    hi_val1 = ((s8)((rn_val >> 16) & 0xFF) + (s8)((rm_val >> 16) & 0xFF)) >> 1;
+                    hi_val2 = ((s8)((rn_val >> 24) & 0xFF) + (s8)((rm_val >> 24) & 0xFF)) >> 1;
+                }
+                // SHSUB8
+                else {
+                    lo_val1 = ((s8)(rn_val & 0xFF) - (s8)(rm_val & 0xFF)) >> 1;
+                    lo_val2 = ((s8)((rn_val >> 8) & 0xFF) - (s8)((rm_val >> 8) & 0xFF)) >> 1;
+
+                    hi_val1 = ((s8)((rn_val >> 16) & 0xFF) - (s8)((rm_val >> 16) & 0xFF)) >> 1;
+                    hi_val2 = ((s8)((rn_val >> 24) & 0xFF) - (s8)((rm_val >> 24) & 0xFF)) >> 1;
+                }
+
+                RD = (lo_val1 & 0xFF) | ((lo_val2 & 0xFF) << 8) | ((hi_val1 & 0xFF) << 16) | ((hi_val2 & 0xFF) << 24);
+            }
+        }
+
+        cpu->Reg[15] += GET_INST_SIZE(cpu);
+        INC_PC(sizeof(generic_arm_inst));
+        FETCH_INST;
+        GOTO_NEXT_INST;
+    }
+
     SMLA_INST:
     {
         if ((inst_base->cond == 0xe) || CondPassed(cpu, inst_base->cond)) {
@@ -5407,7 +5562,7 @@ unsigned InterpreterMainLoop(ARMul_State* state) {
         FETCH_INST;
         GOTO_NEXT_INST;
     }
-    SSUB8_INST:
+
     STC_INST:
     {
         // Instruction not implemented
diff --git a/src/core/arm/interpreter/armemu.cpp b/src/core/arm/interpreter/armemu.cpp
index b9c2aa6c2..43b1ba40e 100644
--- a/src/core/arm/interpreter/armemu.cpp
+++ b/src/core/arm/interpreter/armemu.cpp
@@ -5881,67 +5881,45 @@ L_stm_s_takeabort:
                 const u32 rm_val = state->Reg[rm_idx];
                 const u32 rn_val = state->Reg[rn_idx];
 
-                u8 lo_val1;
-                u8 lo_val2;
-                u8 hi_val1;
-                u8 hi_val2;
+                s32 lo_val1, lo_val2;
+                s32 hi_val1, hi_val2;
 
                 // SADD8
                 if ((instr & 0xFF0) == 0xf90) {
-                    lo_val1 = (u8)((rn_val & 0xFF) + (rm_val & 0xFF));
-                    lo_val2 = (u8)(((rn_val >> 8) & 0xFF) + ((rm_val >> 8) & 0xFF));
-                    hi_val1 = (u8)(((rn_val >> 16) & 0xFF) + ((rm_val >> 16) & 0xFF));
-                    hi_val2 = (u8)(((rn_val >> 24) & 0xFF) + ((rm_val >> 24) & 0xFF));
-
-                    if (lo_val1 & 0x80)
-                        state->GEFlag |= (1 << 16);
-                    else
-                        state->GEFlag &= ~(1 << 16);
-
-                    if (lo_val2 & 0x80)
-                        state->GEFlag |= (1 << 17);
-                    else
-                        state->GEFlag &= ~(1 << 17);
-
-                    if (hi_val1 & 0x80)
-                        state->GEFlag |= (1 << 18);
-                    else
-                        state->GEFlag &= ~(1 << 18);
-
-                    if (hi_val2 & 0x80)
-                        state->GEFlag |= (1 << 19);
-                    else
-                        state->GEFlag &= ~(1 << 19);
+                    lo_val1 = (s32)(s8)(rn_val & 0xFF) + (s32)(s8)(rm_val & 0xFF);
+                    lo_val2 = (s32)(s8)((rn_val >> 8) & 0xFF)  + (s32)(s8)((rm_val >> 8) & 0xFF);
+                    hi_val1 = (s32)(s8)((rn_val >> 16) & 0xFF) + (s32)(s8)((rm_val >> 16) & 0xFF);
+                    hi_val2 = (s32)(s8)((rn_val >> 24) & 0xFF) + (s32)(s8)((rm_val >> 24) & 0xFF);
                 }
                 // SSUB8
                 else {
-                    lo_val1 = (u8)((rn_val & 0xFF) - (rm_val & 0xFF));
-                    lo_val2 = (u8)(((rn_val >> 8) & 0xFF) - ((rm_val >> 8) & 0xFF));
-                    hi_val1 = (u8)(((rn_val >> 16) & 0xFF) - ((rm_val >> 16) & 0xFF));
-                    hi_val2 = (u8)(((rn_val >> 24) & 0xFF) - ((rm_val >> 24) & 0xFF));
+                    lo_val1 = (s32)(s8)(rn_val & 0xFF) - (s32)(s8)(rm_val & 0xFF);
+                    lo_val2 = (s32)(s8)((rn_val >> 8) & 0xFF)  - (s32)(s8)((rm_val >> 8) & 0xFF);
+                    hi_val1 = (s32)(s8)((rn_val >> 16) & 0xFF) - (s32)(s8)((rm_val >> 16) & 0xFF);
+                    hi_val2 = (s32)(s8)((rn_val >> 24) & 0xFF) - (s32)(s8)((rm_val >> 24) & 0xFF);
+                }
 
-                    if (!(lo_val1 & 0x80))
-                        state->GEFlag |= (1 << 16);
-                    else
-                        state->GEFlag &= ~(1 << 16);
+                if (lo_val1 >= 0)
+                    state->GEFlag |= (1 << 16);
+                else
+                    state->GEFlag &= ~(1 << 16);
 
-                    if (!(lo_val2 & 0x80))
-                        state->GEFlag |= (1 << 17);
-                    else
-                        state->GEFlag &= ~(1 << 17);
+                if (lo_val2 >= 0)
+                    state->GEFlag |= (1 << 17);
+                else
+                    state->GEFlag &= ~(1 << 17);
 
-                    if (!(hi_val1 & 0x80))
-                        state->GEFlag |= (1 << 18);
-                    else
-                        state->GEFlag &= ~(1 << 18);
+                if (hi_val1 >= 0)
+                    state->GEFlag |= (1 << 18);
+                else
+                    state->GEFlag &= ~(1 << 18);
 
-                    if (!(hi_val2 & 0x80))
-                        state->GEFlag |= (1 << 19);
-                    else
-                        state->GEFlag &= ~(1 << 19);
-                }
+                if (hi_val2 >= 0)
+                    state->GEFlag |= (1 << 19);
+                else
+                    state->GEFlag &= ~(1 << 19);
 
-                state->Reg[rd_idx] = (lo_val1 | lo_val2 << 8 | hi_val1 << 16 | hi_val2 << 24);
+                state->Reg[rd_idx] = ((lo_val1 & 0xFF) | ((lo_val2 & 0xFF) << 8) | ((hi_val1 & 0xFF) << 16) | ((hi_val2 & 0xFF) << 24));
                 return 1;
             }
             else {
diff --git a/src/core/hw/gpu.cpp b/src/core/hw/gpu.cpp
index dd619cb16..0ff6c6cde 100644
--- a/src/core/hw/gpu.cpp
+++ b/src/core/hw/gpu.cpp
@@ -94,11 +94,15 @@ inline void Write(u32 addr, const T data) {
                         int r, g, b, a;
                     } source_color = { 0, 0, 0, 0 };
 
+                    // Cheap emulation of horizontal scaling: Just skip each second pixel of the
+                    // input framebuffer. We keep track of this in the pixel_skip variable.
+                    unsigned pixel_skip = (config.scale_horizontally != 0) ? 2 : 1;
+
                     switch (config.input_format) {
                     case Regs::PixelFormat::RGBA8:
                     {
                         // TODO: Most likely got the component order messed up.
-                        u8* srcptr = source_pointer + x * 4 + y * config.input_width * 4;
+                        u8* srcptr = source_pointer + x * 4 * pixel_skip + y * config.input_width * 4 * pixel_skip;
                         source_color.r = srcptr[0]; // blue
                         source_color.g = srcptr[1]; // green
                         source_color.b = srcptr[2]; // red
diff --git a/src/core/hw/gpu.h b/src/core/hw/gpu.h
index 292f496c1..7de055232 100644
--- a/src/core/hw/gpu.h
+++ b/src/core/hw/gpu.h
@@ -157,6 +157,9 @@ struct Regs {
             BitField< 8, 3, PixelFormat> input_format;
             BitField<12, 3, PixelFormat> output_format;
             BitField<16, 1, u32> output_tiled;     // stores output in a tiled format
+
+            // TODO: Not really sure if this actually scales, or even resizes at all.
+            BitField<24, 1, u32> scale_horizontally;
         };
 
         INSERT_PADDING_WORDS(0x1);
diff --git a/src/video_core/color.h b/src/video_core/color.h
new file mode 100644
index 000000000..e86ac1265
--- /dev/null
+++ b/src/video_core/color.h
@@ -0,0 +1,32 @@
+// Copyright 2014 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include "common/common_types.h"
+
+namespace Color {
+
+/// Convert a 1-bit color component to 8 bit
+static inline u8 Convert1To8(u8 value) {
+    return value * 255;
+}
+
+/// Convert a 4-bit color component to 8 bit
+static inline u8 Convert4To8(u8 value) {
+    return (value << 4) | value;
+}
+
+/// Convert a 5-bit color component to 8 bit
+static inline u8 Convert5To8(u8 value) {
+    return (value << 3) | (value >> 2);
+}
+
+/// Convert a 6-bit color component to 8 bit
+static inline u8 Convert6To8(u8 value) {
+    return (value << 2) | (value >> 4);
+}
+
+
+} // namespace
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index 9602779f4..0d9f4ba66 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -112,6 +112,11 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
                 // Initialize data for the current vertex
                 VertexShader::InputVertex input;
 
+                // Load a debugging token to check whether this gets loaded by the running
+                // application or not.
+                static const float24 debug_token = float24::FromRawFloat24(0x00abcdef);
+                input.attr[0].w = debug_token;
+
                 for (int i = 0; i < attribute_config.GetNumTotalAttributes(); ++i) {
                     for (unsigned int comp = 0; comp < vertex_attribute_elements[i]; ++comp) {
                         const u8* srcdata = Memory::GetPointer(PAddrToVAddr(vertex_attribute_sources[i] + vertex_attribute_strides[i] * vertex + comp * vertex_attribute_element_size[i]));
@@ -136,6 +141,16 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
                     }
                 }
 
+                // HACK: Some games do not initialize the vertex position's w component. This leads
+                //       to critical issues since it messes up perspective division. As a
+                //       workaround, we force the fourth component to 1.0 if we find this to be the
+                //       case.
+                //       To do this, we additionally have to assume that the first input attribute
+                //       is the vertex position, since there's no information about this other than
+                //       the empiric observation that this is usually the case.
+                if (input.attr[0].w == debug_token)
+                    input.attr[0].w = float24::FromFloat32(1.0);
+
                 if (g_debug_context)
                     g_debug_context->OnEvent(DebugContext::Event::VertexLoaded, (void*)&input);
 
@@ -173,6 +188,19 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
 
             break;
 
+        case PICA_REG_INDEX_WORKAROUND(vs_int_uniforms[0], 0x2b1):
+        case PICA_REG_INDEX_WORKAROUND(vs_int_uniforms[1], 0x2b2):
+        case PICA_REG_INDEX_WORKAROUND(vs_int_uniforms[2], 0x2b3):
+        case PICA_REG_INDEX_WORKAROUND(vs_int_uniforms[3], 0x2b4):
+        {
+            int index = (id - PICA_REG_INDEX_WORKAROUND(vs_int_uniforms[0], 0x2b1));
+            auto values = registers.vs_int_uniforms[index];
+            VertexShader::GetIntUniform(index) = Math::Vec4<u8>(values.x, values.y, values.z, values.w);
+            LOG_TRACE(HW_GPU, "Set integer uniform %d to %02x %02x %02x %02x",
+                      index, values.x.Value(), values.y.Value(), values.z.Value(), values.w.Value());
+            break;
+        }
+
         case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[0], 0x2c1):
         case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[1], 0x2c2):
         case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[2], 0x2c3):
diff --git a/src/video_core/debug_utils/debug_utils.cpp b/src/video_core/debug_utils/debug_utils.cpp
index 5921185a6..a494465b9 100644
--- a/src/video_core/debug_utils/debug_utils.cpp
+++ b/src/video_core/debug_utils/debug_utils.cpp
@@ -19,6 +19,7 @@
 #include "common/log.h"
 #include "common/file_util.h"
 
+#include "video_core/color.h"
 #include "video_core/math.h"
 #include "video_core/pica.h"
 
@@ -359,29 +360,26 @@ const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const Texture
         u8 g = ((source_ptr) >> 6) & 0x1F;
         u8 b = (source_ptr >> 1) & 0x1F;
         u8 a = source_ptr & 1;
-        return Math::MakeVec<u8>((r << 3) | (r >> 2), (g << 3) | (g >> 2), (b << 3) | (b >> 2), disable_alpha ? 255 : (a * 255));
+        return Math::MakeVec<u8>(Color::Convert5To8(r), Color::Convert5To8(g),
+                                 Color::Convert5To8(b), disable_alpha ? 255 : Color::Convert1To8(a));
     }
 
     case Regs::TextureFormat::RGB565:
     {
         const u16 source_ptr = *(const u16*)(source + offset * 2);
-        u8 r = (source_ptr >> 11) & 0x1F;
-        u8 g = ((source_ptr) >> 5) & 0x3F;
-        u8 b = (source_ptr) & 0x1F;
-        return Math::MakeVec<u8>((r << 3) | (r >> 2), (g << 2) | (g >> 4), (b << 3) | (b >> 2), 255);
+        u8 r = Color::Convert5To8((source_ptr >> 11) & 0x1F);
+        u8 g = Color::Convert6To8(((source_ptr) >> 5) & 0x3F);
+        u8 b = Color::Convert5To8((source_ptr) & 0x1F);
+        return Math::MakeVec<u8>(r, g, b, 255);
     }
 
     case Regs::TextureFormat::RGBA4:
     {
         const u8* source_ptr = source + offset * 2;
-        u8 r = source_ptr[1] >> 4;
-        u8 g = source_ptr[1] & 0xFF;
-        u8 b = source_ptr[0] >> 4;
-        u8 a = source_ptr[0] & 0xFF;
-        r = (r << 4) | r;
-        g = (g << 4) | g;
-        b = (b << 4) | b;
-        a = (a << 4) | a;
+        u8 r = Color::Convert4To8(source_ptr[1] >> 4);
+        u8 g = Color::Convert4To8(source_ptr[1] & 0xF);
+        u8 b = Color::Convert4To8(source_ptr[0] >> 4);
+        u8 a = Color::Convert4To8(source_ptr[0] & 0xF);
         return { r, g, b, disable_alpha ? (u8)255 : a };
     }
 
@@ -389,13 +387,11 @@ const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const Texture
     {
         const u8* source_ptr = source + offset * 2;
 
-        // TODO: component order not verified
-
         if (disable_alpha) {
             // Show intensity as red, alpha as green
-            return { source_ptr[0], source_ptr[1], 0, 255 };
+            return { source_ptr[1], source_ptr[0], 0, 255 };
         } else {
-            return { source_ptr[0], source_ptr[0], source_ptr[0], source_ptr[1]};
+            return { source_ptr[1], source_ptr[1], source_ptr[1], source_ptr[0]};
         }
     }
 
@@ -418,14 +414,10 @@ const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const Texture
 
     case Regs::TextureFormat::IA4:
     {
-        const u8* source_ptr = source + offset / 2;
-
-        // TODO: component order not verified
+        const u8* source_ptr = source + offset;
 
-        u8 i = (*source_ptr) & 0xF;
-        u8 a = ((*source_ptr) & 0xF0) >> 4;
-        a |= a << 4;
-        i |= i << 4;
+        u8 i = Color::Convert4To8(((*source_ptr) & 0xF0) >> 4);
+        u8 a = Color::Convert4To8((*source_ptr) & 0xF);
 
         if (disable_alpha) {
             // Show intensity as red, alpha as green
@@ -439,15 +431,13 @@ const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const Texture
     {
         const u8* source_ptr = source + offset / 2;
 
-        // TODO: component order not verified
-
         u8 a = (coarse_x % 2) ? ((*source_ptr)&0xF) : (((*source_ptr) & 0xF0) >> 4);
-        a |= a << 4;
+        a = Color::Convert4To8(a);
 
         if (disable_alpha) {
-            return { *source_ptr, *source_ptr, *source_ptr, 255 };
+            return { a, a, a, 255 };
         } else {
-            return { 0, 0, 0, *source_ptr };
+            return { 0, 0, 0, a };
         }
     }
 
diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index 38bac748c..f5771ed84 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -50,7 +50,19 @@ struct Regs {
 
     u32 trigger_irq;
 
-    INSERT_PADDING_WORDS(0x30);
+    INSERT_PADDING_WORDS(0x2f);
+
+    enum class CullMode : u32 {
+        // Select which polygons are considered to be "frontfacing".
+        KeepAll              = 0,
+        KeepClockWise        = 1,
+        KeepCounterClockWise = 2,
+        // TODO: What does the third value imply?
+    };
+
+    union {
+        BitField<0, 2, CullMode> cull_mode;
+    };
 
     BitField<0, 24, u32> viewport_size_x;
 
@@ -289,7 +301,7 @@ struct Regs {
     TevStageConfig tev_stage4;
     INSERT_PADDING_WORDS(0x3);
     TevStageConfig tev_stage5;
-    INSERT_PADDING_WORDS(0x13);
+    INSERT_PADDING_WORDS(0x3);
 
     const std::array<Regs::TevStageConfig,6> GetTevStages() const {
         return { tev_stage0, tev_stage1,
@@ -298,6 +310,60 @@ struct Regs {
     };
 
     struct {
+        enum DepthFunc : u32 {
+            Always      = 1,
+            LessThan    = 4,
+            GreaterThan = 6,
+        };
+
+        union {
+            // If false, logic blending is used
+            BitField<8, 1, u32> alphablend_enable;
+        };
+
+        union {
+            enum BlendEquation : u32 {
+                Add = 0,
+            };
+
+            enum BlendFactor : u32 {
+                Zero = 0,
+                One = 1,
+
+                SourceAlpha = 6,
+                OneMinusSourceAlpha = 7,
+            };
+
+            BitField< 0, 8, BlendEquation> blend_equation_rgb;
+            BitField< 8, 8, BlendEquation> blend_equation_a;
+
+            BitField<16, 4, BlendFactor> factor_source_rgb;
+            BitField<20, 4, BlendFactor> factor_dest_rgb;
+
+            BitField<24, 4, BlendFactor> factor_source_a;
+            BitField<28, 4, BlendFactor> factor_dest_a;
+        } alpha_blending;
+
+        union {
+            enum Op {
+                Set = 4,
+            };
+
+            BitField<0, 4, Op> op;
+        } logic_op;
+
+        INSERT_PADDING_WORDS(0x4);
+
+        union {
+            BitField< 0, 1, u32> depth_test_enable;
+            BitField< 4, 3, DepthFunc> depth_test_func;
+            BitField<12, 1, u32> depth_write_enable;
+        };
+
+        INSERT_PADDING_WORDS(0x8);
+    } output_merger;
+
+    struct {
         enum ColorFormat : u32 {
             RGBA8    = 0,
             RGB8     = 1,
@@ -495,8 +561,14 @@ struct Regs {
     INSERT_PADDING_WORDS(0x51);
 
     BitField<0, 16, u32> vs_bool_uniforms;
+    union {
+        BitField< 0, 8, u32> x;
+        BitField< 8, 8, u32> y;
+        BitField<16, 8, u32> z;
+        BitField<24, 8, u32> w;
+    } vs_int_uniforms[4];
 
-    INSERT_PADDING_WORDS(0x9);
+    INSERT_PADDING_WORDS(0x5);
 
     // Offset to shader program entry point (in words)
     BitField<0, 16, u32> vs_main_offset;
@@ -599,6 +671,7 @@ struct Regs {
             } while(false)
 
         ADD_FIELD(trigger_irq);
+        ADD_FIELD(cull_mode);
         ADD_FIELD(viewport_size_x);
         ADD_FIELD(viewport_size_y);
         ADD_FIELD(viewport_depth_range);
@@ -617,6 +690,7 @@ struct Regs {
         ADD_FIELD(tev_stage3);
         ADD_FIELD(tev_stage4);
         ADD_FIELD(tev_stage5);
+        ADD_FIELD(output_merger);
         ADD_FIELD(framebuffer);
         ADD_FIELD(vertex_attributes);
         ADD_FIELD(index_array);
@@ -625,6 +699,7 @@ struct Regs {
         ADD_FIELD(trigger_draw_indexed);
         ADD_FIELD(triangle_topology);
         ADD_FIELD(vs_bool_uniforms);
+        ADD_FIELD(vs_int_uniforms);
         ADD_FIELD(vs_main_offset);
         ADD_FIELD(vs_input_register_map);
         ADD_FIELD(vs_uniform_setup);
@@ -668,6 +743,7 @@ private:
 #define ASSERT_REG_POSITION(field_name, position) static_assert(offsetof(Regs, field_name) == position * 4, "Field "#field_name" has invalid position")
 
 ASSERT_REG_POSITION(trigger_irq, 0x10);
+ASSERT_REG_POSITION(cull_mode, 0x40);
 ASSERT_REG_POSITION(viewport_size_x, 0x41);
 ASSERT_REG_POSITION(viewport_size_y, 0x43);
 ASSERT_REG_POSITION(viewport_depth_range, 0x4d);
@@ -688,6 +764,7 @@ ASSERT_REG_POSITION(tev_stage2, 0xd0);
 ASSERT_REG_POSITION(tev_stage3, 0xd8);
 ASSERT_REG_POSITION(tev_stage4, 0xf0);
 ASSERT_REG_POSITION(tev_stage5, 0xf8);
+ASSERT_REG_POSITION(output_merger, 0x100);
 ASSERT_REG_POSITION(framebuffer, 0x110);
 ASSERT_REG_POSITION(vertex_attributes, 0x200);
 ASSERT_REG_POSITION(index_array, 0x227);
@@ -696,6 +773,7 @@ ASSERT_REG_POSITION(trigger_draw, 0x22e);
 ASSERT_REG_POSITION(trigger_draw_indexed, 0x22f);
 ASSERT_REG_POSITION(triangle_topology, 0x25e);
 ASSERT_REG_POSITION(vs_bool_uniforms, 0x2b0);
+ASSERT_REG_POSITION(vs_int_uniforms, 0x2b1);
 ASSERT_REG_POSITION(vs_main_offset, 0x2ba);
 ASSERT_REG_POSITION(vs_input_register_map, 0x2bb);
 ASSERT_REG_POSITION(vs_uniform_setup, 0x2c0);
diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index a80148872..025d4e484 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -18,51 +18,82 @@ namespace Pica {
 namespace Rasterizer {
 
 static void DrawPixel(int x, int y, const Math::Vec4<u8>& color) {
-    u32* color_buffer = reinterpret_cast<u32*>(Memory::GetPointer(PAddrToVAddr(registers.framebuffer.GetColorBufferPhysicalAddress())));
+    const PAddr addr = registers.framebuffer.GetColorBufferPhysicalAddress();
+    u32* color_buffer = reinterpret_cast<u32*>(Memory::GetPointer(PAddrToVAddr(addr)));
     u32 value = (color.a() << 24) | (color.r() << 16) | (color.g() << 8) | color.b();
 
     // Assuming RGBA8 format until actual framebuffer format handling is implemented
     *(color_buffer + x + y * registers.framebuffer.GetWidth()) = value;
 }
 
+static const Math::Vec4<u8> GetPixel(int x, int y) {
+    const PAddr addr = registers.framebuffer.GetColorBufferPhysicalAddress();
+    u32* color_buffer_u32 = reinterpret_cast<u32*>(Memory::GetPointer(PAddrToVAddr(addr)));
+
+    u32 value = *(color_buffer_u32 + x + y * registers.framebuffer.GetWidth());
+    Math::Vec4<u8> ret;
+    ret.a() = value >> 24;
+    ret.r() = (value >> 16) & 0xFF;
+    ret.g() = (value >> 8) & 0xFF;
+    ret.b() = value & 0xFF;
+    return ret;
+ }
+
 static u32 GetDepth(int x, int y) {
-    u16* depth_buffer = reinterpret_cast<u16*>(Memory::GetPointer(PAddrToVAddr(registers.framebuffer.GetDepthBufferPhysicalAddress())));
+    const PAddr addr = registers.framebuffer.GetDepthBufferPhysicalAddress();
+    u16* depth_buffer = reinterpret_cast<u16*>(Memory::GetPointer(PAddrToVAddr(addr)));
 
     // Assuming 16-bit depth buffer format until actual format handling is implemented
     return *(depth_buffer + x + y * registers.framebuffer.GetWidth());
 }
 
 static void SetDepth(int x, int y, u16 value) {
-    u16* depth_buffer = reinterpret_cast<u16*>(Memory::GetPointer(PAddrToVAddr(registers.framebuffer.GetDepthBufferPhysicalAddress())));
+    const PAddr addr = registers.framebuffer.GetDepthBufferPhysicalAddress();
+    u16* depth_buffer = reinterpret_cast<u16*>(Memory::GetPointer(PAddrToVAddr(addr)));
 
     // Assuming 16-bit depth buffer format until actual format handling is implemented
     *(depth_buffer + x + y * registers.framebuffer.GetWidth()) = value;
 }
 
-void ProcessTriangle(const VertexShader::OutputVertex& v0,
-                     const VertexShader::OutputVertex& v1,
-                     const VertexShader::OutputVertex& v2)
-{
-    // NOTE: Assuming that rasterizer coordinates are 12.4 fixed-point values
-    struct Fix12P4 {
-        Fix12P4() {}
-        Fix12P4(u16 val) : val(val) {}
+// NOTE: Assuming that rasterizer coordinates are 12.4 fixed-point values
+struct Fix12P4 {
+    Fix12P4() {}
+    Fix12P4(u16 val) : val(val) {}
 
-        static u16 FracMask() { return 0xF; }
-        static u16 IntMask() { return (u16)~0xF; }
+    static u16 FracMask() { return 0xF; }
+    static u16 IntMask() { return (u16)~0xF; }
 
-        operator u16() const {
-            return val;
-        }
+    operator u16() const {
+        return val;
+    }
 
-        bool operator < (const Fix12P4& oth) const {
-            return (u16)*this < (u16)oth;
-        }
+    bool operator < (const Fix12P4& oth) const {
+        return (u16)*this < (u16)oth;
+    }
 
-    private:
-        u16 val;
-    };
+private:
+    u16 val;
+};
+
+/**
+ * Calculate signed area of the triangle spanned by the three argument vertices.
+ * The sign denotes an orientation.
+ *
+ * @todo define orientation concretely.
+ */
+static int SignedArea (const Math::Vec2<Fix12P4>& vtx1,
+                       const Math::Vec2<Fix12P4>& vtx2,
+                       const Math::Vec2<Fix12P4>& vtx3) {
+    const auto vec1 = Math::MakeVec(vtx2 - vtx1, 0);
+    const auto vec2 = Math::MakeVec(vtx3 - vtx1, 0);
+    // TODO: There is a very small chance this will overflow for sizeof(int) == 4
+    return Math::Cross(vec1, vec2).z;
+};
 
+void ProcessTriangle(const VertexShader::OutputVertex& v0,
+                     const VertexShader::OutputVertex& v1,
+                     const VertexShader::OutputVertex& v2)
+{
     // vertex positions in rasterizer coordinates
     auto FloatToFix = [](float24 flt) {
                           return Fix12P4(static_cast<unsigned short>(flt.ToFloat32() * 16.0f));
@@ -70,10 +101,23 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
     auto ScreenToRasterizerCoordinates = [FloatToFix](const Math::Vec3<float24> vec) {
                                              return Math::Vec3<Fix12P4>{FloatToFix(vec.x), FloatToFix(vec.y), FloatToFix(vec.z)};
                                          };
+
     Math::Vec3<Fix12P4> vtxpos[3]{ ScreenToRasterizerCoordinates(v0.screenpos),
                                    ScreenToRasterizerCoordinates(v1.screenpos),
                                    ScreenToRasterizerCoordinates(v2.screenpos) };
 
+    if (registers.cull_mode == Regs::CullMode::KeepClockWise) {
+        // Reverse vertex order and use the CCW code path.
+        std::swap(vtxpos[1], vtxpos[2]);
+    }
+
+    if (registers.cull_mode != Regs::CullMode::KeepAll) {
+        // Cull away triangles which are wound clockwise.
+        // TODO: A check for degenerate triangles ("== 0") should be considered for CullMode::KeepAll
+        if (SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) <= 0)
+            return;
+    }
+
     // TODO: Proper scissor rect test!
     u16 min_x = std::min({vtxpos[0].x, vtxpos[1].x, vtxpos[2].x});
     u16 min_y = std::min({vtxpos[0].y, vtxpos[1].y, vtxpos[2].y});
@@ -116,18 +160,9 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
         for (u16 x = min_x; x < max_x; x += 0x10) {
 
             // Calculate the barycentric coordinates w0, w1 and w2
-            auto orient2d = [](const Math::Vec2<Fix12P4>& vtx1,
-                               const Math::Vec2<Fix12P4>& vtx2,
-                               const Math::Vec2<Fix12P4>& vtx3) {
-                const auto vec1 = Math::MakeVec(vtx2 - vtx1, 0);
-                const auto vec2 = Math::MakeVec(vtx3 - vtx1, 0);
-                // TODO: There is a very small chance this will overflow for sizeof(int) == 4
-                return Math::Cross(vec1, vec2).z;
-            };
-
-            int w0 = bias0 + orient2d(vtxpos[1].xy(), vtxpos[2].xy(), {x, y});
-            int w1 = bias1 + orient2d(vtxpos[2].xy(), vtxpos[0].xy(), {x, y});
-            int w2 = bias2 + orient2d(vtxpos[0].xy(), vtxpos[1].xy(), {x, y});
+            int w0 = bias0 + SignedArea(vtxpos[1].xy(), vtxpos[2].xy(), {x, y});
+            int w1 = bias1 + SignedArea(vtxpos[2].xy(), vtxpos[0].xy(), {x, y});
+            int w2 = bias2 + SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), {x, y});
             int wsum = w0 + w1 + w2;
 
             // If current pixel is not covered by the current primitive
@@ -201,8 +236,8 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
                             return 0;
                     }
                 };
-                s = GetWrappedTexCoord(registers.texture0.wrap_s, s, registers.texture0.width);
-                t = GetWrappedTexCoord(registers.texture0.wrap_t, t, registers.texture0.height);
+                s = GetWrappedTexCoord(texture.config.wrap_s, s, texture.config.width);
+                t = texture.config.height - 1 - GetWrappedTexCoord(texture.config.wrap_t, t, texture.config.height);
 
                 u8* texture_data = Memory::GetPointer(PAddrToVAddr(texture.config.GetPhysicalAddress()));
                 auto info = DebugUtils::TextureInfo::FromPicaRegister(texture.config, texture.format);
@@ -279,12 +314,15 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
                     }
                 };
 
-                auto GetColorModifier = [](ColorModifier factor, const Math::Vec4<u8>& values) -> Math::Vec3<u8> {
+                static auto GetColorModifier = [](ColorModifier factor, const Math::Vec4<u8>& values) -> Math::Vec3<u8> {
                     switch (factor)
                     {
                     case ColorModifier::SourceColor:
                         return values.rgb();
 
+                    case ColorModifier::OneMinusSourceColor:
+                        return (Math::Vec3<u8>(255, 255, 255) - values.rgb()).Cast<u8>();
+
                     case ColorModifier::SourceAlpha:
                         return { values.a(), values.a(), values.a() };
 
@@ -295,7 +333,7 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
                     }
                 };
 
-                auto GetAlphaModifier = [](AlphaModifier factor, u8 value) -> u8 {
+                static auto GetAlphaModifier = [](AlphaModifier factor, u8 value) -> u8 {
                     switch (factor) {
                     case AlphaModifier::SourceAlpha:
                         return value;
@@ -310,7 +348,7 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
                     }
                 };
 
-                auto ColorCombine = [](Operation op, const Math::Vec3<u8> input[3]) -> Math::Vec3<u8> {
+                static auto ColorCombine = [](Operation op, const Math::Vec3<u8> input[3]) -> Math::Vec3<u8> {
                     switch (op) {
                     case Operation::Replace:
                         return input[0];
@@ -330,6 +368,15 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
                     case Operation::Lerp:
                         return ((input[0] * input[2] + input[1] * (Math::MakeVec<u8>(255, 255, 255) - input[2]).Cast<u8>()) / 255).Cast<u8>();
 
+                    case Operation::Subtract:
+                    {
+                        auto result = input[0].Cast<int>() - input[1].Cast<int>();
+                        result.r() = std::max(0, result.r());
+                        result.g() = std::max(0, result.g());
+                        result.b() = std::max(0, result.b());
+                        return result.Cast<u8>();
+                    }
+
                     default:
                         LOG_ERROR(HW_GPU, "Unknown color combiner operation %d\n", (int)op);
                         _dbg_assert_(HW_GPU, 0);
@@ -337,7 +384,7 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
                     }
                 };
 
-                auto AlphaCombine = [](Operation op, const std::array<u8,3>& input) -> u8 {
+                static auto AlphaCombine = [](Operation op, const std::array<u8,3>& input) -> u8 {
                     switch (op) {
                     case Operation::Replace:
                         return input[0];
@@ -351,6 +398,9 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
                     case Operation::Lerp:
                         return (input[0] * input[2] + input[1] * (255 - input[2])) / 255;
 
+                    case Operation::Subtract:
+                        return std::max(0, (int)input[0] - (int)input[1]);
+
                     default:
                         LOG_ERROR(HW_GPU, "Unknown alpha combiner operation %d\n", (int)op);
                         _dbg_assert_(HW_GPU, 0);
@@ -381,12 +431,111 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
                 combiner_output = Math::MakeVec(color_output, alpha_output);
             }
 
-            // TODO: Not sure if the multiplication by 65535 has already been taken care
-            // of when transforming to screen coordinates or not.
-            u16 z = (u16)(((float)v0.screenpos[2].ToFloat32() * w0 +
-                           (float)v1.screenpos[2].ToFloat32() * w1 +
-                           (float)v2.screenpos[2].ToFloat32() * w2) * 65535.f / wsum);
-            SetDepth(x >> 4, y >> 4, z);
+            // TODO: Does depth indeed only get written even if depth testing is enabled?
+            if (registers.output_merger.depth_test_enable) {
+                u16 z = (u16)(-(v0.screenpos[2].ToFloat32() * w0 +
+                            v1.screenpos[2].ToFloat32() * w1 +
+                            v2.screenpos[2].ToFloat32() * w2) * 65535.f / wsum);
+                u16 ref_z = GetDepth(x >> 4, y >> 4);
+
+                bool pass = false;
+
+                switch (registers.output_merger.depth_test_func) {
+                case registers.output_merger.Always:
+                    pass = true;
+                    break;
+
+                case registers.output_merger.LessThan:
+                    pass = z < ref_z;
+                    break;
+
+                case registers.output_merger.GreaterThan:
+                    pass = z > ref_z;
+                    break;
+
+                default:
+                    LOG_ERROR(HW_GPU, "Unknown depth test function %x", registers.output_merger.depth_test_func.Value());
+                    break;
+                }
+
+                if (!pass)
+                    continue;
+
+                if (registers.output_merger.depth_write_enable)
+                    SetDepth(x >> 4, y >> 4, z);
+            }
+
+            auto dest = GetPixel(x >> 4, y >> 4);
+
+            if (registers.output_merger.alphablend_enable) {
+                auto params = registers.output_merger.alpha_blending;
+
+                auto LookupFactorRGB = [&](decltype(params)::BlendFactor factor) -> Math::Vec3<u8> {
+                    switch(factor) {
+                    case params.Zero:
+                        return Math::Vec3<u8>(0, 0, 0);
+
+                    case params.One:
+                        return Math::Vec3<u8>(255, 255, 255);
+
+                    case params.SourceAlpha:
+                        return Math::MakeVec(combiner_output.a(), combiner_output.a(), combiner_output.a());
+
+                    case params.OneMinusSourceAlpha:
+                        return Math::Vec3<u8>(255-combiner_output.a(), 255-combiner_output.a(), 255-combiner_output.a());
+
+                    default:
+                        LOG_CRITICAL(HW_GPU, "Unknown color blend factor %x", factor);
+                        exit(0);
+                        break;
+                    }
+                };
+
+                auto LookupFactorA = [&](decltype(params)::BlendFactor factor) -> u8 {
+                    switch(factor) {
+                    case params.Zero:
+                        return 0;
+
+                    case params.One:
+                        return 255;
+
+                    case params.SourceAlpha:
+                        return combiner_output.a();
+
+                    case params.OneMinusSourceAlpha:
+                        return 255 - combiner_output.a();
+
+                    default:
+                        LOG_CRITICAL(HW_GPU, "Unknown alpha blend factor %x", factor);
+                        exit(0);
+                        break;
+                    }
+                };
+
+                auto srcfactor = Math::MakeVec(LookupFactorRGB(params.factor_source_rgb),
+                                               LookupFactorA(params.factor_source_a));
+                auto dstfactor = Math::MakeVec(LookupFactorRGB(params.factor_dest_rgb),
+                                               LookupFactorA(params.factor_dest_a));
+
+                switch (params.blend_equation_rgb) {
+                case params.Add:
+                {
+                    auto result = (combiner_output * srcfactor + dest * dstfactor) / 255;
+                    result.r() = std::min(255, result.r());
+                    result.g() = std::min(255, result.g());
+                    result.b() = std::min(255, result.b());
+                    combiner_output = result.Cast<u8>();
+                    break;
+                }
+
+                default:
+                    LOG_CRITICAL(HW_GPU, "Unknown RGB blend equation %x", params.blend_equation_rgb.Value());
+                    exit(0);
+                }
+            } else {
+                LOG_CRITICAL(HW_GPU, "logic op: %x", registers.output_merger.logic_op);
+                exit(0);
+            }
 
             DrawPixel(x >> 4, y >> 4, combiner_output);
         }
diff --git a/src/video_core/utils.h b/src/video_core/utils.h
index 63ebccbde..6fd640425 100644
--- a/src/video_core/utils.h
+++ b/src/video_core/utils.h
@@ -8,32 +8,6 @@
 
 #include "common/common_types.h"
 
-namespace FormatPrecision {
-
-/// Adjust RGBA8 color with RGBA6 precision
-static inline u32 rgba8_with_rgba6(u32 src) {
-    u32 color = src;
-    color &= 0xFCFCFCFC;
-    color |= (color >> 6) & 0x03030303;
-    return color;
-}
-
-/// Adjust RGBA8 color with RGB565 precision
-static inline u32 rgba8_with_rgb565(u32 src) {
-    u32 color = (src & 0xF8FCF8);
-    color |= (color >> 5) & 0x070007;
-    color |= (color >> 6) & 0x000300;
-    color |= 0xFF000000;
-    return color;
-}
-
-/// Adjust Z24 depth value with Z16 precision
-static inline u32 z24_with_z16(u32 src) {
-    return (src & 0xFFFF00) | (src >> 16);
-}
-
-} // namespace
-
 namespace VideoCore {
 
 /// Structure for the TGA texture format (for dumping)
diff --git a/src/video_core/vertex_shader.cpp b/src/video_core/vertex_shader.cpp
index bed5081a0..ff825e2e1 100644
--- a/src/video_core/vertex_shader.cpp
+++ b/src/video_core/vertex_shader.cpp
@@ -30,6 +30,8 @@ static struct {
     Math::Vec4<float24> f[96];
 
     std::array<bool,16> b;
+
+    std::array<Math::Vec4<u8>,4> i;
 } shader_uniforms;
 
 // TODO: Not sure where the shader binary and swizzle patterns are supposed to be loaded to!
@@ -37,33 +39,31 @@ static struct {
 static std::array<u32, 1024> shader_memory;
 static std::array<u32, 1024> swizzle_data;
 
-void SubmitShaderMemoryChange(u32 addr, u32 value)
-{
+void SubmitShaderMemoryChange(u32 addr, u32 value) {
     shader_memory[addr] = value;
 }
 
-void SubmitSwizzleDataChange(u32 addr, u32 value)
-{
+void SubmitSwizzleDataChange(u32 addr, u32 value) {
     swizzle_data[addr] = value;
 }
 
-Math::Vec4<float24>& GetFloatUniform(u32 index)
-{
+Math::Vec4<float24>& GetFloatUniform(u32 index) {
     return shader_uniforms.f[index];
 }
 
-bool& GetBoolUniform(u32 index)
-{
+bool& GetBoolUniform(u32 index) {
     return shader_uniforms.b[index];
 }
 
-const std::array<u32, 1024>& GetShaderBinary()
-{
+Math::Vec4<u8>& GetIntUniform(u32 index) {
+    return shader_uniforms.i[index];
+}
+
+const std::array<u32, 1024>& GetShaderBinary() {
     return shader_memory;
 }
 
-const std::array<u32, 1024>& GetSwizzlePatterns()
-{
+const std::array<u32, 1024>& GetSwizzlePatterns() {
     return swizzle_data;
 }
 
@@ -437,8 +437,7 @@ static void ProcessShaderCode(VertexShaderState& state) {
     }
 }
 
-OutputVertex RunShader(const InputVertex& input, int num_attributes)
-{
+OutputVertex RunShader(const InputVertex& input, int num_attributes) {
     VertexShaderState state;
 
     const u32* main = &shader_memory[registers.vs_main_offset];
diff --git a/src/video_core/vertex_shader.h b/src/video_core/vertex_shader.h
index af3fb2a2f..3a68a3409 100644
--- a/src/video_core/vertex_shader.h
+++ b/src/video_core/vertex_shader.h
@@ -73,6 +73,7 @@ OutputVertex RunShader(const InputVertex& input, int num_attributes);
 
 Math::Vec4<float24>& GetFloatUniform(u32 index);
 bool& GetBoolUniform(u32 index);
+Math::Vec4<u8>& GetIntUniform(u32 index);
 
 const std::array<u32, 1024>& GetShaderBinary();
 const std::array<u32, 1024>& GetSwizzlePatterns();