25 files changed, 878 insertions, 513 deletions
diff --git a/src/video_core/clipper.cpp b/src/video_core/clipper.cpp
index 2bc747102..db99ce666 100644
--- a/src/video_core/clipper.cpp
+++ b/src/video_core/clipper.cpp
@@ -75,8 +75,6 @@ static void InitScreenCoordinates(OutputVertex& vtx)
     viewport.halfsize_y = float24::FromRaw(regs.viewport_size_y);
     viewport.offset_x   = float24::FromFloat32(static_cast<float>(regs.viewport_corner.x));
     viewport.offset_y   = float24::FromFloat32(static_cast<float>(regs.viewport_corner.y));
-    viewport.zscale     = float24::FromRaw(regs.viewport_depth_range);
-    viewport.offset_z   = float24::FromRaw(regs.viewport_depth_far_plane);
 
     float24 inv_w = float24::FromFloat32(1.f) / vtx.pos.w;
     vtx.color *= inv_w;
@@ -89,7 +87,7 @@ static void InitScreenCoordinates(OutputVertex& vtx)
 
     vtx.screenpos[0] = (vtx.pos.x * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_x + viewport.offset_x;
     vtx.screenpos[1] = (vtx.pos.y * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_y + viewport.offset_y;
-    vtx.screenpos[2] = viewport.offset_z + vtx.pos.z * inv_w * viewport.zscale;
+    vtx.screenpos[2] = vtx.pos.z * inv_w;
 }
 
 void ProcessTriangle(const OutputVertex &v0, const OutputVertex &v1, const OutputVertex &v2) {
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index dd1379503..19e03adf4 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -128,7 +128,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
 
                 // TODO: Verify that this actually modifies the register!
                 if (setup.index < 15) {
-                    g_state.vs.default_attributes[setup.index] = attribute;
+                    g_state.vs_default_attributes[setup.index] = attribute;
                     setup.index++;
                 } else {
                     // Put each attribute into an immediate input buffer.
@@ -144,12 +144,13 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
                         immediate_attribute_id = 0;
 
                         Shader::UnitState<false> shader_unit;
-                        Shader::Setup();
+                        g_state.vs.Setup();
 
                         // Send to vertex shader
                         if (g_debug_context)
                             g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation, static_cast<void*>(&immediate_input));
-                        Shader::OutputVertex output = Shader::Run(shader_unit, immediate_input, regs.vs.num_input_attributes+1);
+                        g_state.vs.Run(shader_unit, immediate_input, regs.vs.num_input_attributes+1);
+                        Shader::OutputVertex output_vertex = shader_unit.output_registers.ToVertex(regs.vs);
 
                         // Send to renderer
                         using Pica::Shader::OutputVertex;
@@ -157,7 +158,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
                             VideoCore::g_renderer->Rasterizer()->AddTriangle(v0, v1, v2);
                         };
 
-                        g_state.primitive_assembler.SubmitVertex(output, AddTriangle);
+                        g_state.primitive_assembler.SubmitVertex(output_vertex, AddTriangle);
                     }
                 }
             }
@@ -199,9 +200,8 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
 
             // Processes information about internal vertex attributes to figure out how a vertex is loaded.
             // Later, these can be compiled and cached.
-            VertexLoader loader;
             const u32 base_address = regs.vertex_attributes.GetPhysicalBaseAddress();
-            loader.Setup(regs);
+            VertexLoader loader(regs);
 
             // Load vertices
             bool is_indexed = (id == PICA_REG_INDEX(trigger_draw_indexed));
@@ -231,13 +231,13 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
             // The size has been tuned for optimal balance between hit-rate and the cost of lookup
             const size_t VERTEX_CACHE_SIZE = 32;
             std::array<u16, VERTEX_CACHE_SIZE> vertex_cache_ids;
-            std::array<Shader::OutputVertex, VERTEX_CACHE_SIZE> vertex_cache;
+            std::array<Shader::OutputRegisters, VERTEX_CACHE_SIZE> vertex_cache;
 
             unsigned int vertex_cache_pos = 0;
             vertex_cache_ids.fill(-1);
 
             Shader::UnitState<false> shader_unit;
-            Shader::Setup();
+            g_state.vs.Setup();
 
             for (unsigned int index = 0; index < regs.num_vertices; ++index)
             {
@@ -249,7 +249,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
                 ASSERT(vertex != -1);
 
                 bool vertex_cache_hit = false;
-                Shader::OutputVertex output;
+                Shader::OutputRegisters output_registers;
 
                 if (is_indexed) {
                     if (g_debug_context && Pica::g_debug_context->recorder) {
@@ -259,7 +259,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
 
                     for (unsigned int i = 0; i < VERTEX_CACHE_SIZE; ++i) {
                         if (vertex == vertex_cache_ids[i]) {
-                            output = vertex_cache[i];
+                            output_registers = vertex_cache[i];
                             vertex_cache_hit = true;
                             break;
                         }
@@ -274,15 +274,19 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
                     // Send to vertex shader
                     if (g_debug_context)
                         g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation, (void*)&input);
-                    output = Shader::Run(shader_unit, input, loader.GetNumTotalAttributes());
+                    g_state.vs.Run(shader_unit, input, loader.GetNumTotalAttributes());
+                    output_registers = shader_unit.output_registers;
 
                     if (is_indexed) {
-                        vertex_cache[vertex_cache_pos] = output;
+                        vertex_cache[vertex_cache_pos] = output_registers;
                         vertex_cache_ids[vertex_cache_pos] = vertex;
                         vertex_cache_pos = (vertex_cache_pos + 1) % VERTEX_CACHE_SIZE;
                     }
                 }
 
+                // Retrieve vertex from register data
+                Shader::OutputVertex output_vertex = output_registers.ToVertex(regs.vs);
+
                 // Send to renderer
                 using Pica::Shader::OutputVertex;
                 auto AddTriangle = [](
@@ -290,7 +294,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
                     VideoCore::g_renderer->Rasterizer()->AddTriangle(v0, v1, v2);
                 };
 
-                primitive_assembler.SubmitVertex(output, AddTriangle);
+                primitive_assembler.SubmitVertex(output_vertex, AddTriangle);
             }
 
             for (auto& range : memory_accesses.ranges) {
diff --git a/src/video_core/debug_utils/debug_utils.cpp b/src/video_core/debug_utils/debug_utils.cpp
index fb20f81dd..871368323 100644
--- a/src/video_core/debug_utils/debug_utils.cpp
+++ b/src/video_core/debug_utils/debug_utils.cpp
@@ -208,11 +208,12 @@ void DumpShader(const std::string& filename, const Regs::ShaderConfig& config, c
 
     // TODO: Reduce the amount of binary code written to relevant portions
     dvlp.binary_offset = write_offset - dvlp_offset;
-    dvlp.binary_size_words = setup.program_code.size();
-    QueueForWriting(reinterpret_cast<const u8*>(setup.program_code.data()), setup.program_code.size() * sizeof(u32));
+    dvlp.binary_size_words = static_cast<uint32_t>(setup.program_code.size());
+    QueueForWriting(reinterpret_cast<const u8*>(setup.program_code.data()),
+                    static_cast<u32>(setup.program_code.size()) * sizeof(u32));
 
     dvlp.swizzle_info_offset = write_offset - dvlp_offset;
-    dvlp.swizzle_info_num_entries = setup.swizzle_data.size();
+    dvlp.swizzle_info_num_entries = static_cast<uint32_t>(setup.swizzle_data.size());
     u32 dummy = 0;
     for (unsigned int i = 0; i < setup.swizzle_data.size(); ++i) {
         QueueForWriting(reinterpret_cast<const u8*>(&setup.swizzle_data[i]), sizeof(setup.swizzle_data[i]));
@@ -264,7 +265,7 @@ void DumpShader(const std::string& filename, const Regs::ShaderConfig& config, c
             constant_table.emplace_back(constant);
     }
     dvle.constant_table_offset = write_offset - dvlb.dvle_offset;
-    dvle.constant_table_size = constant_table.size();
+    dvle.constant_table_size = static_cast<uint32_t>(constant_table.size());
     for (const auto& constant : constant_table) {
         QueueForWriting(reinterpret_cast<const u8*>(&constant), sizeof(constant));
     }
@@ -695,106 +696,125 @@ finalise:
 #endif
 }
 
-void DumpTevStageConfig(const std::array<Pica::Regs::TevStageConfig,6>& stages)
-{
+static std::string ReplacePattern(const std::string& input, const std::string& pattern, const std::string& replacement) {
+    size_t start = input.find(pattern);
+    if (start == std::string::npos)
+        return input;
+
+    std::string ret = input;
+    ret.replace(start, pattern.length(), replacement);
+    return ret;
+}
+
+static std::string GetTevStageConfigSourceString(const Pica::Regs::TevStageConfig::Source& source) {
     using Source = Pica::Regs::TevStageConfig::Source;
+    static const std::map<Source, std::string> source_map = {
+        { Source::PrimaryColor,           "PrimaryColor" },
+        { Source::PrimaryFragmentColor,   "PrimaryFragmentColor" },
+        { Source::SecondaryFragmentColor, "SecondaryFragmentColor" },
+        { Source::Texture0,               "Texture0" },
+        { Source::Texture1,               "Texture1" },
+        { Source::Texture2,               "Texture2" },
+        { Source::Texture3,               "Texture3" },
+        { Source::PreviousBuffer,         "PreviousBuffer" },
+        { Source::Constant,               "Constant" },
+        { Source::Previous,               "Previous" },
+    };
+
+    const auto src_it = source_map.find(source);
+    if (src_it == source_map.end())
+        return "Unknown";
+
+    return src_it->second;
+}
+
+static std::string GetTevStageConfigColorSourceString(const Pica::Regs::TevStageConfig::Source& source, const Pica::Regs::TevStageConfig::ColorModifier modifier) {
     using ColorModifier = Pica::Regs::TevStageConfig::ColorModifier;
+    static const std::map<ColorModifier, std::string> color_modifier_map = {
+        { ColorModifier::SourceColor,         "%source.rgb" },
+        { ColorModifier::OneMinusSourceColor, "(1.0 - %source.rgb)" },
+        { ColorModifier::SourceAlpha,         "%source.aaa" },
+        { ColorModifier::OneMinusSourceAlpha, "(1.0 - %source.aaa)" },
+        { ColorModifier::SourceRed,           "%source.rrr" },
+        { ColorModifier::OneMinusSourceRed,   "(1.0 - %source.rrr)" },
+        { ColorModifier::SourceGreen,         "%source.ggg" },
+        { ColorModifier::OneMinusSourceGreen, "(1.0 - %source.ggg)" },
+        { ColorModifier::SourceBlue,          "%source.bbb" },
+        { ColorModifier::OneMinusSourceBlue,  "(1.0 - %source.bbb)" },
+    };
+
+    auto src_str = GetTevStageConfigSourceString(source);
+    auto modifier_it = color_modifier_map.find(modifier);
+    std::string modifier_str = "%source.????";
+    if (modifier_it != color_modifier_map.end())
+        modifier_str = modifier_it->second;
+
+    return ReplacePattern(modifier_str, "%source", src_str);
+}
+
+static std::string GetTevStageConfigAlphaSourceString(const Pica::Regs::TevStageConfig::Source& source, const Pica::Regs::TevStageConfig::AlphaModifier modifier) {
     using AlphaModifier = Pica::Regs::TevStageConfig::AlphaModifier;
+    static const std::map<AlphaModifier, std::string> alpha_modifier_map = {
+        { AlphaModifier::SourceAlpha,         "%source.a" },
+        { AlphaModifier::OneMinusSourceAlpha, "(1.0 - %source.a)" },
+        { AlphaModifier::SourceRed,           "%source.r" },
+        { AlphaModifier::OneMinusSourceRed,   "(1.0 - %source.r)" },
+        { AlphaModifier::SourceGreen,         "%source.g" },
+        { AlphaModifier::OneMinusSourceGreen, "(1.0 - %source.g)" },
+        { AlphaModifier::SourceBlue,          "%source.b" },
+        { AlphaModifier::OneMinusSourceBlue,  "(1.0 - %source.b)" },
+    };
+
+    auto src_str = GetTevStageConfigSourceString(source);
+    auto modifier_it = alpha_modifier_map.find(modifier);
+    std::string modifier_str = "%source.????";
+    if (modifier_it != alpha_modifier_map.end())
+        modifier_str = modifier_it->second;
+
+    return ReplacePattern(modifier_str, "%source", src_str);
+}
+
+static std::string GetTevStageConfigOperationString(const Pica::Regs::TevStageConfig::Operation& operation) {
     using Operation = Pica::Regs::TevStageConfig::Operation;
+    static const std::map<Operation, std::string> combiner_map = {
+        { Operation::Replace,         "%source1" },
+        { Operation::Modulate,        "(%source1 * %source2)" },
+        { Operation::Add,             "(%source1 + %source2)" },
+        { Operation::AddSigned,       "(%source1 + %source2) - 0.5" },
+        { Operation::Lerp,            "lerp(%source1, %source2, %source3)" },
+        { Operation::Subtract,        "(%source1 - %source2)" },
+        { Operation::Dot3_RGB,        "dot(%source1, %source2)" },
+        { Operation::MultiplyThenAdd, "((%source1 * %source2) + %source3)" },
+        { Operation::AddThenMultiply, "((%source1 + %source2) * %source3)" },
+    };
 
-    std::string stage_info = "Tev setup:\n";
-    for (size_t index = 0; index < stages.size(); ++index) {
-        const auto& tev_stage = stages[index];
+    const auto op_it = combiner_map.find(operation);
+    if (op_it == combiner_map.end())
+        return "Unknown op (%source1, %source2, %source3)";
 
-        static const std::map<Source, std::string> source_map = {
-            { Source::PrimaryColor, "PrimaryColor" },
-            { Source::Texture0, "Texture0" },
-            { Source::Texture1, "Texture1" },
-            { Source::Texture2, "Texture2" },
-            { Source::Constant, "Constant" },
-            { Source::Previous, "Previous" },
-        };
+    return op_it->second;
+}
 
-        static const std::map<ColorModifier, std::string> color_modifier_map = {
-            { ColorModifier::SourceColor, { "%source.rgb" } },
-            { ColorModifier::SourceAlpha, { "%source.aaa" } },
-        };
-        static const std::map<AlphaModifier, std::string> alpha_modifier_map = {
-            { AlphaModifier::SourceAlpha, "%source.a" },
-            { AlphaModifier::OneMinusSourceAlpha, "(255 - %source.a)" },
-        };
+std::string GetTevStageConfigColorCombinerString(const Pica::Regs::TevStageConfig& tev_stage) {
+    auto op_str = GetTevStageConfigOperationString(tev_stage.color_op);
+    op_str = ReplacePattern(op_str, "%source1", GetTevStageConfigColorSourceString(tev_stage.color_source1, tev_stage.color_modifier1));
+    op_str = ReplacePattern(op_str, "%source2", GetTevStageConfigColorSourceString(tev_stage.color_source2, tev_stage.color_modifier2));
+    return   ReplacePattern(op_str, "%source3", GetTevStageConfigColorSourceString(tev_stage.color_source3, tev_stage.color_modifier3));
+}
 
-        static const std::map<Operation, std::string> combiner_map = {
-            { Operation::Replace, "%source1" },
-            { Operation::Modulate, "(%source1 * %source2) / 255" },
-            { Operation::Add, "(%source1 + %source2)" },
-            { Operation::Lerp, "lerp(%source1, %source2, %source3)" },
-        };
+std::string GetTevStageConfigAlphaCombinerString(const Pica::Regs::TevStageConfig& tev_stage) {
+    auto op_str = GetTevStageConfigOperationString(tev_stage.alpha_op);
+    op_str = ReplacePattern(op_str, "%source1", GetTevStageConfigAlphaSourceString(tev_stage.alpha_source1, tev_stage.alpha_modifier1));
+    op_str = ReplacePattern(op_str, "%source2", GetTevStageConfigAlphaSourceString(tev_stage.alpha_source2, tev_stage.alpha_modifier2));
+    return   ReplacePattern(op_str, "%source3", GetTevStageConfigAlphaSourceString(tev_stage.alpha_source3, tev_stage.alpha_modifier3));
+}
 
-        static auto ReplacePattern =
-                [](const std::string& input, const std::string& pattern, const std::string& replacement) -> std::string {
-                    size_t start = input.find(pattern);
-                    if (start == std::string::npos)
-                        return input;
-
-                    std::string ret = input;
-                    ret.replace(start, pattern.length(), replacement);
-                    return ret;
-                };
-        static auto GetColorSourceStr =
-                [](const Source& src, const ColorModifier& modifier) {
-                    auto src_it = source_map.find(src);
-                    std::string src_str = "Unknown";
-                    if (src_it != source_map.end())
-                        src_str = src_it->second;
-
-                    auto modifier_it = color_modifier_map.find(modifier);
-                    std::string modifier_str = "%source.????";
-                    if (modifier_it != color_modifier_map.end())
-                        modifier_str = modifier_it->second;
-
-                    return ReplacePattern(modifier_str, "%source", src_str);
-                };
-        static auto GetColorCombinerStr =
-                [](const Regs::TevStageConfig& tev_stage) {
-                    auto op_it = combiner_map.find(tev_stage.color_op);
-                    std::string op_str = "Unknown op (%source1, %source2, %source3)";
-                    if (op_it != combiner_map.end())
-                        op_str = op_it->second;
-
-                    op_str = ReplacePattern(op_str, "%source1", GetColorSourceStr(tev_stage.color_source1, tev_stage.color_modifier1));
-                    op_str = ReplacePattern(op_str, "%source2", GetColorSourceStr(tev_stage.color_source2, tev_stage.color_modifier2));
-                    return   ReplacePattern(op_str, "%source3", GetColorSourceStr(tev_stage.color_source3, tev_stage.color_modifier3));
-                };
-        static auto GetAlphaSourceStr =
-                [](const Source& src, const AlphaModifier& modifier) {
-                    auto src_it = source_map.find(src);
-                    std::string src_str = "Unknown";
-                    if (src_it != source_map.end())
-                        src_str = src_it->second;
-
-                    auto modifier_it = alpha_modifier_map.find(modifier);
-                    std::string modifier_str = "%source.????";
-                    if (modifier_it != alpha_modifier_map.end())
-                        modifier_str = modifier_it->second;
-
-                    return ReplacePattern(modifier_str, "%source", src_str);
-                };
-        static auto GetAlphaCombinerStr =
-                [](const Regs::TevStageConfig& tev_stage) {
-                    auto op_it = combiner_map.find(tev_stage.alpha_op);
-                    std::string op_str = "Unknown op (%source1, %source2, %source3)";
-                    if (op_it != combiner_map.end())
-                        op_str = op_it->second;
-
-                    op_str = ReplacePattern(op_str, "%source1", GetAlphaSourceStr(tev_stage.alpha_source1, tev_stage.alpha_modifier1));
-                    op_str = ReplacePattern(op_str, "%source2", GetAlphaSourceStr(tev_stage.alpha_source2, tev_stage.alpha_modifier2));
-                    return   ReplacePattern(op_str, "%source3", GetAlphaSourceStr(tev_stage.alpha_source3, tev_stage.alpha_modifier3));
-                };
-
-        stage_info += "Stage " + std::to_string(index) + ": " + GetColorCombinerStr(tev_stage) + "   " + GetAlphaCombinerStr(tev_stage) + "\n";
+void DumpTevStageConfig(const std::array<Pica::Regs::TevStageConfig, 6>& stages) {
+    std::string stage_info = "Tev setup:\n";
+    for (size_t index = 0; index < stages.size(); ++index) {
+        const auto& tev_stage = stages[index];
+        stage_info += "Stage " + std::to_string(index) + ": " + GetTevStageConfigColorCombinerString(tev_stage) + "   " + GetTevStageConfigAlphaCombinerString(tev_stage) + "\n";
     }
-
     LOG_TRACE(HW_GPU, "%s", stage_info.c_str());
 }
 
diff --git a/src/video_core/debug_utils/debug_utils.h b/src/video_core/debug_utils/debug_utils.h
index f628292a4..92e9734ae 100644
--- a/src/video_core/debug_utils/debug_utils.h
+++ b/src/video_core/debug_utils/debug_utils.h
@@ -224,7 +224,11 @@ const Math::Vec4<u8> LookupTexture(const u8* source, int s, int t, const Texture
 
 void DumpTexture(const Pica::Regs::TextureConfig& texture_config, u8* data);
 
-void DumpTevStageConfig(const std::array<Pica::Regs::TevStageConfig,6>& stages);
+std::string GetTevStageConfigColorCombinerString(const Pica::Regs::TevStageConfig& tev_stage);
+std::string GetTevStageConfigAlphaCombinerString(const Pica::Regs::TevStageConfig& tev_stage);
+
+/// Dumps the Tev stage config to log at trace level
+void DumpTevStageConfig(const std::array<Pica::Regs::TevStageConfig, 6>& stages);
 
 /**
  * Used in the vertex loader to merge access records. TODO: Investigate if actually useful.
diff --git a/src/video_core/pica.cpp b/src/video_core/pica.cpp
index be82cf4b5..ec78f9593 100644
--- a/src/video_core/pica.cpp
+++ b/src/video_core/pica.cpp
@@ -500,7 +500,7 @@ void Init() {
 }
 
 void Shutdown() {
-    Shader::Shutdown();
+    Shader::ClearCache();
 }
 
 template <typename T>
diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index 5891fb72a..544ea037f 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -70,7 +70,7 @@ struct Regs {
     INSERT_PADDING_WORDS(0x9);
 
     BitField<0, 24, u32> viewport_depth_range; // float24
-    BitField<0, 24, u32> viewport_depth_far_plane; // float24
+    BitField<0, 24, u32> viewport_depth_near_plane; // float24
 
     BitField<0, 3, u32> vs_output_total;
 
@@ -122,9 +122,31 @@ struct Regs {
         BitField<16, 10, s32> y;
     } viewport_corner;
 
-    INSERT_PADDING_WORDS(0x17);
+    INSERT_PADDING_WORDS(0x1);
+
+    //TODO: early depth
+    INSERT_PADDING_WORDS(0x1);
+
+    INSERT_PADDING_WORDS(0x2);
+
+    enum DepthBuffering : u32 {
+        WBuffering  = 0,
+        ZBuffering  = 1,
+    };
+    BitField< 0, 1, DepthBuffering> depthmap_enable;
+
+    INSERT_PADDING_WORDS(0x12);
 
     struct TextureConfig {
+        enum TextureType : u32 {
+            Texture2D    = 0,
+            TextureCube  = 1,
+            Shadow2D     = 2,
+            Projection2D = 3,
+            ShadowCube   = 4,
+            Disabled     = 5,
+        };
+
         enum WrapMode : u32 {
             ClampToEdge    = 0,
             ClampToBorder  = 1,
@@ -155,6 +177,7 @@ struct Regs {
             BitField< 2, 1, TextureFilter> min_filter;
             BitField< 8, 2, WrapMode> wrap_t;
             BitField<12, 2, WrapMode> wrap_s;
+            BitField<28, 2, TextureType> type; ///< @note Only valid for texture 0 according to 3DBrew.
         };
 
         INSERT_PADDING_WORDS(0x1);
@@ -764,23 +787,21 @@ struct Regs {
             LightColor diffuse;     // material.diffuse * light.diffuse
             LightColor ambient;     // material.ambient * light.ambient
 
-            struct {
-                // Encoded as 16-bit floating point
-                union {
-                    BitField< 0, 16, u32> x;
-                    BitField<16, 16, u32> y;
-                };
-                union {
-                    BitField< 0, 16, u32> z;
-                };
+            // Encoded as 16-bit floating point
+            union {
+                BitField< 0, 16, u32> x;
+                BitField<16, 16, u32> y;
+            };
+            union {
+                BitField< 0, 16, u32> z;
+            };
 
-                INSERT_PADDING_WORDS(0x3);
+            INSERT_PADDING_WORDS(0x3);
 
-                union {
-                    BitField<0, 1, u32> directional;
-                    BitField<1, 1, u32> two_sided_diffuse; // When disabled, clamp dot-product to 0
-                };
-            };
+            union {
+                BitField<0, 1, u32> directional;
+                BitField<1, 1, u32> two_sided_diffuse; // When disabled, clamp dot-product to 0
+            } config;
 
             BitField<0, 20, u32> dist_atten_bias;
             BitField<0, 20, u32> dist_atten_scale;
@@ -801,7 +822,7 @@ struct Regs {
             BitField<27, 1, u32> clamp_highlights;
             BitField<28, 2, LightingBumpMode> bump_mode;
             BitField<30, 1, u32> disable_bump_renorm;
-        };
+        } config0;
 
         union {
             BitField<16, 1, u32> disable_lut_d0;
@@ -822,13 +843,13 @@ struct Regs {
             BitField<29, 1, u32> disable_dist_atten_light_5;
             BitField<30, 1, u32> disable_dist_atten_light_6;
             BitField<31, 1, u32> disable_dist_atten_light_7;
-        };
+        } config1;
 
         bool IsDistAttenDisabled(unsigned index) const {
-            const unsigned disable[] = { disable_dist_atten_light_0, disable_dist_atten_light_1,
-                                         disable_dist_atten_light_2, disable_dist_atten_light_3,
-                                         disable_dist_atten_light_4, disable_dist_atten_light_5,
-                                         disable_dist_atten_light_6, disable_dist_atten_light_7 };
+            const unsigned disable[] = { config1.disable_dist_atten_light_0, config1.disable_dist_atten_light_1,
+                                         config1.disable_dist_atten_light_2, config1.disable_dist_atten_light_3,
+                                         config1.disable_dist_atten_light_4, config1.disable_dist_atten_light_5,
+                                         config1.disable_dist_atten_light_6, config1.disable_dist_atten_light_7 };
             return disable[index] != 0;
         }
 
@@ -1279,10 +1300,11 @@ ASSERT_REG_POSITION(cull_mode, 0x40);
 ASSERT_REG_POSITION(viewport_size_x, 0x41);
 ASSERT_REG_POSITION(viewport_size_y, 0x43);
 ASSERT_REG_POSITION(viewport_depth_range, 0x4d);
-ASSERT_REG_POSITION(viewport_depth_far_plane, 0x4e);
+ASSERT_REG_POSITION(viewport_depth_near_plane, 0x4e);
 ASSERT_REG_POSITION(vs_output_attributes[0], 0x50);
 ASSERT_REG_POSITION(vs_output_attributes[1], 0x51);
 ASSERT_REG_POSITION(viewport_corner, 0x68);
+ASSERT_REG_POSITION(depthmap_enable, 0x6D);
 ASSERT_REG_POSITION(texture0_enable, 0x80);
 ASSERT_REG_POSITION(texture0, 0x81);
 ASSERT_REG_POSITION(texture0_format, 0x8e);
diff --git a/src/video_core/pica_state.h b/src/video_core/pica_state.h
index bbecad850..495174c25 100644
--- a/src/video_core/pica_state.h
+++ b/src/video_core/pica_state.h
@@ -25,6 +25,8 @@ struct State {
     Shader::ShaderSetup vs;
     Shader::ShaderSetup gs;
 
+    std::array<Math::Vec4<float24>, 16> vs_default_attributes;
+
     struct {
         union LutEntry {
             // Used for raw access
@@ -56,7 +58,7 @@ struct State {
         // Used to buffer partial vertices for immediate-mode rendering.
         Shader::InputVertex input_vertex;
         // Index of the next attribute to be loaded into `input_vertex`.
-        int current_attribute = 0;
+        u32 current_attribute = 0;
     } immediate;
 
     // This is constructed with a dummy triangle topology
diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index df67b9081..65168f05a 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -442,8 +442,33 @@ static void ProcessTriangleInternal(const Shader::OutputVertex& v0,
 
                 DEBUG_ASSERT(0 != texture.config.address);
 
-                int s = (int)(uv[i].u() * float24::FromFloat32(static_cast<float>(texture.config.width))).ToFloat32();
-                int t = (int)(uv[i].v() * float24::FromFloat32(static_cast<float>(texture.config.height))).ToFloat32();
+                float24 u = uv[i].u();
+                float24 v = uv[i].v();
+
+                // Only unit 0 respects the texturing type (according to 3DBrew)
+                // TODO: Refactor so cubemaps and shadowmaps can be handled
+                if (i == 0) {
+                    switch(texture.config.type) {
+                    case Regs::TextureConfig::Texture2D:
+                        break;
+                    case Regs::TextureConfig::Projection2D: {
+                        auto tc0_w = GetInterpolatedAttribute(v0.tc0_w, v1.tc0_w, v2.tc0_w);
+                        u /= tc0_w;
+                        v /= tc0_w;
+                        break;
+                    }
+                    default:
+                        // TODO: Change to LOG_ERROR when more types are handled.
+                        LOG_DEBUG(HW_GPU, "Unhandled texture type %x", (int)texture.config.type);
+                        UNIMPLEMENTED();
+                        break;
+                    }
+                }
+
+                int s = (int)(u * float24::FromFloat32(static_cast<float>(texture.config.width))).ToFloat32();
+                int t = (int)(v * float24::FromFloat32(static_cast<float>(texture.config.height))).ToFloat32();
+
+
                 static auto GetWrappedTexCoord = [](Regs::TextureConfig::WrapMode mode, int val, unsigned size) {
                     switch (mode) {
                         case Regs::TextureConfig::ClampToEdge:
@@ -862,10 +887,30 @@ static void ProcessTriangleInternal(const Shader::OutputVertex& v0,
                 }
             }
 
+            // interpolated_z = z / w
+            float interpolated_z_over_w = (v0.screenpos[2].ToFloat32() * w0 +
+                                           v1.screenpos[2].ToFloat32() * w1 +
+                                           v2.screenpos[2].ToFloat32() * w2) / wsum;
+
+            // Not fully accurate. About 3 bits of precision are missing.
+            // Z-Buffer (z / w * scale + offset)
+            float depth_scale = float24::FromRaw(regs.viewport_depth_range).ToFloat32();
+            float depth_offset = float24::FromRaw(regs.viewport_depth_near_plane).ToFloat32();
+            float depth = interpolated_z_over_w * depth_scale + depth_offset;
+
+            // Potentially switch to W-Buffer
+            if (regs.depthmap_enable == Pica::Regs::DepthBuffering::WBuffering) {
+
+                // W-Buffer (z * scale + w * offset = (z / w * scale + offset) * w)
+                depth *= interpolated_w_inverse.ToFloat32() * wsum;
+            }
+
+            // Clamp the result
+            depth = MathUtil::Clamp(depth, 0.0f, 1.0f);
+
+            // Convert float to integer
             unsigned num_bits = Regs::DepthBitsPerPixel(regs.framebuffer.depth_format);
-            u32 z = (u32)((v0.screenpos[2].ToFloat32() * w0 +
-                           v1.screenpos[2].ToFloat32() * w1 +
-                           v2.screenpos[2].ToFloat32() * w2) * ((1 << num_bits) - 1) / wsum);
+            u32 z = (u32)(depth * ((1 << num_bits) - 1));
 
             if (output_merger.depth_test_enable) {
                 u32 ref_z = GetDepth(x >> 4, y >> 4);
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 519d81aeb..931c34a37 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -76,6 +76,9 @@ RasterizerOpenGL::RasterizerOpenGL() : shader_dirty(true) {
     glEnableVertexAttribArray(GLShader::ATTRIBUTE_TEXCOORD1);
     glEnableVertexAttribArray(GLShader::ATTRIBUTE_TEXCOORD2);
 
+    glVertexAttribPointer(GLShader::ATTRIBUTE_TEXCOORD0_W, 1, GL_FLOAT, GL_FALSE, sizeof(HardwareVertex), (GLvoid*)offsetof(HardwareVertex, tex_coord0_w));
+    glEnableVertexAttribArray(GLShader::ATTRIBUTE_TEXCOORD0_W);
+
     glVertexAttribPointer(GLShader::ATTRIBUTE_NORMQUAT, 4, GL_FLOAT, GL_FALSE, sizeof(HardwareVertex), (GLvoid*)offsetof(HardwareVertex, normquat));
     glEnableVertexAttribArray(GLShader::ATTRIBUTE_NORMQUAT);
 
@@ -93,7 +96,7 @@ RasterizerOpenGL::RasterizerOpenGL() : shader_dirty(true) {
     state.Apply();
 
     for (size_t i = 0; i < lighting_luts.size(); ++i) {
-        glActiveTexture(GL_TEXTURE3 + i);
+        glActiveTexture(static_cast<GLenum>(GL_TEXTURE3 + i));
         glTexImage1D(GL_TEXTURE_1D, 0, GL_RGBA32F, 256, 0, GL_RGBA, GL_FLOAT, nullptr);
         glTexParameteri(GL_TEXTURE_1D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
         glTexParameteri(GL_TEXTURE_1D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
@@ -101,7 +104,6 @@ RasterizerOpenGL::RasterizerOpenGL() : shader_dirty(true) {
 
     // Sync fixed function OpenGL state
     SyncCullMode();
-    SyncDepthModifiers();
     SyncBlendEnabled();
     SyncBlendFuncs();
     SyncBlendColor();
@@ -256,8 +258,15 @@ void RasterizerOpenGL::NotifyPicaRegisterChanged(u32 id) {
 
     // Depth modifiers
     case PICA_REG_INDEX(viewport_depth_range):
-    case PICA_REG_INDEX(viewport_depth_far_plane):
-        SyncDepthModifiers();
+        SyncDepthScale();
+        break;
+    case PICA_REG_INDEX(viewport_depth_near_plane):
+        SyncDepthOffset();
+        break;
+
+    // Depth buffering
+    case PICA_REG_INDEX(depthmap_enable):
+        shader_dirty = true;
         break;
 
     // Blending
@@ -314,6 +323,11 @@ void RasterizerOpenGL::NotifyPicaRegisterChanged(u32 id) {
         SyncLogicOp();
         break;
 
+    // Texture 0 type
+    case PICA_REG_INDEX(texture0.type):
+        shader_dirty = true;
+        break;
+
     // TEV stages
     case PICA_REG_INDEX(tev_stage0.color_source1):
     case PICA_REG_INDEX(tev_stage0.color_modifier1):
@@ -366,6 +380,17 @@ void RasterizerOpenGL::NotifyPicaRegisterChanged(u32 id) {
         SyncCombinerColor();
         break;
 
+    // Fragment lighting switches
+    case PICA_REG_INDEX(lighting.disable):
+    case PICA_REG_INDEX(lighting.num_lights):
+    case PICA_REG_INDEX(lighting.config0):
+    case PICA_REG_INDEX(lighting.config1):
+    case PICA_REG_INDEX(lighting.abs_lut_input):
+    case PICA_REG_INDEX(lighting.lut_input):
+    case PICA_REG_INDEX(lighting.lut_scale):
+    case PICA_REG_INDEX(lighting.light_enable):
+        break;
+
     // Fragment lighting specular 0 color
     case PICA_REG_INDEX_WORKAROUND(lighting.light[0].specular_0, 0x140 + 0 * 0x10):
         SyncLightSpecular0(0);
@@ -504,6 +529,70 @@ void RasterizerOpenGL::NotifyPicaRegisterChanged(u32 id) {
         SyncLightPosition(7);
         break;
 
+    // Fragment lighting light source config
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[0].config, 0x149 + 0 * 0x10):
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[1].config, 0x149 + 1 * 0x10):
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[2].config, 0x149 + 2 * 0x10):
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[3].config, 0x149 + 3 * 0x10):
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[4].config, 0x149 + 4 * 0x10):
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[5].config, 0x149 + 5 * 0x10):
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[6].config, 0x149 + 6 * 0x10):
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[7].config, 0x149 + 7 * 0x10):
+        shader_dirty = true;
+        break;
+
+    // Fragment lighting distance attenuation bias
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[0].dist_atten_bias, 0x014A + 0 * 0x10):
+        SyncLightDistanceAttenuationBias(0);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[1].dist_atten_bias, 0x014A + 1 * 0x10):
+        SyncLightDistanceAttenuationBias(1);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[2].dist_atten_bias, 0x014A + 2 * 0x10):
+        SyncLightDistanceAttenuationBias(2);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[3].dist_atten_bias, 0x014A + 3 * 0x10):
+        SyncLightDistanceAttenuationBias(3);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[4].dist_atten_bias, 0x014A + 4 * 0x10):
+        SyncLightDistanceAttenuationBias(4);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[5].dist_atten_bias, 0x014A + 5 * 0x10):
+        SyncLightDistanceAttenuationBias(5);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[6].dist_atten_bias, 0x014A + 6 * 0x10):
+        SyncLightDistanceAttenuationBias(6);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[7].dist_atten_bias, 0x014A + 7 * 0x10):
+        SyncLightDistanceAttenuationBias(7);
+        break;
+
+    // Fragment lighting distance attenuation scale
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[0].dist_atten_scale, 0x014B + 0 * 0x10):
+        SyncLightDistanceAttenuationScale(0);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[1].dist_atten_scale, 0x014B + 1 * 0x10):
+        SyncLightDistanceAttenuationScale(1);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[2].dist_atten_scale, 0x014B + 2 * 0x10):
+        SyncLightDistanceAttenuationScale(2);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[3].dist_atten_scale, 0x014B + 3 * 0x10):
+        SyncLightDistanceAttenuationScale(3);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[4].dist_atten_scale, 0x014B + 4 * 0x10):
+        SyncLightDistanceAttenuationScale(4);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[5].dist_atten_scale, 0x014B + 5 * 0x10):
+        SyncLightDistanceAttenuationScale(5);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[6].dist_atten_scale, 0x014B + 6 * 0x10):
+        SyncLightDistanceAttenuationScale(6);
+        break;
+    case PICA_REG_INDEX_WORKAROUND(lighting.light[7].dist_atten_scale, 0x014B + 7 * 0x10):
+        SyncLightDistanceAttenuationScale(7);
+        break;
+
     // Fragment lighting global ambient color (emission + ambient * ambient)
     case PICA_REG_INDEX_WORKAROUND(lighting.global_ambient, 0x1c0):
         SyncGlobalAmbient();
@@ -867,6 +956,8 @@ void RasterizerOpenGL::SetShader() {
         glUniformBlockBinding(current_shader->shader.handle, block_index, 0);
 
         // Update uniforms
+        SyncDepthScale();
+        SyncDepthOffset();
         SyncAlphaTest();
         SyncCombinerColor();
         auto& tev_stages = Pica::g_state.regs.GetTevStages();
@@ -880,6 +971,8 @@ void RasterizerOpenGL::SetShader() {
             SyncLightDiffuse(light_index);
             SyncLightAmbient(light_index);
             SyncLightPosition(light_index);
+            SyncLightDistanceAttenuationBias(light_index);
+            SyncLightDistanceAttenuationScale(light_index);
         }
     }
 }
@@ -909,13 +1002,20 @@ void RasterizerOpenGL::SyncCullMode() {
     }
 }
 
-void RasterizerOpenGL::SyncDepthModifiers() {
-    float depth_scale = -Pica::float24::FromRaw(Pica::g_state.regs.viewport_depth_range).ToFloat32();
-    float depth_offset = Pica::float24::FromRaw(Pica::g_state.regs.viewport_depth_far_plane).ToFloat32() / 2.0f;
+void RasterizerOpenGL::SyncDepthScale() { // Copies the PICA viewport_depth_range register into the uniform block data
+    float depth_scale = Pica::float24::FromRaw(Pica::g_state.regs.viewport_depth_range).ToFloat32(); // raw register bits decoded through float24, then converted to a 32-bit float
+    if (depth_scale != uniform_block_data.data.depth_scale) { // leave the block untouched when the value did not change
+        uniform_block_data.data.depth_scale = depth_scale;
+        uniform_block_data.dirty = true; // NOTE(review): presumably makes the uniform data get re-uploaded later — confirm at the upload site
+    }
+}
 
-    // TODO: Implement scale modifier
-    uniform_block_data.data.depth_offset = depth_offset;
-    uniform_block_data.dirty = true;
+void RasterizerOpenGL::SyncDepthOffset() { // Copies the PICA viewport_depth_near_plane register into the uniform block data
+    float depth_offset = Pica::float24::FromRaw(Pica::g_state.regs.viewport_depth_near_plane).ToFloat32(); // raw register bits decoded through float24, then converted to a 32-bit float
+    if (depth_offset != uniform_block_data.data.depth_offset) { // leave the block untouched when the value did not change
+        uniform_block_data.data.depth_offset = depth_offset;
+        uniform_block_data.dirty = true; // NOTE(review): presumably makes the uniform data get re-uploaded later — confirm at the upload site
+    }
 }
 
 void RasterizerOpenGL::SyncBlendEnabled() {
@@ -924,6 +1024,8 @@ void RasterizerOpenGL::SyncBlendEnabled() {
 
 void RasterizerOpenGL::SyncBlendFuncs() {
     const auto& regs = Pica::g_state.regs;
+    state.blend.rgb_equation = PicaToGL::BlendEquation(regs.output_merger.alpha_blending.blend_equation_rgb);
+    state.blend.a_equation = PicaToGL::BlendEquation(regs.output_merger.alpha_blending.blend_equation_a);
     state.blend.src_rgb_func = PicaToGL::BlendFunc(regs.output_merger.alpha_blending.factor_source_rgb);
     state.blend.dst_rgb_func = PicaToGL::BlendFunc(regs.output_merger.alpha_blending.factor_dest_rgb);
     state.blend.src_a_func = PicaToGL::BlendFunc(regs.output_merger.alpha_blending.factor_source_a);
@@ -1080,3 +1182,21 @@ void RasterizerOpenGL::SyncLightPosition(int light_index) {
         uniform_block_data.dirty = true;
     }
 }
+
+void RasterizerOpenGL::SyncLightDistanceAttenuationBias(int light_index) { // Copies light[light_index].dist_atten_bias from the PICA registers into the uniform block data
+    GLfloat dist_atten_bias = Pica::float20::FromRaw(Pica::g_state.regs.lighting.light[light_index].dist_atten_bias).ToFloat32(); // register is decoded through float20 (not float24)
+
+    if (dist_atten_bias != uniform_block_data.data.light_src[light_index].dist_atten_bias) { // skip the write when the value is unchanged
+        uniform_block_data.data.light_src[light_index].dist_atten_bias = dist_atten_bias;
+        uniform_block_data.dirty = true; // one dirty flag covers the whole uniform block, not just this light
+    }
+}
+
+void RasterizerOpenGL::SyncLightDistanceAttenuationScale(int light_index) { // Copies light[light_index].dist_atten_scale from the PICA registers into the uniform block data
+    GLfloat dist_atten_scale = Pica::float20::FromRaw(Pica::g_state.regs.lighting.light[light_index].dist_atten_scale).ToFloat32(); // register is decoded through float20 (not float24)
+
+    if (dist_atten_scale != uniform_block_data.data.light_src[light_index].dist_atten_scale) { // skip the write when the value is unchanged
+        uniform_block_data.data.light_src[light_index].dist_atten_scale = dist_atten_scale;
+        uniform_block_data.dirty = true; // one dirty flag covers the whole uniform block, not just this light
+    }
+}
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index 82fa61742..bb7f20161 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -39,140 +39,181 @@ struct ScreenInfo;
  * directly accessing Pica registers. This should reduce the risk of bugs in shader generation where
  * Pica state is not being captured in the shader cache key, thereby resulting in (what should be)
  * two separate shaders sharing the same key.
+ *
+ * We use a union because the C++ standard says the "implicitly-defined copy/move constructor for a union X copies the
+ * object representation of X" and the "implicitly-defined copy assignment operator for a union X copies the object
+ * representation (3.9) of X", i.e. copies are bytewise rather than memberwise.
+ * This is important because the padding bytes are included in the hash and in comparisons between objects.
  */
-struct PicaShaderConfig {
+union PicaShaderConfig {
+
     /// Construct a PicaShaderConfig with the current Pica register configuration.
     static PicaShaderConfig CurrentConfig() {
         PicaShaderConfig res;
+
+        auto& state = res.state; // every key field lives in the union's State member
+        std::memset(&state, 0, sizeof(PicaShaderConfig::State)); // zero all bytes of State, padding included, before any field is filled in
+
         const auto& regs = Pica::g_state.regs;
 
-        res.alpha_test_func = regs.output_merger.alpha_test.enable ?
+        state.depthmap_enable = regs.depthmap_enable; // depth buffering mode is part of the key
+
+        state.alpha_test_func = regs.output_merger.alpha_test.enable ? // a disabled alpha test is keyed as CompareFunc::Always
             regs.output_merger.alpha_test.func.Value() : Pica::Regs::CompareFunc::Always;
 
+        state.texture0_type = regs.texture0.type; // texture unit 0's type is part of the key
+
         // Copy relevant tev stages fields.
         // We don't sync const_color here because of the high variance, it is a
         // shader uniform instead.
         const auto& tev_stages = regs.GetTevStages();
-        DEBUG_ASSERT(res.tev_stages.size() == tev_stages.size());
+        DEBUG_ASSERT(state.tev_stages.size() == tev_stages.size());
         for (size_t i = 0; i < tev_stages.size(); i++) {
             const auto& tev_stage = tev_stages[i];
-            res.tev_stages[i].sources_raw = tev_stage.sources_raw;
-            res.tev_stages[i].modifiers_raw = tev_stage.modifiers_raw;
-            res.tev_stages[i].ops_raw = tev_stage.ops_raw;
-            res.tev_stages[i].scales_raw = tev_stage.scales_raw;
+            state.tev_stages[i].sources_raw = tev_stage.sources_raw; // only the four raw words are copied per stage
+            state.tev_stages[i].modifiers_raw = tev_stage.modifiers_raw;
+            state.tev_stages[i].ops_raw = tev_stage.ops_raw;
+            state.tev_stages[i].scales_raw = tev_stage.scales_raw;
         }
 
-        res.combiner_buffer_input =
+        state.combiner_buffer_input = // RGB update mask in the low bits, alpha update mask shifted up by 4
             regs.tev_combiner_buffer_input.update_mask_rgb.Value() |
             regs.tev_combiner_buffer_input.update_mask_a.Value() << 4;
 
         // Fragment lighting
 
-        res.lighting.enable = !regs.lighting.disable;
-        res.lighting.src_num = regs.lighting.num_lights + 1;
+        state.lighting.enable = !regs.lighting.disable; // register is a disable flag, the key stores the inverse
+        state.lighting.src_num = regs.lighting.num_lights + 1; // NOTE(review): presumably the register stores (count - 1) — confirm
 
-        for (unsigned light_index = 0; light_index < res.lighting.src_num; ++light_index) {
+        for (unsigned light_index = 0; light_index < state.lighting.src_num; ++light_index) { // slots at src_num and beyond keep the zeroes written by the memset
             unsigned num = regs.lighting.light_enable.GetNum(light_index);
             const auto& light = regs.lighting.light[num];
-            res.lighting.light[light_index].num = num;
-            res.lighting.light[light_index].directional = light.directional != 0;
-            res.lighting.light[light_index].two_sided_diffuse = light.two_sided_diffuse != 0;
-            res.lighting.light[light_index].dist_atten_enable = !regs.lighting.IsDistAttenDisabled(num);
-            res.lighting.light[light_index].dist_atten_bias = Pica::float20::FromRaw(light.dist_atten_bias).ToFloat32();
-            res.lighting.light[light_index].dist_atten_scale = Pica::float20::FromRaw(light.dist_atten_scale).ToFloat32();
+            state.lighting.light[light_index].num = num; // index into regs.lighting.light chosen by light_enable for this slot
+            state.lighting.light[light_index].directional = light.config.directional != 0;
+            state.lighting.light[light_index].two_sided_diffuse = light.config.two_sided_diffuse != 0;
+            state.lighting.light[light_index].dist_atten_enable = !regs.lighting.IsDistAttenDisabled(num); // only the enable flag is keyed; the bias and scale values are not stored in the key
         }
 
-        res.lighting.lut_d0.enable = regs.lighting.disable_lut_d0 == 0;
-        res.lighting.lut_d0.abs_input = regs.lighting.abs_lut_input.disable_d0 == 0;
-        res.lighting.lut_d0.type = regs.lighting.lut_input.d0.Value();
-        res.lighting.lut_d0.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.d0);
-
-        res.lighting.lut_d1.enable = regs.lighting.disable_lut_d1 == 0;
-        res.lighting.lut_d1.abs_input = regs.lighting.abs_lut_input.disable_d1 == 0;
-        res.lighting.lut_d1.type = regs.lighting.lut_input.d1.Value();
-        res.lighting.lut_d1.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.d1);
-
-        res.lighting.lut_fr.enable = regs.lighting.disable_lut_fr == 0;
-        res.lighting.lut_fr.abs_input = regs.lighting.abs_lut_input.disable_fr == 0;
-        res.lighting.lut_fr.type = regs.lighting.lut_input.fr.Value();
-        res.lighting.lut_fr.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.fr);
-
-        res.lighting.lut_rr.enable = regs.lighting.disable_lut_rr == 0;
-        res.lighting.lut_rr.abs_input = regs.lighting.abs_lut_input.disable_rr == 0;
-        res.lighting.lut_rr.type = regs.lighting.lut_input.rr.Value();
-        res.lighting.lut_rr.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.rr);
-
-        res.lighting.lut_rg.enable = regs.lighting.disable_lut_rg == 0;
-        res.lighting.lut_rg.abs_input = regs.lighting.abs_lut_input.disable_rg == 0;
-        res.lighting.lut_rg.type = regs.lighting.lut_input.rg.Value();
-        res.lighting.lut_rg.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.rg);
-
-        res.lighting.lut_rb.enable = regs.lighting.disable_lut_rb == 0;
-        res.lighting.lut_rb.abs_input = regs.lighting.abs_lut_input.disable_rb == 0;
-        res.lighting.lut_rb.type = regs.lighting.lut_input.rb.Value();
-        res.lighting.lut_rb.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.rb);
-
-        res.lighting.config = regs.lighting.config;
-        res.lighting.fresnel_selector = regs.lighting.fresnel_selector;
-        res.lighting.bump_mode = regs.lighting.bump_mode;
-        res.lighting.bump_selector = regs.lighting.bump_selector;
-        res.lighting.bump_renorm = regs.lighting.disable_bump_renorm == 0;
-        res.lighting.clamp_highlights = regs.lighting.clamp_highlights != 0;
+        state.lighting.lut_d0.enable = regs.lighting.config1.disable_lut_d0 == 0; // stored inverted: a clear disable bit in config1 means the LUT is enabled
+        state.lighting.lut_d0.abs_input = regs.lighting.abs_lut_input.disable_d0 == 0; // stored inverted as well
+        state.lighting.lut_d0.type = regs.lighting.lut_input.d0.Value();
+        state.lighting.lut_d0.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.d0);
+
+        state.lighting.lut_d1.enable = regs.lighting.config1.disable_lut_d1 == 0; // the five blocks below repeat the d0 pattern for d1, fr, rr, rg and rb
+        state.lighting.lut_d1.abs_input = regs.lighting.abs_lut_input.disable_d1 == 0;
+        state.lighting.lut_d1.type = regs.lighting.lut_input.d1.Value();
+        state.lighting.lut_d1.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.d1);
+
+        state.lighting.lut_fr.enable = regs.lighting.config1.disable_lut_fr == 0;
+        state.lighting.lut_fr.abs_input = regs.lighting.abs_lut_input.disable_fr == 0;
+        state.lighting.lut_fr.type = regs.lighting.lut_input.fr.Value();
+        state.lighting.lut_fr.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.fr);
+
+        state.lighting.lut_rr.enable = regs.lighting.config1.disable_lut_rr == 0;
+        state.lighting.lut_rr.abs_input = regs.lighting.abs_lut_input.disable_rr == 0;
+        state.lighting.lut_rr.type = regs.lighting.lut_input.rr.Value();
+        state.lighting.lut_rr.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.rr);
+
+        state.lighting.lut_rg.enable = regs.lighting.config1.disable_lut_rg == 0;
+        state.lighting.lut_rg.abs_input = regs.lighting.abs_lut_input.disable_rg == 0;
+        state.lighting.lut_rg.type = regs.lighting.lut_input.rg.Value();
+        state.lighting.lut_rg.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.rg);
+
+        state.lighting.lut_rb.enable = regs.lighting.config1.disable_lut_rb == 0;
+        state.lighting.lut_rb.abs_input = regs.lighting.abs_lut_input.disable_rb == 0;
+        state.lighting.lut_rb.type = regs.lighting.lut_input.rb.Value();
+        state.lighting.lut_rb.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.rb);
+
+        state.lighting.config = regs.lighting.config0.config; // the remaining lighting switches are all read from the config0 register
+        state.lighting.fresnel_selector = regs.lighting.config0.fresnel_selector;
+        state.lighting.bump_mode = regs.lighting.config0.bump_mode;
+        state.lighting.bump_selector = regs.lighting.config0.bump_selector;
+        state.lighting.bump_renorm = regs.lighting.config0.disable_bump_renorm == 0; // stored inverted: true when the disable bit is clear
+        state.lighting.clamp_highlights = regs.lighting.config0.clamp_highlights != 0;
 
         return res;
     }
 
     bool TevStageUpdatesCombinerBufferColor(unsigned stage_index) const {
-        return (stage_index < 4) && (combiner_buffer_input & (1 << stage_index));
+        return (stage_index < 4) && (state.combiner_buffer_input & (1 << stage_index)); // always false for stage_index >= 4; otherwise tests bit stage_index of the low four bits
     }
 
     bool TevStageUpdatesCombinerBufferAlpha(unsigned stage_index) const {
-        return (stage_index < 4) && ((combiner_buffer_input >> 4) & (1 << stage_index));
+        return (stage_index < 4) && ((state.combiner_buffer_input >> 4) & (1 << stage_index)); // always false for stage_index >= 4; otherwise tests bit stage_index of the bits above the low four
     }
 
     bool operator ==(const PicaShaderConfig& o) const {
-        return std::memcmp(this, &o, sizeof(PicaShaderConfig)) == 0;
+        return std::memcmp(&state, &o.state, sizeof(PicaShaderConfig::State)) == 0; // bytewise comparison over State only, so padding bytes take part in the result
+    };
+
+    // NOTE: MSVC15 (Update 2) doesn't consider types with `delete`'d constructors and operators trivially copyable (TC).
+    //       This makes BitField not TC when used in a union or struct so we have to resort
+    //       to this ugly hack.
+    //       Once that bug is fixed we can use Pica::Regs::TevStageConfig here.
+    //       Doesn't include const_color because we don't sync it, see comment in CurrentConfig()
+    struct TevStageConfigRaw { // plain-u32 copy of the TevStageConfig words that are part of the key (no const_color)
+        u32 sources_raw;
+        u32 modifiers_raw;
+        u32 ops_raw;
+        u32 scales_raw;
+        explicit operator Pica::Regs::TevStageConfig() const noexcept { // explicit conversion back to a full TevStageConfig built from the stored raw words
+            Pica::Regs::TevStageConfig stage;
+            stage.sources_raw = sources_raw;
+            stage.modifiers_raw = modifiers_raw;
+            stage.ops_raw = ops_raw;
+            stage.const_color = 0; // not stored in this struct, so the converted stage always gets zero here
+            stage.scales_raw = scales_raw;
+            return stage;
+        }
     };
 
-    Pica::Regs::CompareFunc alpha_test_func = Pica::Regs::CompareFunc::Never;
-    std::array<Pica::Regs::TevStageConfig, 6> tev_stages = {};
-    u8 combiner_buffer_input = 0;
+    struct State { // payload of the key: the bytes of this struct are what CurrentConfig() zeroes and what the hash and operator== read
 
-    struct {
-        struct {
-            unsigned num = 0;
-            bool directional = false;
-            bool two_sided_diffuse = false;
-            bool dist_atten_enable = false;
-            GLfloat dist_atten_scale = 0.0f;
-            GLfloat dist_atten_bias = 0.0f;
-        } light[8];
-
-        bool enable = false;
-        unsigned src_num = 0;
-        Pica::Regs::LightingBumpMode bump_mode = Pica::Regs::LightingBumpMode::None;
-        unsigned bump_selector = 0;
-        bool bump_renorm = false;
-        bool clamp_highlights = false;
-
-        Pica::Regs::LightingConfig config = Pica::Regs::LightingConfig::Config0;
-        Pica::Regs::LightingFresnelSelector fresnel_selector = Pica::Regs::LightingFresnelSelector::None;
+        Pica::Regs::CompareFunc alpha_test_func; // no default member initializers in State; CurrentConfig() fills every field after a memset
+        Pica::Regs::TextureConfig::TextureType texture0_type;
+        std::array<TevStageConfigRaw, 6> tev_stages; // raw words only, see TevStageConfigRaw
+        u8 combiner_buffer_input; // RGB update mask in the low bits, alpha update mask shifted up by 4
+
+        Pica::Regs::DepthBuffering depthmap_enable;
 
         struct {
-            bool enable = false;
-            bool abs_input = false;
-            Pica::Regs::LightingLutInput type = Pica::Regs::LightingLutInput::NH;
-            float scale = 1.0f;
-        } lut_d0, lut_d1, lut_fr, lut_rr, lut_rg, lut_rb;
-    } lighting;
+            struct {
+                unsigned num; // index into regs.lighting.light used for this slot
+                bool directional;
+                bool two_sided_diffuse;
+                bool dist_atten_enable; // flag only; distance attenuation bias and scale are not members of the key
+            } light[8];
+
+            bool enable;
+            unsigned src_num; // set to regs.lighting.num_lights + 1 by CurrentConfig()
+            Pica::Regs::LightingBumpMode bump_mode;
+            unsigned bump_selector;
+            bool bump_renorm;
+            bool clamp_highlights;
+
+            Pica::Regs::LightingConfig config;
+            Pica::Regs::LightingFresnelSelector fresnel_selector;
+
+            struct {
+                bool enable;
+                bool abs_input;
+                Pica::Regs::LightingLutInput type;
+                float scale; // value returned by lut_scale.GetScale() for this LUT
+            } lut_d0, lut_d1, lut_fr, lut_rr, lut_rg, lut_rb;
+        } lighting;
+
+    } state;
 };
+#if (__GNUC__ >= 5) || defined(__clang__) || defined(_MSC_VER)
+static_assert(std::is_trivially_copyable<PicaShaderConfig::State>::value, "PicaShaderConfig::State must be trivially copyable");
+#endif
 
 namespace std {
 
 template <>
 struct hash<PicaShaderConfig> {
     size_t operator()(const PicaShaderConfig& k) const {
-        return Common::ComputeHash64(&k, sizeof(PicaShaderConfig));
+        return Common::ComputeHash64(&k.state, sizeof(PicaShaderConfig::State)); // hashes the bytes of State, the same byte range operator== compares
     }
 };
 
@@ -239,6 +280,7 @@ private:
             tex_coord1[1] = v.tc1.y.ToFloat32();
             tex_coord2[0] = v.tc2.x.ToFloat32();
             tex_coord2[1] = v.tc2.y.ToFloat32();
+            tex_coord0_w = v.tc0_w.ToFloat32();
             normquat[0] = v.quat.x.ToFloat32();
             normquat[1] = v.quat.y.ToFloat32();
             normquat[2] = v.quat.z.ToFloat32();
@@ -259,6 +301,7 @@ private:
         GLfloat tex_coord0[2];
         GLfloat tex_coord1[2];
         GLfloat tex_coord2[2];
+        GLfloat tex_coord0_w;
         GLfloat normquat[4];
         GLfloat view[3];
     };
@@ -269,6 +312,8 @@ private:
         alignas(16) GLvec3 diffuse;
         alignas(16) GLvec3 ambient;
         alignas(16) GLvec3 position;
+        GLfloat dist_atten_bias;
+        GLfloat dist_atten_scale;
     };
 
     /// Uniform structure for the Uniform Buffer Object, all members must be 16-byte aligned
@@ -277,12 +322,13 @@ private:
         GLvec4 const_color[6];
         GLvec4 tev_combiner_buffer_color;
         GLint alphatest_ref;
+        GLfloat depth_scale;
         GLfloat depth_offset;
         alignas(16) GLvec3 lighting_global_ambient;
         LightSrc light_src[8];
     };
 
-    static_assert(sizeof(UniformData) == 0x310, "The size of the UniformData structure has changed, update the structure in the shader");
+    static_assert(sizeof(UniformData) == 0x390, "The size of the UniformData structure has changed, update the structure in the shader");
     static_assert(sizeof(UniformData) < 16384, "UniformData structure must be less than 16kb as per the OpenGL spec");
 
     /// Sets the OpenGL shader in accordance with the current PICA register state
@@ -291,8 +337,11 @@ private:
     /// Syncs the cull mode to match the PICA register
     void SyncCullMode();
 
-    /// Syncs the depth scale and offset to match the PICA registers
-    void SyncDepthModifiers();
+    /// Syncs the depth scale to match the PICA register
+    void SyncDepthScale();
+
+    /// Syncs the depth offset to match the PICA register
+    void SyncDepthOffset();
 
     /// Syncs the blend enabled status to match the PICA register
     void SyncBlendEnabled();
@@ -351,6 +400,12 @@ private:
     /// Syncs the specified light's position to match the PICA register
     void SyncLightPosition(int light_index);
 
+    /// Syncs the specified light's distance attenuation bias to match the PICA register
+    void SyncLightDistanceAttenuationBias(int light_index);
+
+    /// Syncs the specified light's distance attenuation scale to match the PICA register
+    void SyncLightDistanceAttenuationScale(int light_index);
+
     OpenGLState state;
 
     RasterizerCacheOpenGL res_cache;
@@ -365,7 +420,7 @@ private:
         UniformData data;
         bool lut_dirty[6];
         bool dirty;
-    } uniform_block_data;
+    } uniform_block_data = {};
 
     std::array<SamplerInfo, 3> texture_samplers;
     OGLVertexArray vertex_array;
@@ -374,5 +429,5 @@ private:
     OGLFramebuffer framebuffer;
 
     std::array<OGLTexture, 6> lighting_luts;
-    std::array<std::array<GLvec4, 256>, 6> lighting_lut_data;
+    std::array<std::array<GLvec4, 256>, 6> lighting_lut_data{};
 };
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp
index 9011caa39..8332e722d 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp
@@ -32,8 +32,9 @@ static bool IsPassThroughTevStage(const TevStageConfig& stage) {
 }
 
 /// Writes the specified TEV stage source component(s)
-static void AppendSource(std::string& out, TevStageConfig::Source source,
+static void AppendSource(std::string& out, const PicaShaderConfig& config, TevStageConfig::Source source,
         const std::string& index_name) {
+    const auto& state = config.state;
     using Source = TevStageConfig::Source;
     switch (source) {
     case Source::PrimaryColor:
@@ -46,7 +47,20 @@ static void AppendSource(std::string& out, TevStageConfig::Source source,
         out += "secondary_fragment_color";
         break;
     case Source::Texture0:
-        out += "texture(tex[0], texcoord[0])";
+        // Only unit 0 respects the texturing type (according to 3DBrew)
+        switch(state.texture0_type) {
+        case Pica::Regs::TextureConfig::Texture2D:
+            out += "texture(tex[0], texcoord[0])";
+            break;
+        case Pica::Regs::TextureConfig::Projection2D:
+            out += "textureProj(tex[0], vec3(texcoord[0], texcoord0_w))";
+            break;
+        default:
+            out += "texture(tex[0], texcoord[0])";
+            LOG_CRITICAL(HW_GPU, "Unhandled texture type %x", static_cast<int>(state.texture0_type));
+            UNIMPLEMENTED();
+            break;
+        }
         break;
     case Source::Texture1:
         out += "texture(tex[1], texcoord[1])";
@@ -71,53 +85,53 @@ static void AppendSource(std::string& out, TevStageConfig::Source source,
 }
 
 /// Writes the color components to use for the specified TEV stage color modifier
-static void AppendColorModifier(std::string& out, TevStageConfig::ColorModifier modifier,
+static void AppendColorModifier(std::string& out, const PicaShaderConfig& config, TevStageConfig::ColorModifier modifier,
         TevStageConfig::Source source, const std::string& index_name) {
     using ColorModifier = TevStageConfig::ColorModifier;
     switch (modifier) {
     case ColorModifier::SourceColor:
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".rgb";
         break;
     case ColorModifier::OneMinusSourceColor:
         out += "vec3(1.0) - ";
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".rgb";
         break;
     case ColorModifier::SourceAlpha:
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".aaa";
         break;
     case ColorModifier::OneMinusSourceAlpha:
         out += "vec3(1.0) - ";
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".aaa";
         break;
     case ColorModifier::SourceRed:
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".rrr";
         break;
     case ColorModifier::OneMinusSourceRed:
         out += "vec3(1.0) - ";
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".rrr";
         break;
     case ColorModifier::SourceGreen:
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".ggg";
         break;
     case ColorModifier::OneMinusSourceGreen:
         out += "vec3(1.0) - ";
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".ggg";
         break;
     case ColorModifier::SourceBlue:
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".bbb";
         break;
     case ColorModifier::OneMinusSourceBlue:
         out += "vec3(1.0) - ";
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".bbb";
         break;
     default:
@@ -128,44 +142,44 @@ static void AppendColorModifier(std::string& out, TevStageConfig::ColorModifier
 }
 
 /// Writes the alpha component to use for the specified TEV stage alpha modifier
-static void AppendAlphaModifier(std::string& out, TevStageConfig::AlphaModifier modifier,
+static void AppendAlphaModifier(std::string& out, const PicaShaderConfig& config, TevStageConfig::AlphaModifier modifier,
         TevStageConfig::Source source, const std::string& index_name) {
     using AlphaModifier = TevStageConfig::AlphaModifier;
     switch (modifier) {
     case AlphaModifier::SourceAlpha:
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".a";
         break;
     case AlphaModifier::OneMinusSourceAlpha:
         out += "1.0 - ";
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".a";
         break;
     case AlphaModifier::SourceRed:
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".r";
         break;
     case AlphaModifier::OneMinusSourceRed:
         out += "1.0 - ";
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".r";
         break;
     case AlphaModifier::SourceGreen:
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".g";
         break;
     case AlphaModifier::OneMinusSourceGreen:
         out += "1.0 - ";
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".g";
         break;
     case AlphaModifier::SourceBlue:
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".b";
         break;
     case AlphaModifier::OneMinusSourceBlue:
         out += "1.0 - ";
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".b";
         break;
     default:
@@ -287,16 +301,16 @@ static void AppendAlphaTestCondition(std::string& out, Regs::CompareFunc func) {
 
 /// Writes the code to emulate the specified TEV stage
 static void WriteTevStage(std::string& out, const PicaShaderConfig& config, unsigned index) {
-    auto& stage = config.tev_stages[index];
+    const auto stage = static_cast<const Pica::Regs::TevStageConfig>(config.state.tev_stages[index]);
     if (!IsPassThroughTevStage(stage)) {
         std::string index_name = std::to_string(index);
 
         out += "vec3 color_results_" + index_name + "[3] = vec3[3](";
-        AppendColorModifier(out, stage.color_modifier1, stage.color_source1, index_name);
+        AppendColorModifier(out, config, stage.color_modifier1, stage.color_source1, index_name);
         out += ", ";
-        AppendColorModifier(out, stage.color_modifier2, stage.color_source2, index_name);
+        AppendColorModifier(out, config, stage.color_modifier2, stage.color_source2, index_name);
         out += ", ";
-        AppendColorModifier(out, stage.color_modifier3, stage.color_source3, index_name);
+        AppendColorModifier(out, config, stage.color_modifier3, stage.color_source3, index_name);
         out += ");\n";
 
         out += "vec3 color_output_" + index_name + " = ";
@@ -304,11 +318,11 @@ static void WriteTevStage(std::string& out, const PicaShaderConfig& config, unsi
         out += ";\n";
 
         out += "float alpha_results_" + index_name + "[3] = float[3](";
-        AppendAlphaModifier(out, stage.alpha_modifier1, stage.alpha_source1, index_name);
+        AppendAlphaModifier(out, config, stage.alpha_modifier1, stage.alpha_source1, index_name);
         out += ", ";
-        AppendAlphaModifier(out, stage.alpha_modifier2, stage.alpha_source2, index_name);
+        AppendAlphaModifier(out, config, stage.alpha_modifier2, stage.alpha_source2, index_name);
         out += ", ";
-        AppendAlphaModifier(out, stage.alpha_modifier3, stage.alpha_source3, index_name);
+        AppendAlphaModifier(out, config, stage.alpha_modifier3, stage.alpha_source3, index_name);
         out += ");\n";
 
         out += "float alpha_output_" + index_name + " = ";
@@ -331,6 +345,8 @@ static void WriteTevStage(std::string& out, const PicaShaderConfig& config, unsi
 
 /// Writes the code to emulate fragment lighting
 static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
+    const auto& lighting = config.state.lighting;
+
     // Define lighting globals
     out += "vec4 diffuse_sum = vec4(0.0, 0.0, 0.0, 1.0);\n"
            "vec4 specular_sum = vec4(0.0, 0.0, 0.0, 1.0);\n"
@@ -338,17 +354,17 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
            "vec3 refl_value = vec3(0.0);\n";
 
     // Compute fragment normals
-    if (config.lighting.bump_mode == Pica::Regs::LightingBumpMode::NormalMap) {
+    if (lighting.bump_mode == Pica::Regs::LightingBumpMode::NormalMap) {
         // Bump mapping is enabled using a normal map, read perturbation vector from the selected texture
-        std::string bump_selector = std::to_string(config.lighting.bump_selector);
+        std::string bump_selector = std::to_string(lighting.bump_selector);
         out += "vec3 surface_normal = 2.0 * texture(tex[" + bump_selector + "], texcoord[" + bump_selector + "]).rgb - 1.0;\n";
 
         // Recompute Z-component of perturbation if 'renorm' is enabled, this provides a higher precision result
-        if (config.lighting.bump_renorm) {
+        if (lighting.bump_renorm) {
             std::string val = "(1.0 - (surface_normal.x*surface_normal.x + surface_normal.y*surface_normal.y))";
             out += "surface_normal.z = sqrt(max(" + val + ", 0.0));\n";
         }
-    } else if (config.lighting.bump_mode == Pica::Regs::LightingBumpMode::TangentMap) {
+    } else if (lighting.bump_mode == Pica::Regs::LightingBumpMode::TangentMap) {
         // Bump mapping is enabled using a tangent map
         LOG_CRITICAL(HW_GPU, "unimplemented bump mapping mode (tangent mapping)");
         UNIMPLEMENTED();
@@ -361,7 +377,7 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
     out += "vec3 normal = normalize(quaternion_rotate(normquat, surface_normal));\n";
 
     // Gets the index into the specified lookup table for specular lighting
-    auto GetLutIndex = [config](unsigned light_num, Regs::LightingLutInput input, bool abs) {
+    auto GetLutIndex = [&lighting](unsigned light_num, Regs::LightingLutInput input, bool abs) {
         const std::string half_angle = "normalize(normalize(view) + light_vector)";
         std::string index;
         switch (input) {
@@ -389,7 +405,7 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
 
         if (abs) {
             // LUT index is in the range of (0.0, 1.0)
-            index = config.lighting.light[light_num].two_sided_diffuse ? "abs(" + index + ")" : "max(" + index + ", 0.f)";
+            index = lighting.light[light_num].two_sided_diffuse ? "abs(" + index + ")" : "max(" + index + ", 0.f)";
             return "(FLOAT_255 * clamp(" + index + ", 0.0, 1.0))";
         } else {
             // LUT index is in the range of (-1.0, 1.0)
@@ -407,8 +423,8 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
     };
 
     // Write the code to emulate each enabled light
-    for (unsigned light_index = 0; light_index < config.lighting.src_num; ++light_index) {
-        const auto& light_config = config.lighting.light[light_index];
+    for (unsigned light_index = 0; light_index < lighting.src_num; ++light_index) {
+        const auto& light_config = lighting.light[light_index];
         std::string light_src = "light_src[" + std::to_string(light_config.num) + "]";
 
         // Compute light vector (directional or positional)
@@ -423,48 +439,46 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
         // If enabled, compute distance attenuation value
         std::string dist_atten = "1.0";
         if (light_config.dist_atten_enable) {
-            std::string scale = std::to_string(light_config.dist_atten_scale);
-            std::string bias = std::to_string(light_config.dist_atten_bias);
-            std::string index = "(" + scale + " * length(-view - " + light_src + ".position) + " + bias + ")";
+            std::string index = "(" + light_src + ".dist_atten_scale * length(-view - " + light_src + ".position) + " + light_src + ".dist_atten_bias)";
             index = "((clamp(" + index + ", 0.0, FLOAT_255)))";
             const unsigned lut_num = ((unsigned)Regs::LightingSampler::DistanceAttenuation + light_config.num);
             dist_atten = GetLutValue((Regs::LightingSampler)lut_num, index);
         }
 
         // If enabled, clamp specular component if lighting result is negative
-        std::string clamp_highlights = config.lighting.clamp_highlights ? "(dot(light_vector, normal) <= 0.0 ? 0.0 : 1.0)" : "1.0";
+        std::string clamp_highlights = lighting.clamp_highlights ? "(dot(light_vector, normal) <= 0.0 ? 0.0 : 1.0)" : "1.0";
 
         // Specular 0 component
         std::string d0_lut_value = "1.0";
-        if (config.lighting.lut_d0.enable && Pica::Regs::IsLightingSamplerSupported(config.lighting.config, Pica::Regs::LightingSampler::Distribution0)) {
+        if (lighting.lut_d0.enable && Pica::Regs::IsLightingSamplerSupported(lighting.config, Pica::Regs::LightingSampler::Distribution0)) {
             // Lookup specular "distribution 0" LUT value
-            std::string index = GetLutIndex(light_config.num, config.lighting.lut_d0.type, config.lighting.lut_d0.abs_input);
-            d0_lut_value = "(" + std::to_string(config.lighting.lut_d0.scale) + " * " + GetLutValue(Regs::LightingSampler::Distribution0, index) + ")";
+            std::string index = GetLutIndex(light_config.num, lighting.lut_d0.type, lighting.lut_d0.abs_input);
+            d0_lut_value = "(" + std::to_string(lighting.lut_d0.scale) + " * " + GetLutValue(Regs::LightingSampler::Distribution0, index) + ")";
         }
         std::string specular_0 = "(" + d0_lut_value + " * " + light_src + ".specular_0)";
 
         // If enabled, lookup ReflectRed value, otherwise, 1.0 is used
-        if (config.lighting.lut_rr.enable && Pica::Regs::IsLightingSamplerSupported(config.lighting.config, Pica::Regs::LightingSampler::ReflectRed)) {
-            std::string index = GetLutIndex(light_config.num, config.lighting.lut_rr.type, config.lighting.lut_rr.abs_input);
-            std::string value = "(" + std::to_string(config.lighting.lut_rr.scale) + " * " + GetLutValue(Regs::LightingSampler::ReflectRed, index) + ")";
+        if (lighting.lut_rr.enable && Pica::Regs::IsLightingSamplerSupported(lighting.config, Pica::Regs::LightingSampler::ReflectRed)) {
+            std::string index = GetLutIndex(light_config.num, lighting.lut_rr.type, lighting.lut_rr.abs_input);
+            std::string value = "(" + std::to_string(lighting.lut_rr.scale) + " * " + GetLutValue(Regs::LightingSampler::ReflectRed, index) + ")";
             out += "refl_value.r = " + value + ";\n";
         } else {
             out += "refl_value.r = 1.0;\n";
         }
 
         // If enabled, lookup ReflectGreen value, otherwise, ReflectRed value is used
-        if (config.lighting.lut_rg.enable && Pica::Regs::IsLightingSamplerSupported(config.lighting.config, Pica::Regs::LightingSampler::ReflectGreen)) {
-            std::string index = GetLutIndex(light_config.num, config.lighting.lut_rg.type, config.lighting.lut_rg.abs_input);
-            std::string value = "(" + std::to_string(config.lighting.lut_rg.scale) + " * " + GetLutValue(Regs::LightingSampler::ReflectGreen, index) + ")";
+        if (lighting.lut_rg.enable && Pica::Regs::IsLightingSamplerSupported(lighting.config, Pica::Regs::LightingSampler::ReflectGreen)) {
+            std::string index = GetLutIndex(light_config.num, lighting.lut_rg.type, lighting.lut_rg.abs_input);
+            std::string value = "(" + std::to_string(lighting.lut_rg.scale) + " * " + GetLutValue(Regs::LightingSampler::ReflectGreen, index) + ")";
             out += "refl_value.g = " + value + ";\n";
         } else {
             out += "refl_value.g = refl_value.r;\n";
         }
 
         // If enabled, lookup ReflectBlue value, otherwise, ReflectRed value is used
-        if (config.lighting.lut_rb.enable && Pica::Regs::IsLightingSamplerSupported(config.lighting.config, Pica::Regs::LightingSampler::ReflectBlue)) {
-            std::string index = GetLutIndex(light_config.num, config.lighting.lut_rb.type, config.lighting.lut_rb.abs_input);
-            std::string value = "(" + std::to_string(config.lighting.lut_rb.scale) + " * " + GetLutValue(Regs::LightingSampler::ReflectBlue, index) + ")";
+        if (lighting.lut_rb.enable && Pica::Regs::IsLightingSamplerSupported(lighting.config, Pica::Regs::LightingSampler::ReflectBlue)) {
+            std::string index = GetLutIndex(light_config.num, lighting.lut_rb.type, lighting.lut_rb.abs_input);
+            std::string value = "(" + std::to_string(lighting.lut_rb.scale) + " * " + GetLutValue(Regs::LightingSampler::ReflectBlue, index) + ")";
             out += "refl_value.b = " + value + ";\n";
         } else {
             out += "refl_value.b = refl_value.r;\n";
@@ -472,27 +486,27 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
 
         // Specular 1 component
         std::string d1_lut_value = "1.0";
-        if (config.lighting.lut_d1.enable && Pica::Regs::IsLightingSamplerSupported(config.lighting.config, Pica::Regs::LightingSampler::Distribution1)) {
+        if (lighting.lut_d1.enable && Pica::Regs::IsLightingSamplerSupported(lighting.config, Pica::Regs::LightingSampler::Distribution1)) {
             // Lookup specular "distribution 1" LUT value
-            std::string index = GetLutIndex(light_config.num, config.lighting.lut_d1.type, config.lighting.lut_d1.abs_input);
-            d1_lut_value = "(" + std::to_string(config.lighting.lut_d1.scale) + " * " + GetLutValue(Regs::LightingSampler::Distribution1, index) + ")";
+            std::string index = GetLutIndex(light_config.num, lighting.lut_d1.type, lighting.lut_d1.abs_input);
+            d1_lut_value = "(" + std::to_string(lighting.lut_d1.scale) + " * " + GetLutValue(Regs::LightingSampler::Distribution1, index) + ")";
         }
         std::string specular_1 = "(" + d1_lut_value + " * refl_value * " + light_src + ".specular_1)";
 
         // Fresnel
-        if (config.lighting.lut_fr.enable && Pica::Regs::IsLightingSamplerSupported(config.lighting.config, Pica::Regs::LightingSampler::Fresnel)) {
+        if (lighting.lut_fr.enable && Pica::Regs::IsLightingSamplerSupported(lighting.config, Pica::Regs::LightingSampler::Fresnel)) {
             // Lookup fresnel LUT value
-            std::string index = GetLutIndex(light_config.num, config.lighting.lut_fr.type, config.lighting.lut_fr.abs_input);
-            std::string value = "(" + std::to_string(config.lighting.lut_fr.scale) + " * " + GetLutValue(Regs::LightingSampler::Fresnel, index) + ")";
+            std::string index = GetLutIndex(light_config.num, lighting.lut_fr.type, lighting.lut_fr.abs_input);
+            std::string value = "(" + std::to_string(lighting.lut_fr.scale) + " * " + GetLutValue(Regs::LightingSampler::Fresnel, index) + ")";
 
             // Enabled for difffuse lighting alpha component
-            if (config.lighting.fresnel_selector == Pica::Regs::LightingFresnelSelector::PrimaryAlpha ||
-                config.lighting.fresnel_selector == Pica::Regs::LightingFresnelSelector::Both)
+            if (lighting.fresnel_selector == Pica::Regs::LightingFresnelSelector::PrimaryAlpha ||
+                lighting.fresnel_selector == Pica::Regs::LightingFresnelSelector::Both)
                 out += "diffuse_sum.a  *= " + value + ";\n";
 
             // Enabled for the specular lighting alpha component
-            if (config.lighting.fresnel_selector == Pica::Regs::LightingFresnelSelector::SecondaryAlpha ||
-                config.lighting.fresnel_selector == Pica::Regs::LightingFresnelSelector::Both)
+            if (lighting.fresnel_selector == Pica::Regs::LightingFresnelSelector::SecondaryAlpha ||
+                lighting.fresnel_selector == Pica::Regs::LightingFresnelSelector::Both)
                 out += "specular_sum.a *= " + value + ";\n";
         }
 
@@ -510,6 +524,8 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
 }
 
 std::string GenerateFragmentShader(const PicaShaderConfig& config) {
+    const auto& state = config.state;
+
     std::string out = R"(
 #version 330 core
 #define NUM_TEV_STAGES 6
@@ -519,6 +535,7 @@ std::string GenerateFragmentShader(const PicaShaderConfig& config) {
 
 in vec4 primary_color;
 in vec2 texcoord[3];
+in float texcoord0_w;
 in vec4 normquat;
 in vec3 view;
 
@@ -530,12 +547,15 @@ struct LightSrc {
     vec3 diffuse;
     vec3 ambient;
     vec3 position;
+    float dist_atten_bias;
+    float dist_atten_scale;
 };
 
 layout (std140) uniform shader_data {
     vec4 const_color[NUM_TEV_STAGES];
     vec4 tev_combiner_buffer_color;
     int alphatest_ref;
+    float depth_scale;
     float depth_offset;
     vec3 lighting_global_ambient;
     LightSrc light_src[NUM_LIGHTS];
@@ -555,29 +575,37 @@ vec4 secondary_fragment_color = vec4(0.0);
 )";
 
     // Do not do any sort of processing if it's obvious we're not going to pass the alpha test
-    if (config.alpha_test_func == Regs::CompareFunc::Never) {
+    if (state.alpha_test_func == Regs::CompareFunc::Never) {
         out += "discard; }";
         return out;
     }
 
-    if (config.lighting.enable)
+    if (state.lighting.enable)
         WriteLighting(out, config);
 
     out += "vec4 combiner_buffer = vec4(0.0);\n";
     out += "vec4 next_combiner_buffer = tev_combiner_buffer_color;\n";
     out += "vec4 last_tex_env_out = vec4(0.0);\n";
 
-    for (size_t index = 0; index < config.tev_stages.size(); ++index)
+    for (size_t index = 0; index < state.tev_stages.size(); ++index)
         WriteTevStage(out, config, (unsigned)index);
 
-    if (config.alpha_test_func != Regs::CompareFunc::Always) {
+    if (state.alpha_test_func != Regs::CompareFunc::Always) {
         out += "if (";
-        AppendAlphaTestCondition(out, config.alpha_test_func);
+        AppendAlphaTestCondition(out, state.alpha_test_func);
         out += ") discard;\n";
     }
 
     out += "color = last_tex_env_out;\n";
-    out += "gl_FragDepth = gl_FragCoord.z + depth_offset;\n}";
+
+    out += "float z_over_w = 1.0 - gl_FragCoord.z * 2.0;\n";
+    out += "float depth = z_over_w * depth_scale + depth_offset;\n";
+    if (state.depthmap_enable == Pica::Regs::DepthBuffering::WBuffering) {
+        out += "depth /= gl_FragCoord.w;\n";
+    }
+    out += "gl_FragDepth = depth;\n";
+
+    out += "}";
 
     return out;
 }
@@ -585,17 +613,19 @@ vec4 secondary_fragment_color = vec4(0.0);
 std::string GenerateVertexShader() {
     std::string out = "#version 330 core\n";
 
-    out += "layout(location = " + std::to_string((int)ATTRIBUTE_POSITION)  + ") in vec4 vert_position;\n";
-    out += "layout(location = " + std::to_string((int)ATTRIBUTE_COLOR)     + ") in vec4 vert_color;\n";
-    out += "layout(location = " + std::to_string((int)ATTRIBUTE_TEXCOORD0) + ") in vec2 vert_texcoord0;\n";
-    out += "layout(location = " + std::to_string((int)ATTRIBUTE_TEXCOORD1) + ") in vec2 vert_texcoord1;\n";
-    out += "layout(location = " + std::to_string((int)ATTRIBUTE_TEXCOORD2) + ") in vec2 vert_texcoord2;\n";
-    out += "layout(location = " + std::to_string((int)ATTRIBUTE_NORMQUAT)  + ") in vec4 vert_normquat;\n";
-    out += "layout(location = " + std::to_string((int)ATTRIBUTE_VIEW)      + ") in vec3 vert_view;\n";
+    out += "layout(location = " + std::to_string((int)ATTRIBUTE_POSITION)    + ") in vec4 vert_position;\n";
+    out += "layout(location = " + std::to_string((int)ATTRIBUTE_COLOR)       + ") in vec4 vert_color;\n";
+    out += "layout(location = " + std::to_string((int)ATTRIBUTE_TEXCOORD0)   + ") in vec2 vert_texcoord0;\n";
+    out += "layout(location = " + std::to_string((int)ATTRIBUTE_TEXCOORD1)   + ") in vec2 vert_texcoord1;\n";
+    out += "layout(location = " + std::to_string((int)ATTRIBUTE_TEXCOORD2)   + ") in vec2 vert_texcoord2;\n";
+    out += "layout(location = " + std::to_string((int)ATTRIBUTE_TEXCOORD0_W) + ") in float vert_texcoord0_w;\n";
+    out += "layout(location = " + std::to_string((int)ATTRIBUTE_NORMQUAT)    + ") in vec4 vert_normquat;\n";
+    out += "layout(location = " + std::to_string((int)ATTRIBUTE_VIEW)        + ") in vec3 vert_view;\n";
 
     out += R"(
 out vec4 primary_color;
 out vec2 texcoord[3];
+out float texcoord0_w;
 out vec4 normquat;
 out vec3 view;
 
@@ -604,6 +634,7 @@ void main() {
     texcoord[0] = vert_texcoord0;
     texcoord[1] = vert_texcoord1;
     texcoord[2] = vert_texcoord2;
+    texcoord0_w = vert_texcoord0_w;
     normquat = vert_normquat;
     view = vert_view;
     gl_Position = vec4(vert_position.x, vert_position.y, -vert_position.z, vert_position.w);
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.h b/src/video_core/renderer_opengl/gl_shader_gen.h
index 3eb07d57a..bef3249cf 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.h
+++ b/src/video_core/renderer_opengl/gl_shader_gen.h
@@ -6,7 +6,7 @@
 
 #include <string>
 
-struct PicaShaderConfig;
+union PicaShaderConfig;
 
 namespace GLShader {
 
diff --git a/src/video_core/renderer_opengl/gl_shader_util.h b/src/video_core/renderer_opengl/gl_shader_util.h
index 097242f6f..f59912f79 100644
--- a/src/video_core/renderer_opengl/gl_shader_util.h
+++ b/src/video_core/renderer_opengl/gl_shader_util.h
@@ -14,6 +14,7 @@ enum Attributes {
     ATTRIBUTE_TEXCOORD0,
     ATTRIBUTE_TEXCOORD1,
     ATTRIBUTE_TEXCOORD2,
+    ATTRIBUTE_TEXCOORD0_W,
     ATTRIBUTE_NORMQUAT,
     ATTRIBUTE_VIEW,
 };
diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp
index 02cd9f417..fa141fc9a 100644
--- a/src/video_core/renderer_opengl/gl_state.cpp
+++ b/src/video_core/renderer_opengl/gl_state.cpp
@@ -36,6 +36,8 @@ OpenGLState::OpenGLState() {
     stencil.action_stencil_fail = GL_KEEP;
 
     blend.enabled = false;
+    blend.rgb_equation = GL_FUNC_ADD;
+    blend.a_equation = GL_FUNC_ADD;
     blend.src_rgb_func = GL_ONE;
     blend.dst_rgb_func = GL_ZERO;
     blend.src_a_func = GL_ONE;
@@ -165,6 +167,11 @@ void OpenGLState::Apply() const {
                             blend.src_a_func, blend.dst_a_func);
     }
 
+    if (blend.rgb_equation != cur_state.blend.rgb_equation ||
+            blend.a_equation != cur_state.blend.a_equation) {
+        glBlendEquationSeparate(blend.rgb_equation, blend.a_equation);
+    }
+
     if (logic_op != cur_state.logic_op) {
         glLogicOp(logic_op);
     }
diff --git a/src/video_core/renderer_opengl/gl_state.h b/src/video_core/renderer_opengl/gl_state.h
index 24f20e47c..228727054 100644
--- a/src/video_core/renderer_opengl/gl_state.h
+++ b/src/video_core/renderer_opengl/gl_state.h
@@ -40,6 +40,8 @@ public:
 
     struct {
         bool enabled; // GL_BLEND
+        GLenum rgb_equation; // GL_BLEND_EQUATION_RGB
+        GLenum a_equation; // GL_BLEND_EQUATION_ALPHA
         GLenum src_rgb_func; // GL_BLEND_SRC_RGB
         GLenum dst_rgb_func; // GL_BLEND_DST_RGB
         GLenum src_a_func; // GL_BLEND_SRC_ALPHA
diff --git a/src/video_core/renderer_opengl/pica_to_gl.h b/src/video_core/renderer_opengl/pica_to_gl.h
index 976d1f364..6dc2758c5 100644
--- a/src/video_core/renderer_opengl/pica_to_gl.h
+++ b/src/video_core/renderer_opengl/pica_to_gl.h
@@ -78,6 +78,26 @@ inline GLenum WrapMode(Pica::Regs::TextureConfig::WrapMode mode) {
     return gl_mode;
 }
 
+/// Translates a PICA blend equation into the matching GL blend equation enum.
+inline GLenum BlendEquation(Pica::Regs::BlendEquation equation) {
+    static const GLenum blend_equation_table[] = {
+        GL_FUNC_ADD,              // BlendEquation::Add
+        GL_FUNC_SUBTRACT,         // BlendEquation::Subtract
+        GL_FUNC_REVERSE_SUBTRACT, // BlendEquation::ReverseSubtract
+        GL_MIN,                   // BlendEquation::Min
+        GL_MAX,                   // BlendEquation::Max
+    };
+    // Range check table for input; unknown values are logged and fall back to GL_FUNC_ADD
+    const size_t index = static_cast<size_t>(equation);
+    if (index >= ARRAY_SIZE(blend_equation_table)) {
+        // %d expects an int, so convert the scoped enum explicitly instead of passing it raw
+        LOG_CRITICAL(Render_OpenGL, "Unknown blend equation %d", static_cast<int>(equation));
+        UNREACHABLE();
+        return GL_FUNC_ADD;
+    }
+    return blend_equation_table[index];
+}
+
 inline GLenum BlendFunc(Pica::Regs::BlendFactor factor) {
     static const GLenum blend_func_table[] = {
         GL_ZERO,                     // BlendFactor::Zero
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index 0e9a0be8b..8410e0a64 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -192,7 +192,7 @@ void RendererOpenGL::LoadFBToScreenInfo(const GPU::Regs::FramebufferConfig& fram
     // only allows rows to have a memory alignement of 4.
     ASSERT(pixel_stride % 4 == 0);
 
-    if (!Rasterizer()->AccelerateDisplay(framebuffer, framebuffer_addr, pixel_stride, screen_info)) {
+    if (!Rasterizer()->AccelerateDisplay(framebuffer, framebuffer_addr, static_cast<u32>(pixel_stride), screen_info)) {
         // Reset the screen info's display texture to its own permanent texture
         screen_info.display_texture = screen_info.texture.resource.handle;
         screen_info.display_texcoords = MathUtil::Rectangle<float>(0.f, 0.f, 1.f, 1.f);
@@ -450,7 +450,7 @@ static const char* GetType(GLenum type) {
 #undef RET
 }
 
-static void DebugHandler(GLenum source, GLenum type, GLuint id, GLenum severity, GLsizei length,
+static void APIENTRY DebugHandler(GLenum source, GLenum type, GLuint id, GLenum severity, GLsizei length,
                          const GLchar* message, const void* user_param) {
     Log::Level level;
     switch (severity) {
@@ -473,12 +473,6 @@ static void DebugHandler(GLenum source, GLenum type, GLuint id, GLenum severity,
 bool RendererOpenGL::Init() {
     render_window->MakeCurrent();
 
-    // TODO: Make frontends initialize this, so they can use gladLoadGLLoader with their own loaders
-    if (!gladLoadGL()) {
-        LOG_CRITICAL(Render_OpenGL, "Failed to initialize GL functions! Exiting...");
-        exit(-1);
-    }
-
     if (GLAD_GL_KHR_debug) {
         glEnable(GL_DEBUG_OUTPUT);
         glDebugMessageCallback(DebugHandler, nullptr);
diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp
index 65dcc9156..f565e2c91 100644
--- a/src/video_core/shader/shader.cpp
+++ b/src/video_core/shader/shader.cpp
@@ -30,65 +30,7 @@ namespace Pica {
 
 namespace Shader {
 
-#ifdef ARCHITECTURE_x86_64
-static std::unordered_map<u64, std::unique_ptr<JitShader>> shader_map;
-static const JitShader* jit_shader;
-#endif // ARCHITECTURE_x86_64
-
-void Setup() {
-#ifdef ARCHITECTURE_x86_64
-    if (VideoCore::g_shader_jit_enabled) {
-        u64 cache_key = (Common::ComputeHash64(&g_state.vs.program_code, sizeof(g_state.vs.program_code)) ^
-            Common::ComputeHash64(&g_state.vs.swizzle_data, sizeof(g_state.vs.swizzle_data)));
-
-        auto iter = shader_map.find(cache_key);
-        if (iter != shader_map.end()) {
-            jit_shader = iter->second.get();
-        } else {
-            auto shader = std::make_unique<JitShader>();
-            shader->Compile();
-            jit_shader = shader.get();
-            shader_map[cache_key] = std::move(shader);
-        }
-    }
-#endif // ARCHITECTURE_x86_64
-}
-
-void Shutdown() {
-#ifdef ARCHITECTURE_x86_64
-    shader_map.clear();
-#endif // ARCHITECTURE_x86_64
-}
-
-MICROPROFILE_DEFINE(GPU_VertexShader, "GPU", "Vertex Shader", MP_RGB(50, 50, 240));
-
-OutputVertex Run(UnitState<false>& state, const InputVertex& input, int num_attributes) {
-    auto& config = g_state.regs.vs;
-
-    MICROPROFILE_SCOPE(GPU_VertexShader);
-
-    state.program_counter = config.main_offset;
-    state.debug.max_offset = 0;
-    state.debug.max_opdesc_id = 0;
-
-    // Setup input register table
-    const auto& attribute_register_map = config.input_register_map;
-
-    for (unsigned i = 0; i < num_attributes; i++)
-         state.registers.input[attribute_register_map.GetRegisterForAttribute(i)] = input.attr[i];
-
-    state.conditional_code[0] = false;
-    state.conditional_code[1] = false;
-
-#ifdef ARCHITECTURE_x86_64
-    if (VideoCore::g_shader_jit_enabled)
-        jit_shader->Run(&state.registers, g_state.regs.vs.main_offset);
-    else
-        RunInterpreter(state);
-#else
-    RunInterpreter(state);
-#endif // ARCHITECTURE_x86_64
-
+OutputVertex OutputRegisters::ToVertex(const Regs::ShaderConfig& config) {
     // Setup output data
     OutputVertex ret;
     // TODO(neobrain): Under some circumstances, up to 16 attributes may be output. We need to
@@ -99,10 +41,10 @@ OutputVertex Run(UnitState<false>& state, const InputVertex& input, int num_attr
         if (index >= g_state.regs.vs_output_total)
             break;
 
-        if ((g_state.regs.vs.output_mask & (1 << i)) == 0)
+        if ((config.output_mask & (1 << i)) == 0)
             continue;
 
-        const auto& output_register_map = g_state.regs.vs_output_attributes[index]; // TODO: Don't hardcode VS here
+        const auto& output_register_map = g_state.regs.vs_output_attributes[index];
 
         u32 semantics[4] = {
             output_register_map.map_x, output_register_map.map_y,
@@ -112,7 +54,7 @@ OutputVertex Run(UnitState<false>& state, const InputVertex& input, int num_attr
         for (unsigned comp = 0; comp < 4; ++comp) {
             float24* out = ((float24*)&ret) + semantics[comp];
             if (semantics[comp] != Regs::VSOutputAttributes::INVALID) {
-                *out = state.registers.output[i][comp];
+                *out = value[i][comp];
             } else {
                 // Zero output so that attributes which aren't output won't have denormals in them,
                 // which would slow us down later.
@@ -140,10 +82,70 @@ OutputVertex Run(UnitState<false>& state, const InputVertex& input, int num_attr
     return ret;
 }
 
-DebugData<true> ProduceDebugInfo(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const ShaderSetup& setup) {
+#ifdef ARCHITECTURE_x86_64
+static std::unordered_map<u64, std::unique_ptr<JitShader>> shader_map;
+static const JitShader* jit_shader;
+#endif // ARCHITECTURE_x86_64
+
+void ClearCache() { // Empties the JIT shader cache; the body compiles to nothing on non-x86_64 builds
+#ifdef ARCHITECTURE_x86_64
+    shader_map.clear(); // NOTE(review): jit_shader is not reset here and may still point at a destroyed entry - confirm Setup() always runs before the next Run()
+#endif // ARCHITECTURE_x86_64
+}
+
+void ShaderSetup::Setup() { // Selects the JIT-compiled shader that Run() will execute, compiling it on a cache miss
+#ifdef ARCHITECTURE_x86_64
+    if (VideoCore::g_shader_jit_enabled) {
+        u64 cache_key = (Common::ComputeHash64(&g_state.vs.program_code, sizeof(g_state.vs.program_code)) ^
+            Common::ComputeHash64(&g_state.vs.swizzle_data, sizeof(g_state.vs.swizzle_data))); // key: hash of program code XOR hash of swizzle data, both read from g_state.vs rather than from this object
+
+        auto iter = shader_map.find(cache_key);
+        if (iter != shader_map.end()) {
+            jit_shader = iter->second.get(); // cache hit: reuse the already-compiled shader
+        } else {
+            auto shader = std::make_unique<JitShader>();
+            shader->Compile();
+            jit_shader = shader.get();
+            shader_map[cache_key] = std::move(shader); // the map owns the shader; jit_shader stays a non-owning pointer
+        }
+    }
+#endif // ARCHITECTURE_x86_64
+}
+
+MICROPROFILE_DEFINE(GPU_Shader, "GPU", "Shader", MP_RGB(50, 50, 240));
+
+void ShaderSetup::Run(UnitState<false>& state, const InputVertex& input, int num_attributes) { // Loads the input attributes into state and runs the shader from config.main_offset (JIT or interpreter)
+    auto& config = g_state.regs.vs;
+    auto& setup = g_state.vs; // NOTE(review): uses the global VS setup rather than *this - confirm they are always the same object
+
+    MICROPROFILE_SCOPE(GPU_Shader);
+
+    state.debug.max_offset = 0;
+    state.debug.max_opdesc_id = 0;
+
+    // Setup input register table: each input attribute is copied into the register the map assigns to it
+    const auto& attribute_register_map = config.input_register_map;
+
+    for (unsigned i = 0; i < num_attributes; i++) // num_attributes (int) is compared as unsigned here; presumably never negative - verify against callers
+         state.registers.input[attribute_register_map.GetRegisterForAttribute(i)] = input.attr[i];
+
+    state.conditional_code[0] = false;
+    state.conditional_code[1] = false;
+
+#ifdef ARCHITECTURE_x86_64
+    if (VideoCore::g_shader_jit_enabled)
+        jit_shader->Run(setup, state, config.main_offset); // jit_shader is the pointer assigned by Setup()
+    else
+        RunInterpreter(setup, state, config.main_offset);
+#else
+    RunInterpreter(setup, state, config.main_offset);
+#endif // ARCHITECTURE_x86_64
+
+}
+
+DebugData<true> ShaderSetup::ProduceDebugInfo(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const ShaderSetup& setup) {
     UnitState<true> state;
 
-    state.program_counter = config.main_offset;
     state.debug.max_offset = 0;
     state.debug.max_opdesc_id = 0;
 
@@ -158,7 +160,7 @@ DebugData<true> ProduceDebugInfo(const InputVertex& input, int num_attributes, c
     state.conditional_code[0] = false;
     state.conditional_code[1] = false;
 
-    RunInterpreter(state);
+    RunInterpreter(setup, state, config.main_offset);
     return state.debug;
 }
 
diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h
index 56b83bfeb..fee16df62 100644
--- a/src/video_core/shader/shader.h
+++ b/src/video_core/shader/shader.h
@@ -43,7 +43,8 @@ struct OutputVertex {
     Math::Vec4<float24> color;
     Math::Vec2<float24> tc0;
     Math::Vec2<float24> tc1;
-    INSERT_PADDING_WORDS(2);
+    float24 tc0_w;
+    INSERT_PADDING_WORDS(1);
     Math::Vec3<float24> view;
     INSERT_PADDING_WORDS(1);
     Math::Vec2<float24> tc2;
@@ -83,22 +84,14 @@ struct OutputVertex {
 static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD");
 static_assert(sizeof(OutputVertex) == 32 * sizeof(float), "OutputVertex has invalid size");
 
-/// Vertex shader memory
-struct ShaderSetup {
-    struct {
-        // The float uniforms are accessed by the shader JIT using SSE instructions, and are
-        // therefore required to be 16-byte aligned.
-        alignas(16) Math::Vec4<float24> f[96];
-
-        std::array<bool, 16> b;
-        std::array<Math::Vec4<u8>, 4> i;
-    } uniforms;
+struct OutputRegisters { // Output registers written by the interpreter and the shader JIT
+    OutputRegisters() = default;
 
-    Math::Vec4<float24> default_attributes[16];
+    alignas(16) Math::Vec4<float24> value[16]; // 16-byte aligned: the shader JIT stores to these with MOVAPS
 
-    std::array<u32, 1024> program_code;
-    std::array<u32, 1024> swizzle_data;
+    OutputVertex ToVertex(const Regs::ShaderConfig& config); // NOTE(review): defined outside this view; presumably builds the vertex from `value` per config
 };
+static_assert(std::is_pod<OutputRegisters>::value, "Structure is not POD");
 
 // Helper structure used to keep track of data useful for inspection of shader emulation
 template<bool full_debugging>
@@ -283,43 +276,27 @@ struct UnitState {
         // The registers are accessed by the shader JIT using SSE instructions, and are therefore
         // required to be 16-byte aligned.
         alignas(16) Math::Vec4<float24> input[16];
-        alignas(16) Math::Vec4<float24> output[16];
         alignas(16) Math::Vec4<float24> temporary[16];
     } registers;
     static_assert(std::is_pod<Registers>::value, "Structure is not POD");
 
-    u32 program_counter;
+    OutputRegisters output_registers;
+
     bool conditional_code[2];
 
     // Two Address registers and one loop counter
     // TODO: How many bits do these actually have?
     s32 address_registers[3];
 
-    enum {
-        INVALID_ADDRESS = 0xFFFFFFFF
-    };
-
-    struct CallStackElement {
-        u32 final_address;  // Address upon which we jump to return_address
-        u32 return_address; // Where to jump when leaving scope
-        u8 repeat_counter;  // How often to repeat until this call stack element is removed
-        u8 loop_increment;  // Which value to add to the loop counter after an iteration
-                            // TODO: Should this be a signed value? Does it even matter?
-        u32 loop_address;   // The address where we'll return to after each loop iteration
-    };
-
-    // TODO: Is there a maximal size for this?
-    boost::container::static_vector<CallStackElement, 16> call_stack;
-
     DebugData<Debug> debug;
 
     static size_t InputOffset(const SourceRegister& reg) {
         switch (reg.GetRegisterType()) {
         case RegisterType::Input:
-            return offsetof(UnitState::Registers, input) + reg.GetIndex()*sizeof(Math::Vec4<float24>);
+            return offsetof(UnitState, registers.input) + reg.GetIndex()*sizeof(Math::Vec4<float24>);
 
         case RegisterType::Temporary:
-            return offsetof(UnitState::Registers, temporary) + reg.GetIndex()*sizeof(Math::Vec4<float24>);
+            return offsetof(UnitState, registers.temporary) + reg.GetIndex()*sizeof(Math::Vec4<float24>);
 
         default:
             UNREACHABLE();
@@ -330,10 +307,10 @@ struct UnitState {
     static size_t OutputOffset(const DestRegister& reg) {
         switch (reg.GetRegisterType()) {
         case RegisterType::Output:
-            return offsetof(UnitState::Registers, output) + reg.GetIndex()*sizeof(Math::Vec4<float24>);
+            return offsetof(UnitState, output_registers.value) + reg.GetIndex()*sizeof(Math::Vec4<float24>);
 
         case RegisterType::Temporary:
-            return offsetof(UnitState::Registers, temporary) + reg.GetIndex()*sizeof(Math::Vec4<float24>);
+            return offsetof(UnitState, registers.temporary) + reg.GetIndex()*sizeof(Math::Vec4<float24>);
 
         default:
             UNREACHABLE();
@@ -342,33 +319,65 @@ struct UnitState {
     }
 };
 
-/**
- * Performs any shader unit setup that only needs to happen once per shader (as opposed to once per
- * vertex, which would happen within the `Run` function).
- */
-void Setup();
+/// Clears the shader cache
+void ClearCache();
 
-/// Performs any cleanup when the emulator is shutdown
-void Shutdown();
+struct ShaderSetup { // Shader program data (uniforms, program code, swizzle data) plus the functions that set up and run it
 
-/**
- * Runs the currently setup shader
- * @param state Shader unit state, must be setup per shader and per shader unit
- * @param input Input vertex into the shader
- * @param num_attributes The number of vertex shader attributes
- * @return The output vertex, after having been processed by the vertex shader
- */
-OutputVertex Run(UnitState<false>& state, const InputVertex& input, int num_attributes);
+    struct {
+        // The float uniforms are accessed by the shader JIT using SSE instructions, and are
+        // therefore required to be 16-byte aligned.
+        alignas(16) Math::Vec4<float24> f[96];
 
-/**
- * Produce debug information based on the given shader and input vertex
- * @param input Input vertex into the shader
- * @param num_attributes The number of vertex shader attributes
- * @param config Configuration object for the shader pipeline
- * @param setup Setup object for the shader pipeline
- * @return Debug information for this shader with regards to the given vertex
- */
-DebugData<true> ProduceDebugInfo(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const ShaderSetup& setup);
+        std::array<bool, 16> b;
+        std::array<Math::Vec4<u8>, 4> i;
+    } uniforms;
+    /// Returns the byte offset, relative to the start of a ShaderSetup, of uniform `index` of the given register type
+    static size_t UniformOffset(RegisterType type, unsigned index) {
+        switch (type) {
+        case RegisterType::FloatUniform:
+            return offsetof(ShaderSetup, uniforms.f) + index*sizeof(Math::Vec4<float24>);
+
+        case RegisterType::BoolUniform:
+            return offsetof(ShaderSetup, uniforms.b) + index*sizeof(bool);
+
+        case RegisterType::IntUniform:
+            return offsetof(ShaderSetup, uniforms.i) + index*sizeof(Math::Vec4<u8>);
+
+        default:
+            UNREACHABLE();
+            return 0;
+        }
+    }
+    // program_code holds the instruction words; swizzle_data is indexed by an instruction's operand_desc_id
+    std::array<u32, 1024> program_code;
+    std::array<u32, 1024> swizzle_data;
+
+    /**
+     * Performs any shader unit setup that only needs to happen once per shader (as opposed to once per
+     * vertex, which would happen within the `Run` function).
+     */
+    void Setup();
+
+    /**
+     * Runs the currently setup shader
+     * @param state Shader unit state, must be setup per shader and per shader unit
+     * @param input Input vertex into the shader
+     * @param num_attributes The number of vertex shader attributes
+     */
+    void Run(UnitState<false>& state, const InputVertex& input, int num_attributes);
+
+    /**
+     * Produce debug information based on the given shader and input vertex
+     * @param input Input vertex into the shader
+     * @param num_attributes The number of vertex shader attributes
+     * @param config Configuration object for the shader pipeline
+     * @param setup Setup object for the shader pipeline
+     * @return Debug information for this shader with regards to the given vertex
+     */
+    DebugData<true> ProduceDebugInfo(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const ShaderSetup& setup);
+
+};
 
 } // namespace Shader
 
diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp
index 7710f7fbc..b1eadc071 100644
--- a/src/video_core/shader/shader_interpreter.cpp
+++ b/src/video_core/shader/shader_interpreter.cpp
@@ -29,8 +29,24 @@ namespace Pica {
 
 namespace Shader {
 
+constexpr u32 INVALID_ADDRESS = 0xFFFFFFFF;
+
+struct CallStackElement { // One entry of the interpreter's call/loop stack
+    u32 final_address;  // Address upon which we jump to return_address (or loop_address while repeats remain)
+    u32 return_address; // Where to jump when leaving scope
+    u8 repeat_counter;  // How often to repeat until this call stack element is removed
+    u8 loop_increment;  // Which value to add to the loop counter (address_registers[2]) after an iteration
+                        // TODO: Should this be a signed value? Does it even matter?
+    u32 loop_address;   // The address where we'll return to after each loop iteration
+};
+
 template<bool Debug>
-void RunInterpreter(UnitState<Debug>& state) {
+void RunInterpreter(const ShaderSetup& setup, UnitState<Debug>& state, unsigned offset) {
+    // TODO: Is there a maximal size for this?
+    boost::container::static_vector<CallStackElement, 16> call_stack;
+
+    u32 program_counter = offset;
-    const auto& uniforms = g_state.vs.uniforms;
-    const auto& swizzle_data = g_state.vs.swizzle_data;
-    const auto& program_code = g_state.vs.program_code;
+    // Read the shader data from the setup supplied by the caller instead of the global g_state.vs
+    const auto& uniforms = setup.uniforms;
+    const auto& swizzle_data = setup.swizzle_data;
+    const auto& program_code = setup.program_code;
@@ -41,16 +57,16 @@ void RunInterpreter(UnitState<Debug>& state) {
     unsigned iteration = 0;
     bool exit_loop = false;
     while (!exit_loop) {
-        if (!state.call_stack.empty()) {
-            auto& top = state.call_stack.back();
-            if (state.program_counter == top.final_address) {
+        if (!call_stack.empty()) {
+            auto& top = call_stack.back();
+            if (program_counter == top.final_address) {
                 state.address_registers[2] += top.loop_increment;
 
                 if (top.repeat_counter-- == 0) {
-                    state.program_counter = top.return_address;
-                    state.call_stack.pop_back();
+                    program_counter = top.return_address;
+                    call_stack.pop_back();
                 } else {
-                    state.program_counter = top.loop_address;
+                    program_counter = top.loop_address;
                 }
 
                 // TODO: Is "trying again" accurate to hardware?
@@ -58,20 +74,20 @@ void RunInterpreter(UnitState<Debug>& state) {
             }
         }
 
-        const Instruction instr = { program_code[state.program_counter] };
+        const Instruction instr = { program_code[program_counter] };
         const SwizzlePattern swizzle = { swizzle_data[instr.common.operand_desc_id] };
 
-        static auto call = [](UnitState<Debug>& state, u32 offset, u32 num_instructions,
+        auto call = [&program_counter, &call_stack](UnitState<Debug>& state, u32 offset, u32 num_instructions, // not static: captures this invocation's locals
                               u32 return_offset, u8 repeat_count, u8 loop_increment) {
-            state.program_counter = offset - 1; // -1 to make sure when incrementing the PC we end up at the correct offset
-            ASSERT(state.call_stack.size() < state.call_stack.capacity());
-            state.call_stack.push_back({ offset + num_instructions, return_offset, repeat_count, loop_increment, offset });
+            program_counter = offset - 1; // -1 to make sure when incrementing the PC we end up at the correct offset
+            ASSERT(call_stack.size() < call_stack.capacity());
+            call_stack.push_back({ offset + num_instructions, return_offset, repeat_count, loop_increment, offset });
         };
-        Record<DebugDataRecord::CUR_INSTR>(state.debug, iteration, state.program_counter);
+        Record<DebugDataRecord::CUR_INSTR>(state.debug, iteration, program_counter);
         if (iteration > 0)
-            Record<DebugDataRecord::NEXT_INSTR>(state.debug, iteration - 1, state.program_counter);
+            Record<DebugDataRecord::NEXT_INSTR>(state.debug, iteration - 1, program_counter);
 
-        state.debug.max_offset = std::max<u32>(state.debug.max_offset, 1 + state.program_counter);
+        state.debug.max_offset = std::max<u32>(state.debug.max_offset, 1 + program_counter);
 
         auto LookupSourceRegister = [&](const SourceRegister& source_reg) -> const float24* {
             switch (source_reg.GetRegisterType()) {
@@ -128,7 +144,7 @@ void RunInterpreter(UnitState<Debug>& state) {
                 src2[3] = src2[3] * float24::FromFloat32(-1);
             }
 
-            float24* dest = (instr.common.dest.Value() < 0x10) ? &state.registers.output[instr.common.dest.Value().GetIndex()][0]
+            float24* dest = (instr.common.dest.Value() < 0x10) ? &state.output_registers.value[instr.common.dest.Value().GetIndex()][0]
                         : (instr.common.dest.Value() < 0x20) ? &state.registers.temporary[instr.common.dest.Value().GetIndex()][0]
                         : dummy_vec4_float24;
 
@@ -467,7 +483,7 @@ void RunInterpreter(UnitState<Debug>& state) {
                     src3[3] = src3[3] * float24::FromFloat32(-1);
                 }
 
-                float24* dest = (instr.mad.dest.Value() < 0x10) ? &state.registers.output[instr.mad.dest.Value().GetIndex()][0]
+                float24* dest = (instr.mad.dest.Value() < 0x10) ? &state.output_registers.value[instr.mad.dest.Value().GetIndex()][0]
                             : (instr.mad.dest.Value() < 0x20) ? &state.registers.temporary[instr.mad.dest.Value().GetIndex()][0]
                             : dummy_vec4_float24;
 
@@ -519,7 +535,7 @@ void RunInterpreter(UnitState<Debug>& state) {
             case OpCode::Id::JMPC:
                 Record<DebugDataRecord::COND_CMP_IN>(state.debug, iteration, state.conditional_code);
                 if (evaluate_condition(state, instr.flow_control.refx, instr.flow_control.refy, instr.flow_control)) {
-                    state.program_counter = instr.flow_control.dest_offset - 1;
+                    program_counter = instr.flow_control.dest_offset - 1;
                 }
                 break;
 
@@ -527,7 +543,7 @@ void RunInterpreter(UnitState<Debug>& state) {
                 Record<DebugDataRecord::COND_BOOL_IN>(state.debug, iteration, uniforms.b[instr.flow_control.bool_uniform_id]);
 
                 if (uniforms.b[instr.flow_control.bool_uniform_id] == !(instr.flow_control.num_instructions & 1)) {
-                    state.program_counter = instr.flow_control.dest_offset - 1;
+                    program_counter = instr.flow_control.dest_offset - 1;
                 }
                 break;
 
@@ -535,7 +551,7 @@ void RunInterpreter(UnitState<Debug>& state) {
                 call(state,
                      instr.flow_control.dest_offset,
                      instr.flow_control.num_instructions,
-                     state.program_counter + 1, 0, 0);
+                     program_counter + 1, 0, 0);
                 break;
 
             case OpCode::Id::CALLU:
@@ -544,7 +560,7 @@ void RunInterpreter(UnitState<Debug>& state) {
                     call(state,
                         instr.flow_control.dest_offset,
                         instr.flow_control.num_instructions,
-                        state.program_counter + 1, 0, 0);
+                        program_counter + 1, 0, 0);
                 }
                 break;
 
@@ -554,7 +570,7 @@ void RunInterpreter(UnitState<Debug>& state) {
                     call(state,
                         instr.flow_control.dest_offset,
                         instr.flow_control.num_instructions,
-                        state.program_counter + 1, 0, 0);
+                        program_counter + 1, 0, 0);
                 }
                 break;
 
@@ -565,8 +581,8 @@ void RunInterpreter(UnitState<Debug>& state) {
                 Record<DebugDataRecord::COND_BOOL_IN>(state.debug, iteration, uniforms.b[instr.flow_control.bool_uniform_id]);
                 if (uniforms.b[instr.flow_control.bool_uniform_id]) {
                     call(state,
-                         state.program_counter + 1,
-                         instr.flow_control.dest_offset - state.program_counter - 1,
+                         program_counter + 1,
+                         instr.flow_control.dest_offset - program_counter - 1,
                          instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0);
                 } else {
                     call(state,
@@ -584,8 +600,8 @@ void RunInterpreter(UnitState<Debug>& state) {
                 Record<DebugDataRecord::COND_CMP_IN>(state.debug, iteration, state.conditional_code);
                 if (evaluate_condition(state, instr.flow_control.refx, instr.flow_control.refy, instr.flow_control)) {
                     call(state,
-                         state.program_counter + 1,
-                         instr.flow_control.dest_offset - state.program_counter - 1,
+                         program_counter + 1,
+                         instr.flow_control.dest_offset - program_counter - 1,
                          instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0);
                 } else {
                     call(state,
@@ -607,8 +623,8 @@ void RunInterpreter(UnitState<Debug>& state) {
 
                 Record<DebugDataRecord::LOOP_INT_IN>(state.debug, iteration, loop_param);
                 call(state,
-                     state.program_counter + 1,
-                     instr.flow_control.dest_offset - state.program_counter + 1,
+                     program_counter + 1,
+                     instr.flow_control.dest_offset - program_counter + 1,
                      instr.flow_control.dest_offset + 1,
                      loop_param.x,
                      loop_param.z);
@@ -625,14 +641,14 @@ void RunInterpreter(UnitState<Debug>& state) {
         }
         }
 
-        ++state.program_counter;
+        ++program_counter;
         ++iteration;
     }
 }
 
 // Explicit instantiation
-template void RunInterpreter(UnitState<false>& state);
-template void RunInterpreter(UnitState<true>& state);
+template void RunInterpreter(const ShaderSetup& setup, UnitState<false>& state, unsigned offset);
+template void RunInterpreter(const ShaderSetup& setup, UnitState<true>& state, unsigned offset);
 
 } // namespace
 
diff --git a/src/video_core/shader/shader_interpreter.h b/src/video_core/shader/shader_interpreter.h
index 6048cdf3a..bb3ce1c6e 100644
--- a/src/video_core/shader/shader_interpreter.h
+++ b/src/video_core/shader/shader_interpreter.h
@@ -11,7 +11,7 @@ namespace Shader {
 template <bool Debug> struct UnitState;
 
 template<bool Debug>
-void RunInterpreter(UnitState<Debug>& state);
+void RunInterpreter(const ShaderSetup& setup, UnitState<Debug>& state, unsigned offset);
 
 } // namespace
 
diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp
index 99f6c51eb..43e7e6b4c 100644
--- a/src/video_core/shader/shader_jit_x64.cpp
+++ b/src/video_core/shader/shader_jit_x64.cpp
@@ -102,7 +102,7 @@ const JitFunction instr_table[64] = {
 // purposes, as documented below:
 
 /// Pointer to the uniform memory
-static const X64Reg UNIFORMS = R9;
+static const X64Reg SETUP = R9;
 /// The two 32-bit VS address offset registers set by the MOVA instruction
 static const X64Reg ADDROFFS_REG_0 = R10;
 static const X64Reg ADDROFFS_REG_1 = R11;
@@ -117,7 +117,7 @@ static const X64Reg COND0 = R13;
 /// Result of the previous CMP instruction for the Y-component comparison
 static const X64Reg COND1 = R14;
 /// Pointer to the UnitState instance for the current VS unit
-static const X64Reg REGISTERS = R15;
+static const X64Reg STATE = R15;
 /// SIMD scratch register
 static const X64Reg SCRATCH = XMM0;
 /// Loaded with the first swizzled source register, otherwise can be used as a scratch register
@@ -136,7 +136,7 @@ static const X64Reg NEGBIT = XMM15;
 // State registers that must not be modified by external functions calls
 // Scratch registers, e.g., SRC1 and SCRATCH, have to be saved on the side if needed
 static const BitSet32 persistent_regs = {
-    UNIFORMS, REGISTERS, // Pointers to register blocks
+    SETUP, STATE, // Pointers to register blocks
     ADDROFFS_REG_0, ADDROFFS_REG_1, LOOPCOUNT_REG, COND0, COND1, // Cached registers
     ONE+16, NEGBIT+16, // Constants
 };
@@ -177,10 +177,10 @@ void JitShader::Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRe
     size_t src_offset;
 
     if (src_reg.GetRegisterType() == RegisterType::FloatUniform) {
-        src_ptr = UNIFORMS;
-        src_offset = src_reg.GetIndex() * sizeof(float24) * 4;
+        src_ptr = SETUP;
+        src_offset = ShaderSetup::UniformOffset(RegisterType::FloatUniform, src_reg.GetIndex());
     } else {
-        src_ptr = REGISTERS;
+        src_ptr = STATE;
         src_offset = UnitState<false>::InputOffset(src_reg);
     }
 
@@ -264,11 +264,11 @@ void JitShader::Compile_DestEnable(Instruction instr,X64Reg src) {
     // If all components are enabled, write the result to the destination register
     if (swiz.dest_mask == NO_DEST_REG_MASK) {
         // Store dest back to memory
-        MOVAPS(MDisp(REGISTERS, dest_offset_disp), src);
+        MOVAPS(MDisp(STATE, dest_offset_disp), src);
 
     } else {
         // Not all components are enabled, so mask the result when storing to the destination register...
-        MOVAPS(SCRATCH, MDisp(REGISTERS, dest_offset_disp));
+        MOVAPS(SCRATCH, MDisp(STATE, dest_offset_disp));
 
         if (Common::GetCPUCaps().sse4_1) {
             u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) | ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1);
@@ -287,7 +287,7 @@ void JitShader::Compile_DestEnable(Instruction instr,X64Reg src) {
         }
 
         // Store dest back to memory
-        MOVAPS(MDisp(REGISTERS, dest_offset_disp), SCRATCH);
+        MOVAPS(MDisp(STATE, dest_offset_disp), SCRATCH);
     }
 }
 
@@ -336,8 +336,8 @@ void JitShader::Compile_EvaluateCondition(Instruction instr) {
 }
 
 void JitShader::Compile_UniformCondition(Instruction instr) {
-    int offset = offsetof(decltype(g_state.vs.uniforms), b) + (instr.flow_control.bool_uniform_id * sizeof(bool));
-    CMP(sizeof(bool) * 8, MDisp(UNIFORMS, offset), Imm8(0));
+    int offset = ShaderSetup::UniformOffset(RegisterType::BoolUniform, instr.flow_control.bool_uniform_id);
+    CMP(sizeof(bool) * 8, MDisp(SETUP, offset), Imm8(0)); // Compare the selected bool uniform (addressed relative to SETUP) against 0
 }
 
 BitSet32 JitShader::PersistentCallerSavedRegs() {
@@ -714,8 +714,8 @@ void JitShader::Compile_LOOP(Instruction instr) {
 
     looping = true;
 
-    int offset = offsetof(decltype(g_state.vs.uniforms), i) + (instr.flow_control.int_uniform_id * sizeof(Math::Vec4<u8>));
-    MOV(32, R(LOOPCOUNT), MDisp(UNIFORMS, offset));
+    int offset = ShaderSetup::UniformOffset(RegisterType::IntUniform, instr.flow_control.int_uniform_id);
+    MOV(32, R(LOOPCOUNT), MDisp(SETUP, offset));
     MOV(32, R(LOOPCOUNT_REG), R(LOOPCOUNT));
     SHR(32, R(LOOPCOUNT_REG), Imm8(8));
     AND(32, R(LOOPCOUNT_REG), Imm32(0xff)); // Y-component is the start
@@ -826,8 +826,8 @@ void JitShader::Compile() {
     // The stack pointer is 8 modulo 16 at the entry of a procedure
     ABI_PushRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8);
 
-    MOV(PTRBITS, R(REGISTERS), R(ABI_PARAM1));
-    MOV(PTRBITS, R(UNIFORMS), ImmPtr(&g_state.vs.uniforms));
+    MOV(PTRBITS, R(SETUP), R(ABI_PARAM1));
+    MOV(PTRBITS, R(STATE), R(ABI_PARAM2));
 
     // Zero address/loop  registers
     XOR(64, R(ADDROFFS_REG_0), R(ADDROFFS_REG_0));
@@ -845,7 +845,7 @@ void JitShader::Compile() {
     MOVAPS(NEGBIT, MatR(RAX));
 
     // Jump to start of the shader program
-    JMPptr(R(ABI_PARAM2));
+    JMPptr(R(ABI_PARAM3));
 
     // Compile entire program
     Compile_Block(static_cast<unsigned>(g_state.vs.program_code.size()));
diff --git a/src/video_core/shader/shader_jit_x64.h b/src/video_core/shader/shader_jit_x64.h
index 30aa7ff30..5468459d4 100644
--- a/src/video_core/shader/shader_jit_x64.h
+++ b/src/video_core/shader/shader_jit_x64.h
@@ -36,8 +36,8 @@ class JitShader : public Gen::XCodeBlock {
 public:
     JitShader();
 
-    void Run(void* registers, unsigned offset) const {
-        program(registers, code_ptr[offset]);
+    void Run(const ShaderSetup& setup, UnitState<false>& state, unsigned offset) const {
+        program(&setup, &state, code_ptr[offset]); // Argument order must match CompiledShader: setup, state, start address
     }
 
     void Compile();
@@ -117,7 +117,7 @@ private:
     /// Branches that need to be fixed up once the entire shader program is compiled
     std::vector<std::pair<Gen::FixupBranch, unsigned>> fixup_branches;
 
-    using CompiledShader = void(void* registers, const u8* start_addr);
+    using CompiledShader = void(const void* setup, void* state, const u8* start_addr);
     CompiledShader* program = nullptr;
 };
 
diff --git a/src/video_core/vertex_loader.cpp b/src/video_core/vertex_loader.cpp
index 21ae52949..e40f0f1ee 100644
--- a/src/video_core/vertex_loader.cpp
+++ b/src/video_core/vertex_loader.cpp
@@ -2,8 +2,8 @@
 
 #include <boost/range/algorithm/fill.hpp>
 
-#include "common/assert.h"
 #include "common/alignment.h"
+#include "common/assert.h"
 #include "common/bit_field.h"
 #include "common/common_types.h"
 #include "common/logging/log.h"
@@ -21,6 +21,8 @@
 namespace Pica {
 
 void VertexLoader::Setup(const Pica::Regs& regs) {
+    ASSERT_MSG(!is_setup, "VertexLoader is not intended to be setup more than once.");
+
     const auto& attribute_config = regs.vertex_attributes;
     num_total_attributes = attribute_config.GetNumTotalAttributes();
 
@@ -60,9 +62,13 @@ void VertexLoader::Setup(const Pica::Regs& regs) {
             }
         }
     }
+
+    is_setup = true;
 }
 
 void VertexLoader::LoadVertex(u32 base_address, int index, int vertex, Shader::InputVertex& input, DebugUtils::MemoryAccessTracker& memory_accesses) {
+    ASSERT_MSG(is_setup, "A VertexLoader needs to be setup before loading vertices.");
+
     for (int i = 0; i < num_total_attributes; ++i) {
         if (vertex_attribute_elements[i] != 0) {
             // Load per-vertex data from the loader arrays
@@ -124,7 +130,7 @@ void VertexLoader::LoadVertex(u32 base_address, int index, int vertex, Shader::I
                 input.attr[i][0].ToFloat32(), input.attr[i][1].ToFloat32(), input.attr[i][2].ToFloat32(), input.attr[i][3].ToFloat32());
         } else if (vertex_attribute_is_default[i]) {
             // Load the default attribute if we're configured to do so
-            input.attr[i] = g_state.vs.default_attributes[i];
+            input.attr[i] = g_state.vs_default_attributes[i];
             LOG_TRACE(HW_GPU, "Loaded default attribute %x for vertex %x (index %x): (%f, %f, %f, %f)",
                 i, vertex, index,
                 input.attr[i][0].ToFloat32(), input.attr[i][1].ToFloat32(),
diff --git a/src/video_core/vertex_loader.h b/src/video_core/vertex_loader.h
index becf5a403..ac162c254 100644
--- a/src/video_core/vertex_loader.h
+++ b/src/video_core/vertex_loader.h
@@ -1,7 +1,8 @@
 #pragma once
 
-#include "common/common_types.h"
+#include <array>
 
+#include "common/common_types.h"
 #include "video_core/pica.h"
 
 namespace Pica {
@@ -11,23 +12,29 @@ class MemoryAccessTracker;
 }
 
 namespace Shader {
-class InputVertex;
+struct InputVertex;
 }
 
 class VertexLoader {
 public:
+    VertexLoader() = default;
+    explicit VertexLoader(const Pica::Regs& regs) {
+        Setup(regs);
+    }
+
     void Setup(const Pica::Regs& regs);
     void LoadVertex(u32 base_address, int index, int vertex, Shader::InputVertex& input, DebugUtils::MemoryAccessTracker& memory_accesses);
 
     int GetNumTotalAttributes() const { return num_total_attributes; }
 
 private:
-    u32 vertex_attribute_sources[16];
-    u32 vertex_attribute_strides[16] = {};
-    Regs::VertexAttributeFormat vertex_attribute_formats[16] = {};
-    u32 vertex_attribute_elements[16] = {};
-    bool vertex_attribute_is_default[16];
-    int num_total_attributes;
+    std::array<u32, 16> vertex_attribute_sources{}; // Every array is value-initialized so no entry is ever indeterminate
+    std::array<u32, 16> vertex_attribute_strides{};
+    std::array<Regs::VertexAttributeFormat, 16> vertex_attribute_formats{};
+    std::array<u32, 16> vertex_attribute_elements{};
+    std::array<bool, 16> vertex_attribute_is_default{};
+    int num_total_attributes = 0;
+    bool is_setup = false; // Set at the end of Setup(); asserted on in Setup() and LoadVertex()
 };
 
 }  // namespace Pica