31 files changed, 1401 insertions, 352 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 0961a3251..82f47d8a9 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -1,6 +1,7 @@
 set(SRCS
             command_processor.cpp
             debug_utils/debug_utils.cpp
+            geometry_pipeline.cpp
             pica.cpp
             primitive_assembly.cpp
             regs.cpp
@@ -15,6 +16,7 @@ set(SRCS
             shader/shader_interpreter.cpp
             swrasterizer/clipper.cpp
             swrasterizer/framebuffer.cpp
+            swrasterizer/lighting.cpp
             swrasterizer/proctex.cpp
             swrasterizer/rasterizer.cpp
             swrasterizer/swrasterizer.cpp
@@ -28,6 +30,7 @@ set(SRCS
 set(HEADERS
             command_processor.h
             debug_utils/debug_utils.h
+            geometry_pipeline.h
             gpu_debugger.h
             pica.h
             pica_state.h
@@ -55,6 +58,7 @@ set(HEADERS
             shader/shader_interpreter.h
             swrasterizer/clipper.h
             swrasterizer/framebuffer.h
+            swrasterizer/lighting.h
             swrasterizer/proctex.h
             swrasterizer/rasterizer.h
             swrasterizer/swrasterizer.h
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index 4633a1df1..caf9f7a06 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -119,24 +119,221 @@ static void WriteUniformFloatReg(ShaderRegs& config, Shader::ShaderSetup& setup,
     }
 }
 
-static void WriteProgramCode(ShaderRegs& config, Shader::ShaderSetup& setup,
-                             unsigned max_program_code_length, u32 value) {
-    if (config.program.offset >= max_program_code_length) {
-        LOG_ERROR(HW_GPU, "Invalid %s program offset %d", GetShaderSetupTypeName(setup),
-                  (int)config.program.offset);
-    } else {
-        setup.program_code[config.program.offset] = value;
-        config.program.offset++;
+// Handles a write to one of the vs_default_attributes_setup.set_value registers. Every third
+// word completes a packed vec4, which becomes a default attribute or an immediate-mode input.
+static void LoadDefaultVertexAttributes(u32 register_value) {
+    auto& regs = g_state.regs;
+
+    // TODO: Does actual hardware indeed keep an intermediate buffer or does
+    //       it directly write the values?
+    default_attr_write_buffer[default_attr_counter++] = register_value;
+
+    // Default attributes are packed: four float24 values are encoded in three 32-bit words.
+    // We write to internal memory once a full such vector is written.
+    if (default_attr_counter >= 3) {
+        default_attr_counter = 0;
+
+        auto& setup = regs.pipeline.vs_default_attributes_setup;
+
+        if (setup.index >= 16) {
+            LOG_ERROR(HW_GPU, "Invalid VS default attribute index %d", (int)setup.index);
+            return;
+        }
+
+        Math::Vec4<float24> attribute;
+
+        // NOTE: The destination component order indeed is "backwards"
+        attribute.w = float24::FromRaw(default_attr_write_buffer[0] >> 8);
+        attribute.z = float24::FromRaw(((default_attr_write_buffer[0] & 0xFF) << 16) |
+                                       ((default_attr_write_buffer[1] >> 16) & 0xFFFF));
+        attribute.y = float24::FromRaw(((default_attr_write_buffer[1] & 0xFFFF) << 8) |
+                                       ((default_attr_write_buffer[2] >> 24) & 0xFF));
+        attribute.x = float24::FromRaw(default_attr_write_buffer[2] & 0xFFFFFF);
+
+        LOG_TRACE(HW_GPU, "Set default VS attribute %x to (%f %f %f %f)", (int)setup.index,
+                  attribute.x.ToFloat32(), attribute.y.ToFloat32(), attribute.z.ToFloat32(),
+                  attribute.w.ToFloat32());
+
+        // TODO: Verify that this actually modifies the register!
+        if (setup.index < 15) {
+            g_state.input_default_attributes.attr[setup.index] = attribute;
+            setup.index++;
+        } else {
+            // setup.index == 15 means immediate mode: buffer each attribute and, once all
+            // specified ones are present, run the Vertex Shader and feed the geometry pipeline.
+
+            auto& immediate_input = g_state.immediate.input_vertex;
+            auto& immediate_attribute_id = g_state.immediate.current_attribute;
+
+            immediate_input.attr[immediate_attribute_id] = attribute;
+
+            if (immediate_attribute_id < regs.pipeline.max_input_attrib_index) {
+                immediate_attribute_id += 1;
+            } else {
+                MICROPROFILE_SCOPE(GPU_Drawing);
+                immediate_attribute_id = 0;
+
+                auto* shader_engine = Shader::GetEngine();
+                shader_engine->SetupBatch(g_state.vs, regs.vs.main_offset);
+
+                // Send to vertex shader
+                if (g_debug_context)
+                    g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation,
+                                             static_cast<void*>(&immediate_input));
+                Shader::UnitState shader_unit;
+                Shader::AttributeBuffer output{};
+
+                shader_unit.LoadInput(regs.vs, immediate_input);
+                shader_engine->Run(g_state.vs, shader_unit);
+                shader_unit.WriteOutput(regs.vs, output);
+
+                // Send to geometry pipeline
+                if (g_state.immediate.reset_geometry_pipeline) {
+                    g_state.geometry_pipeline.Reconfigure();
+                    g_state.immediate.reset_geometry_pipeline = false;
+                }
+                ASSERT(!g_state.geometry_pipeline.NeedIndexInput());
+                g_state.geometry_pipeline.Setup(shader_engine);
+                g_state.geometry_pipeline.SubmitVertex(output);
+
+                // TODO: If drawing after every immediate mode triangle kills performance,
+                // change it to flush triangles whenever a drawing config register changes
+                // See: https://github.com/citra-emu/citra/pull/2866#issuecomment-327011550
+                VideoCore::g_renderer->Rasterizer()->DrawTriangles();
+                if (g_debug_context) {
+                    g_debug_context->OnEvent(DebugContext::Event::FinishedPrimitiveBatch, nullptr);
+                }
+            }
+        }
     }
 }
 
-static void WriteSwizzlePatterns(ShaderRegs& config, Shader::ShaderSetup& setup, u32 value) {
-    if (config.swizzle_patterns.offset >= setup.swizzle_data.size()) {
-        LOG_ERROR(HW_GPU, "Invalid %s swizzle pattern offset %d", GetShaderSetupTypeName(setup),
-                  (int)config.swizzle_patterns.offset);
-    } else {
-        setup.swizzle_data[config.swizzle_patterns.offset] = value;
-        config.swizzle_patterns.offset++;
+// Handles a write to trigger_draw / trigger_draw_indexed (command_id tells which): shades
+// regs.pipeline.num_vertices vertices (or forwards the raw indices if the geometry pipeline
+// needs them), submits them to the geometry pipeline and calls Rasterizer()->DrawTriangles().
+static void Draw(u32 command_id) {
+    MICROPROFILE_SCOPE(GPU_Drawing);
+    auto& regs = g_state.regs;
+
+#if PICA_LOG_TEV
+    DebugUtils::DumpTevStageConfig(regs.GetTevStages());
+#endif
+    if (g_debug_context)
+        g_debug_context->OnEvent(DebugContext::Event::IncomingPrimitiveBatch, nullptr);
+
+    // Work out from the vertex attribute registers how each vertex is loaded.
+    // Later, these can be compiled and cached.
+    const u32 base_address = regs.pipeline.vertex_attributes.GetPhysicalBaseAddress();
+    VertexLoader loader(regs.pipeline);
+
+    // Only trigger_draw_indexed fetches vertex ids through the index array
+    const bool is_indexed = (command_id == PICA_REG_INDEX(pipeline.trigger_draw_indexed));
+    const auto& index_info = regs.pipeline.index_array;
+    const u8* index_address_8 = Memory::GetPhysicalPointer(base_address + index_info.offset);
+    const u16* index_address_16 = reinterpret_cast<const u16*>(index_address_8);
+    const bool index_u16 = index_info.format != 0;
+
+    if (g_debug_context && g_debug_context->recorder) {
+        for (int i = 0; i < 3; ++i) {
+            const auto texture = regs.texturing.GetTextures()[i];
+            if (!texture.enabled)
+                continue;
+
+            u8* texture_data = Memory::GetPhysicalPointer(texture.config.GetPhysicalAddress());
+            g_debug_context->recorder->MemoryAccessed(
+                texture_data, Pica::TexturingRegs::NibblesPerPixel(texture.format) *
+                                  texture.config.width / 2 * texture.config.height,
+                texture.config.GetPhysicalAddress());
+        }
+    }
+
+    DebugUtils::MemoryAccessTracker memory_accesses;
+
+    // Simple circular-replacement vertex cache, sized to balance hit-rate against lookup cost.
+    // Ids are 32-bit so the empty-slot sentinel (-1) can never equal a real 8/16-bit index.
+    const size_t VERTEX_CACHE_SIZE = 32;
+    std::array<u32, VERTEX_CACHE_SIZE> vertex_cache_ids;
+    std::array<Shader::AttributeBuffer, VERTEX_CACHE_SIZE> vertex_cache;
+    Shader::AttributeBuffer vs_output;
+
+    unsigned int vertex_cache_pos = 0;
+    vertex_cache_ids.fill(static_cast<u32>(-1));
+
+    auto* shader_engine = Shader::GetEngine();
+    Shader::UnitState shader_unit;
+    shader_engine->SetupBatch(g_state.vs, regs.vs.main_offset);
+
+    g_state.geometry_pipeline.Reconfigure();
+    g_state.geometry_pipeline.Setup(shader_engine);
+    ASSERT(!g_state.geometry_pipeline.NeedIndexInput() || is_indexed);
+
+    for (unsigned int index = 0; index < regs.pipeline.num_vertices; ++index) {
+        // Indexed rendering doesn't use the start offset
+        unsigned int vertex = is_indexed
+                                  ? (index_u16 ? index_address_16[index] : index_address_8[index])
+                                  : (index + regs.pipeline.vertex_offset);
+
+        // -1 is a common special value used for primitive restart. Since it's unknown if
+        // the PICA supports it, and it would mess up the caching, guard against it here.
+        ASSERT(vertex != -1);
+
+        bool vertex_cache_hit = false;
+
+        if (is_indexed) {
+            if (g_state.geometry_pipeline.NeedIndexInput()) {
+                g_state.geometry_pipeline.SubmitIndex(vertex);
+                continue;
+            }
+
+            if (g_debug_context && Pica::g_debug_context->recorder) {
+                int size = index_u16 ? 2 : 1;
+                memory_accesses.AddAccess(base_address + index_info.offset + size * index, size);
+            }
+
+            for (unsigned int i = 0; i < VERTEX_CACHE_SIZE; ++i) {
+                if (vertex == vertex_cache_ids[i]) {
+                    vs_output = vertex_cache[i];
+                    vertex_cache_hit = true;
+                    break;
+                }
+            }
+        }
+
+        if (!vertex_cache_hit) {
+            // Initialize data for the current vertex
+            Shader::AttributeBuffer input;
+            loader.LoadVertex(base_address, index, vertex, input, memory_accesses);
+
+            // Send to vertex shader
+            if (g_debug_context)
+                g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation,
+                                         static_cast<void*>(&input));
+            shader_unit.LoadInput(regs.vs, input);
+            shader_engine->Run(g_state.vs, shader_unit);
+            shader_unit.WriteOutput(regs.vs, vs_output);
+
+            if (is_indexed) {
+                vertex_cache[vertex_cache_pos] = vs_output;
+                vertex_cache_ids[vertex_cache_pos] = vertex;
+                vertex_cache_pos = (vertex_cache_pos + 1) % VERTEX_CACHE_SIZE;
+            }
+        }
+
+        // Send to geometry pipeline
+        g_state.geometry_pipeline.SubmitVertex(vs_output);
+    }
+
+    // Forward the tracked accesses to the recorder, which (like the debug context) may be absent
+    if (g_debug_context && g_debug_context->recorder) {
+        for (auto& range : memory_accesses.ranges) {
+            g_debug_context->recorder->MemoryAccessed(Memory::GetPhysicalPointer(range.first),
+                                                      range.second, range.first);
+        }
+    }
+
+    VideoCore::g_renderer->Rasterizer()->DrawTriangles();
+    if (g_debug_context) {
+        g_debug_context->OnEvent(DebugContext::Event::FinishedPrimitiveBatch, nullptr);
     }
 }
 
@@ -182,106 +379,19 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
 
     case PICA_REG_INDEX(pipeline.vs_default_attributes_setup.index):
         g_state.immediate.current_attribute = 0;
+        g_state.immediate.reset_geometry_pipeline = true;
         default_attr_counter = 0;
         break;
 
     // Load default vertex input attributes
     case PICA_REG_INDEX_WORKAROUND(pipeline.vs_default_attributes_setup.set_value[0], 0x233):
     case PICA_REG_INDEX_WORKAROUND(pipeline.vs_default_attributes_setup.set_value[1], 0x234):
-    case PICA_REG_INDEX_WORKAROUND(pipeline.vs_default_attributes_setup.set_value[2], 0x235): {
-        // TODO: Does actual hardware indeed keep an intermediate buffer or does
-        //       it directly write the values?
-        default_attr_write_buffer[default_attr_counter++] = value;
-
-        // Default attributes are written in a packed format such that four float24 values are
-        // encoded in
-        // three 32-bit numbers. We write to internal memory once a full such vector is
-        // written.
-        if (default_attr_counter >= 3) {
-            default_attr_counter = 0;
-
-            auto& setup = regs.pipeline.vs_default_attributes_setup;
-
-            if (setup.index >= 16) {
-                LOG_ERROR(HW_GPU, "Invalid VS default attribute index %d", (int)setup.index);
-                break;
-            }
-
-            Math::Vec4<float24> attribute;
-
-            // NOTE: The destination component order indeed is "backwards"
-            attribute.w = float24::FromRaw(default_attr_write_buffer[0] >> 8);
-            attribute.z = float24::FromRaw(((default_attr_write_buffer[0] & 0xFF) << 16) |
-                                           ((default_attr_write_buffer[1] >> 16) & 0xFFFF));
-            attribute.y = float24::FromRaw(((default_attr_write_buffer[1] & 0xFFFF) << 8) |
-                                           ((default_attr_write_buffer[2] >> 24) & 0xFF));
-            attribute.x = float24::FromRaw(default_attr_write_buffer[2] & 0xFFFFFF);
-
-            LOG_TRACE(HW_GPU, "Set default VS attribute %x to (%f %f %f %f)", (int)setup.index,
-                      attribute.x.ToFloat32(), attribute.y.ToFloat32(), attribute.z.ToFloat32(),
-                      attribute.w.ToFloat32());
-
-            // TODO: Verify that this actually modifies the register!
-            if (setup.index < 15) {
-                g_state.input_default_attributes.attr[setup.index] = attribute;
-                setup.index++;
-            } else {
-                // Put each attribute into an immediate input buffer.  When all specified immediate
-                // attributes are present, the Vertex Shader is invoked and everything is sent to
-                // the primitive assembler.
-
-                auto& immediate_input = g_state.immediate.input_vertex;
-                auto& immediate_attribute_id = g_state.immediate.current_attribute;
-
-                immediate_input.attr[immediate_attribute_id] = attribute;
-
-                if (immediate_attribute_id < regs.pipeline.max_input_attrib_index) {
-                    immediate_attribute_id += 1;
-                } else {
-                    MICROPROFILE_SCOPE(GPU_Drawing);
-                    immediate_attribute_id = 0;
-
-                    auto* shader_engine = Shader::GetEngine();
-                    shader_engine->SetupBatch(g_state.vs, regs.vs.main_offset);
-
-                    // Send to vertex shader
-                    if (g_debug_context)
-                        g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation,
-                                                 static_cast<void*>(&immediate_input));
-                    Shader::UnitState shader_unit;
-                    Shader::AttributeBuffer output{};
-
-                    shader_unit.LoadInput(regs.vs, immediate_input);
-                    shader_engine->Run(g_state.vs, shader_unit);
-                    shader_unit.WriteOutput(regs.vs, output);
-
-                    // Send to renderer
-                    using Pica::Shader::OutputVertex;
-                    auto AddTriangle = [](const OutputVertex& v0, const OutputVertex& v1,
-                                          const OutputVertex& v2) {
-                        VideoCore::g_renderer->Rasterizer()->AddTriangle(v0, v1, v2);
-                    };
-
-                    g_state.primitive_assembler.SubmitVertex(
-                        Shader::OutputVertex::FromAttributeBuffer(regs.rasterizer, output),
-                        AddTriangle);
-                }
-            }
-        }
+    case PICA_REG_INDEX_WORKAROUND(pipeline.vs_default_attributes_setup.set_value[2], 0x235):
+        LoadDefaultVertexAttributes(value);
         break;
-    }
 
     case PICA_REG_INDEX(pipeline.gpu_mode):
-        if (regs.pipeline.gpu_mode == PipelineRegs::GPUMode::Configuring) {
-            MICROPROFILE_SCOPE(GPU_Drawing);
-
-            // Draw immediate mode triangles when GPU Mode is set to GPUMode::Configuring
-            VideoCore::g_renderer->Rasterizer()->DrawTriangles();
-
-            if (g_debug_context) {
-                g_debug_context->OnEvent(DebugContext::Event::FinishedPrimitiveBatch, nullptr);
-            }
-        }
+        // This register likely just enables vertex processing and doesn't need any special handling
         break;
 
     case PICA_REG_INDEX_WORKAROUND(pipeline.command_buffer.trigger[0], 0x23c):
@@ -297,130 +407,9 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
 
     // It seems like these trigger vertex rendering
     case PICA_REG_INDEX(pipeline.trigger_draw):
-    case PICA_REG_INDEX(pipeline.trigger_draw_indexed): {
-        MICROPROFILE_SCOPE(GPU_Drawing);
-
-#if PICA_LOG_TEV
-        DebugUtils::DumpTevStageConfig(regs.GetTevStages());
-#endif
-        if (g_debug_context)
-            g_debug_context->OnEvent(DebugContext::Event::IncomingPrimitiveBatch, nullptr);
-
-        // Processes information about internal vertex attributes to figure out how a vertex is
-        // loaded.
-        // Later, these can be compiled and cached.
-        const u32 base_address = regs.pipeline.vertex_attributes.GetPhysicalBaseAddress();
-        VertexLoader loader(regs.pipeline);
-
-        // Load vertices
-        bool is_indexed = (id == PICA_REG_INDEX(pipeline.trigger_draw_indexed));
-
-        const auto& index_info = regs.pipeline.index_array;
-        const u8* index_address_8 = Memory::GetPhysicalPointer(base_address + index_info.offset);
-        const u16* index_address_16 = reinterpret_cast<const u16*>(index_address_8);
-        bool index_u16 = index_info.format != 0;
-
-        PrimitiveAssembler<Shader::OutputVertex>& primitive_assembler = g_state.primitive_assembler;
-
-        if (g_debug_context && g_debug_context->recorder) {
-            for (int i = 0; i < 3; ++i) {
-                const auto texture = regs.texturing.GetTextures()[i];
-                if (!texture.enabled)
-                    continue;
-
-                u8* texture_data = Memory::GetPhysicalPointer(texture.config.GetPhysicalAddress());
-                g_debug_context->recorder->MemoryAccessed(
-                    texture_data, Pica::TexturingRegs::NibblesPerPixel(texture.format) *
-                                      texture.config.width / 2 * texture.config.height,
-                    texture.config.GetPhysicalAddress());
-            }
-        }
-
-        DebugUtils::MemoryAccessTracker memory_accesses;
-
-        // Simple circular-replacement vertex cache
-        // The size has been tuned for optimal balance between hit-rate and the cost of lookup
-        const size_t VERTEX_CACHE_SIZE = 32;
-        std::array<u16, VERTEX_CACHE_SIZE> vertex_cache_ids;
-        std::array<Shader::OutputVertex, VERTEX_CACHE_SIZE> vertex_cache;
-        Shader::OutputVertex output_vertex;
-
-        unsigned int vertex_cache_pos = 0;
-        vertex_cache_ids.fill(-1);
-
-        auto* shader_engine = Shader::GetEngine();
-        Shader::UnitState shader_unit;
-
-        shader_engine->SetupBatch(g_state.vs, regs.vs.main_offset);
-
-        for (unsigned int index = 0; index < regs.pipeline.num_vertices; ++index) {
-            // Indexed rendering doesn't use the start offset
-            unsigned int vertex =
-                is_indexed ? (index_u16 ? index_address_16[index] : index_address_8[index])
-                           : (index + regs.pipeline.vertex_offset);
-
-            // -1 is a common special value used for primitive restart. Since it's unknown if
-            // the PICA supports it, and it would mess up the caching, guard against it here.
-            ASSERT(vertex != -1);
-
-            bool vertex_cache_hit = false;
-
-            if (is_indexed) {
-                if (g_debug_context && Pica::g_debug_context->recorder) {
-                    int size = index_u16 ? 2 : 1;
-                    memory_accesses.AddAccess(base_address + index_info.offset + size * index,
-                                              size);
-                }
-
-                for (unsigned int i = 0; i < VERTEX_CACHE_SIZE; ++i) {
-                    if (vertex == vertex_cache_ids[i]) {
-                        output_vertex = vertex_cache[i];
-                        vertex_cache_hit = true;
-                        break;
-                    }
-                }
-            }
-
-            if (!vertex_cache_hit) {
-                // Initialize data for the current vertex
-                Shader::AttributeBuffer input, output{};
-                loader.LoadVertex(base_address, index, vertex, input, memory_accesses);
-
-                // Send to vertex shader
-                if (g_debug_context)
-                    g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation,
-                                             (void*)&input);
-                shader_unit.LoadInput(regs.vs, input);
-                shader_engine->Run(g_state.vs, shader_unit);
-                shader_unit.WriteOutput(regs.vs, output);
-
-                // Retrieve vertex from register data
-                output_vertex = Shader::OutputVertex::FromAttributeBuffer(regs.rasterizer, output);
-
-                if (is_indexed) {
-                    vertex_cache[vertex_cache_pos] = output_vertex;
-                    vertex_cache_ids[vertex_cache_pos] = vertex;
-                    vertex_cache_pos = (vertex_cache_pos + 1) % VERTEX_CACHE_SIZE;
-                }
-            }
-
-            // Send to renderer
-            using Pica::Shader::OutputVertex;
-            auto AddTriangle = [](const OutputVertex& v0, const OutputVertex& v1,
-                                  const OutputVertex& v2) {
-                VideoCore::g_renderer->Rasterizer()->AddTriangle(v0, v1, v2);
-            };
-
-            primitive_assembler.SubmitVertex(output_vertex, AddTriangle);
-        }
-
-        for (auto& range : memory_accesses.ranges) {
-            g_debug_context->recorder->MemoryAccessed(Memory::GetPhysicalPointer(range.first),
-                                                      range.second, range.first);
-        }
-
+    case PICA_REG_INDEX(pipeline.trigger_draw_indexed):
+        Draw(id);
         break;
-    }
 
     case PICA_REG_INDEX(gs.bool_uniforms):
         WriteUniformBoolReg(g_state.gs, g_state.regs.gs.bool_uniforms.Value());
@@ -458,7 +447,13 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
     case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[5], 0x2a1):
     case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[6], 0x2a2):
     case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[7], 0x2a3): {
-        WriteProgramCode(g_state.regs.gs, g_state.gs, 4096, value);
+        u32& offset = g_state.regs.gs.program.offset;
+        if (offset >= 4096) {
+            LOG_ERROR(HW_GPU, "Invalid GS program offset %u", offset);
+        } else {
+            g_state.gs.program_code[offset] = value;
+            offset++;
+        }
         break;
     }
 
@@ -470,11 +465,18 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
     case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[5], 0x2ab):
     case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[6], 0x2ac):
     case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[7], 0x2ad): {
-        WriteSwizzlePatterns(g_state.regs.gs, g_state.gs, value);
+        u32& offset = g_state.regs.gs.swizzle_patterns.offset;
+        if (offset >= g_state.gs.swizzle_data.size()) {
+            LOG_ERROR(HW_GPU, "Invalid GS swizzle pattern offset %u", offset);
+        } else {
+            g_state.gs.swizzle_data[offset] = value;
+            offset++;
+        }
         break;
     }
 
     case PICA_REG_INDEX(vs.bool_uniforms):
+        // TODO (wwylele): does regs.pipeline.gs_unit_exclusive_configuration affect this?
         WriteUniformBoolReg(g_state.vs, g_state.regs.vs.bool_uniforms.Value());
         break;
 
@@ -482,6 +484,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
     case PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[1], 0x2b2):
     case PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[2], 0x2b3):
     case PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[3], 0x2b4): {
+        // TODO (wwylele): does regs.pipeline.gs_unit_exclusive_configuration affect this?
         unsigned index = (id - PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[0], 0x2b1));
         auto values = regs.vs.int_uniforms[index];
         WriteUniformIntReg(g_state.vs, index,
@@ -497,6 +500,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
     case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[5], 0x2c6):
     case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[6], 0x2c7):
     case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[7], 0x2c8): {
+        // TODO (wwylele): does regs.pipeline.gs_unit_exclusive_configuration affect this?
         WriteUniformFloatReg(g_state.regs.vs, g_state.vs, vs_float_regs_counter,
                              vs_uniform_write_buffer, value);
         break;
@@ -510,7 +514,16 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
     case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[5], 0x2d1):
     case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[6], 0x2d2):
     case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[7], 0x2d3): {
-        WriteProgramCode(g_state.regs.vs, g_state.vs, 512, value);
+        u32& offset = g_state.regs.vs.program.offset;
+        if (offset >= 512) {
+            LOG_ERROR(HW_GPU, "Invalid VS program offset %u", offset);
+        } else {
+            g_state.vs.program_code[offset] = value;
+            if (!g_state.regs.pipeline.gs_unit_exclusive_configuration) {
+                g_state.gs.program_code[offset] = value;
+            }
+            offset++;
+        }
         break;
     }
 
@@ -522,7 +535,16 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
     case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[5], 0x2db):
     case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[6], 0x2dc):
     case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[7], 0x2dd): {
-        WriteSwizzlePatterns(g_state.regs.vs, g_state.vs, value);
+        u32& offset = g_state.regs.vs.swizzle_patterns.offset;
+        if (offset >= g_state.vs.swizzle_data.size()) {
+            LOG_ERROR(HW_GPU, "Invalid VS swizzle pattern offset %u", offset);
+        } else {
+            g_state.vs.swizzle_data[offset] = value;
+            if (!g_state.regs.pipeline.gs_unit_exclusive_configuration) {
+                g_state.gs.swizzle_data[offset] = value;
+            }
+            offset++;
+        }
         break;
     }
 
@@ -620,6 +642,6 @@ void ProcessCommandList(const u32* list, u32 size) {
     }
 }
 
-} // namespace
+} // namespace CommandProcessor
 
-} // namespace
+} // namespace Pica
diff --git a/src/video_core/geometry_pipeline.cpp b/src/video_core/geometry_pipeline.cpp
new file mode 100644
index 000000000..98ff2ccd3
--- /dev/null
+++ b/src/video_core/geometry_pipeline.cpp
@@ -0,0 +1,274 @@
+// Copyright 2017 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "video_core/geometry_pipeline.h"
+#include "video_core/pica_state.h"
+#include "video_core/regs.h"
+#include "video_core/renderer_base.h"
+#include "video_core/video_core.h"
+
+namespace Pica {
+
+/// Interface for buffering vertex shader outputs; one implementation exists per pipeline mode
+class GeometryPipelineBackend {
+public:
+    virtual ~GeometryPipelineBackend() = default;
+
+    /// Checks if there is no incomplete data transfer; GeometryPipeline::Reconfigure asserts this
+    virtual bool IsEmpty() const = 0;
+
+    /// Checks if the pipeline needs a direct input from the index buffer
+    virtual bool NeedIndexInput() const = 0;
+
+    /// Submits an index from the index buffer. Only valid while NeedIndexInput() returns true
+    virtual void SubmitIndex(unsigned int val) = 0;
+
+    /**
+     * Submits vertex attributes
+     * @param input attributes of a vertex output from the vertex shader
+     * @return true if the buffer is full and the geometry shader should be invoked
+     */
+    virtual bool SubmitVertex(const Shader::AttributeBuffer& input) = 0;
+};
+
+// Point mode feeds vertex shader outputs straight into the input registers of the geometry
+// shader unit. The vertex shader output count and the geometry shader input count are both
+// constant, and the geometry shader is invoked whenever the input buffer has been filled. For
+// example, a geometry shader taking 6 inputs fed by a vertex shader producing 2 attributes runs
+// once every 3 vertices.
+// TODO: what happens when the input size is not divisible by the output size?
+class GeometryPipeline_Point : public GeometryPipelineBackend {
+public:
+    GeometryPipeline_Point(const Regs& regs, Shader::GSUnitState& unit) : regs(regs), unit(unit) {
+        ASSERT(regs.pipeline.variable_primitive == 0);
+        ASSERT(regs.gs.input_to_uniform == 0);
+        vs_output_num = regs.pipeline.vs_outmap_total_minus_1_a + 1;
+        const size_t input_count = regs.gs.max_input_attribute_index + 1;
+        ASSERT(input_count % vs_output_num == 0);
+        buffer_end = attribute_buffer.attr + input_count;
+        buffer_cur = attribute_buffer.attr;
+    }
+
+    bool IsEmpty() const override {
+        return attribute_buffer.attr == buffer_cur;
+    }
+
+    bool NeedIndexInput() const override {
+        return false;
+    }
+
+    void SubmitIndex(unsigned int val) override {
+        UNREACHABLE();
+    }
+
+    bool SubmitVertex(const Shader::AttributeBuffer& input) override {
+        buffer_cur = std::copy(input.attr, input.attr + vs_output_num, buffer_cur);
+        if (buffer_cur != buffer_end)
+            return false;
+        // All inputs collected: rewind and hand them over to the geometry shader unit
+        buffer_cur = attribute_buffer.attr;
+        unit.LoadInput(regs.gs, attribute_buffer);
+        return true;
+    }
+
+private:
+    const Regs& regs;
+    Shader::GSUnitState& unit;
+    Shader::AttributeBuffer attribute_buffer;
+    Math::Vec4<float24>* buffer_cur;
+    Math::Vec4<float24>* buffer_end;
+    unsigned int vs_output_num;
+};
+
+// In VariablePrimitive mode, vertex attributes are buffered into the uniform registers of the
+// geometry shader unit. The number of vertices is variable and is specified by the first index
+// value in the batch. This mode is usually used for subdivision.
+class GeometryPipeline_VariablePrimitive : public GeometryPipelineBackend {
+public:
+    GeometryPipeline_VariablePrimitive(const Regs& regs, Shader::ShaderSetup& setup)
+        : regs(regs), setup(setup) {
+        ASSERT(regs.pipeline.variable_primitive == 1);
+        ASSERT(regs.gs.input_to_uniform == 1);
+        vs_output_num = regs.pipeline.vs_outmap_total_minus_1_a + 1;
+    }
+
+    bool IsEmpty() const override {
+        return need_index;
+    }
+
+    bool NeedIndexInput() const override {
+        return need_index;
+    }
+
+    void SubmitIndex(unsigned int val) override {
+        DEBUG_ASSERT(need_index);
+
+        // The vertex count is replicated into all four components of uniform register f[0]
+        float24 vertex_num = float24::FromFloat32(static_cast<float>(val));
+        setup.uniforms.f[0] = Math::MakeVec(vertex_num, vertex_num, vertex_num, vertex_num);
+
+        // The second uniform register and so on are used for receiving input vertices
+        buffer_cur = setup.uniforms.f + 1;
+
+        main_vertex_num = regs.pipeline.variable_vertex_main_num_minus_1 + 1;
+        total_vertex_num = val;
+        need_index = false;
+    }
+
+    bool SubmitVertex(const Shader::AttributeBuffer& input) override {
+        DEBUG_ASSERT(!need_index);
+        if (main_vertex_num != 0) {
+            // For main vertices, receive all attributes
+            buffer_cur = std::copy(input.attr, input.attr + vs_output_num, buffer_cur);
+            --main_vertex_num;
+        } else {
+            // For other vertices, only receive the first attribute (usually the position)
+            *(buffer_cur++) = input.attr[0];
+        }
+        --total_vertex_num;
+
+        if (total_vertex_num == 0) {
+            need_index = true;
+            return true;
+        }
+
+        return false;
+    }
+
+private:
+    bool need_index = true;
+    const Regs& regs;
+    Shader::ShaderSetup& setup;
+    unsigned int main_vertex_num = 0;
+    unsigned int total_vertex_num = 0;
+    Math::Vec4<float24>* buffer_cur = nullptr;
+    unsigned int vs_output_num = 0;
+};
+
+// In FixedPrimitive mode, vertex attributes are buffered into the uniform registers of the
+// geometry shader unit, starting at gs_config.start_index. The number of vertices per shader
+// invocation is constant. This is usually used for particle systems.
+class GeometryPipeline_FixedPrimitive : public GeometryPipelineBackend {
+public:
+    GeometryPipeline_FixedPrimitive(const Regs& regs, Shader::ShaderSetup& setup)
+        : regs(regs), setup(setup) {
+        ASSERT(regs.pipeline.variable_primitive == 0);
+        ASSERT(regs.gs.input_to_uniform == 1);
+        vs_output_num = regs.pipeline.vs_outmap_total_minus_1_a + 1;
+        ASSERT(vs_output_num == regs.pipeline.gs_config.stride_minus_1 + 1);
+        size_t vertex_num = regs.pipeline.gs_config.fixed_vertex_num_minus_1 + 1;
+        buffer_cur = buffer_begin = setup.uniforms.f + regs.pipeline.gs_config.start_index;
+        buffer_end = buffer_begin + vs_output_num * vertex_num;
+    }
+
+    bool IsEmpty() const override {
+        return buffer_cur == buffer_begin;
+    }
+
+    bool NeedIndexInput() const override {
+        return false;
+    }
+
+    void SubmitIndex(unsigned int val) override {
+        UNREACHABLE();
+    }
+
+    bool SubmitVertex(const Shader::AttributeBuffer& input) override {
+        buffer_cur = std::copy(input.attr, input.attr + vs_output_num, buffer_cur);
+        if (buffer_cur == buffer_end) {
+            buffer_cur = buffer_begin;
+            return true;
+        }
+        return false;
+    }
+
+private:
+    const Regs& regs;
+    Shader::ShaderSetup& setup;
+    Math::Vec4<float24>* buffer_begin;
+    Math::Vec4<float24>* buffer_cur;
+    Math::Vec4<float24>* buffer_end;
+    unsigned int vs_output_num;
+};
+
+GeometryPipeline::GeometryPipeline(State& state) : state(state) {}
+
+GeometryPipeline::~GeometryPipeline() = default;
+
+void GeometryPipeline::SetVertexHandler(Shader::VertexHandler vertex_handler) {
+    this->vertex_handler = vertex_handler;
+}
+
+// Stores the engine and sets up the GS batch; does nothing while no backend is configured
+void GeometryPipeline::Setup(Shader::ShaderEngine* shader_engine) {
+    if (backend) {
+        this->shader_engine = shader_engine;
+        shader_engine->SetupBatch(state.gs, state.regs.gs.main_offset);
+    }
+}
+
+void GeometryPipeline::Reconfigure() {
+    ASSERT(!backend || backend->IsEmpty());
+    // With the geometry shader disabled no backend is kept; SubmitVertex then bypasses this stage
+    if (state.regs.pipeline.use_gs == PipelineRegs::UseGS::No) {
+        backend = nullptr;
+        return;
+    }
+
+    ASSERT(state.regs.pipeline.use_gs == PipelineRegs::UseGS::Yes);
+
+    // The following assumes that when the geometry shader is in use, shader unit 3 is configured
+    // as a geometry shader unit.
+    // TODO: what happens if this is not true?
+    ASSERT(state.regs.pipeline.gs_unit_exclusive_configuration == 1);
+    ASSERT(state.regs.gs.shader_mode == ShaderRegs::ShaderMode::GS);
+
+    state.gs_unit.ConfigOutput(state.regs.gs);
+    // Both copies of the vertex shader output count are expected to agree
+    ASSERT(state.regs.pipeline.vs_outmap_total_minus_1_a ==
+           state.regs.pipeline.vs_outmap_total_minus_1_b);
+    // Instantiate the buffering backend that matches the configured geometry shader mode
+    switch (state.regs.pipeline.gs_config.mode) {
+    case PipelineRegs::GSMode::Point:
+        backend = std::make_unique<GeometryPipeline_Point>(state.regs, state.gs_unit);
+        break;
+    case PipelineRegs::GSMode::VariablePrimitive:
+        backend = std::make_unique<GeometryPipeline_VariablePrimitive>(state.regs, state.gs);
+        break;
+    case PipelineRegs::GSMode::FixedPrimitive:
+        backend = std::make_unique<GeometryPipeline_FixedPrimitive>(state.regs, state.gs);
+        break;
+    default:
+        UNREACHABLE();
+    }
+}
+
+// Asks the active backend whether it expects an index next; never the case without a backend
+bool GeometryPipeline::NeedIndexInput() const {
+    const bool has_backend = backend != nullptr;
+    return has_backend && backend->NeedIndexInput();
+}
+
+void GeometryPipeline::SubmitIndex(unsigned int val) {
+    backend->SubmitIndex(val);
+}
+
+void GeometryPipeline::SubmitVertex(const Shader::AttributeBuffer& input) {
+    if (!backend) {
+        // With the geometry shader disabled there is no backend; the vertex shader output goes
+        // straight to the primitive assembler.
+        vertex_handler(input);
+        return;
+    }
+    // The backend returns true once its buffer is full and the geometry shader should run
+    if (!backend->SubmitVertex(input))
+        return;
+    shader_engine->Run(state.gs, state.gs_unit);
+
+    // b15 is forced to true after each geometry shader invocation, so a program that cleared it
+    // beforehand can detect the first invocation of a batch.
+    state.gs.uniforms.b[15] = true;
+}
+
+} // namespace Pica
diff --git a/src/video_core/geometry_pipeline.h b/src/video_core/geometry_pipeline.h
new file mode 100644
index 000000000..91fdd3192
--- /dev/null
+++ b/src/video_core/geometry_pipeline.h
@@ -0,0 +1,49 @@
+// Copyright 2017 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <memory>
+#include "video_core/shader/shader.h"
+
+namespace Pica {
+
+struct State;
+
+class GeometryPipelineBackend;
+
+/// A pipeline receiving from vertex shader and sending to geometry shader and primitive assembler
+class GeometryPipeline {
+public:
+    explicit GeometryPipeline(State& state);
+    ~GeometryPipeline();
+
+    /// Sets the handler for receiving vertex outputs from vertex shader
+    void SetVertexHandler(Shader::VertexHandler vertex_handler);
+
+    /**
+     * Sets up the geometry shader unit if it is in use; otherwise the call does nothing
+     * @param shader_engine the shader engine for the geometry shader to run
+     */
+    void Setup(Shader::ShaderEngine* shader_engine);
+
+    /// Reconfigures the pipeline according to current register settings
+    void Reconfigure();
+
+    /// Checks if the pipeline needs a direct input from index buffer
+    bool NeedIndexInput() const;
+
+    /// Submits an index from index buffer. Call this only when NeedIndexInput returns true
+    void SubmitIndex(unsigned int val);
+
+    /// Submits vertex attributes output from vertex shader
+    void SubmitVertex(const Shader::AttributeBuffer& input);
+
+private:
+    Shader::VertexHandler vertex_handler;
+    Shader::ShaderEngine* shader_engine = nullptr; ///< Set by Setup() while a backend is active
+    std::unique_ptr<GeometryPipelineBackend> backend;
+    State& state;
+};
+} // namespace Pica
diff --git a/src/video_core/pica.cpp b/src/video_core/pica.cpp
index b95148a6a..218e06883 100644
--- a/src/video_core/pica.cpp
+++ b/src/video_core/pica.cpp
@@ -3,9 +3,11 @@
 // Refer to the license.txt file included.
 
 #include <cstring>
+#include "video_core/geometry_pipeline.h"
 #include "video_core/pica.h"
 #include "video_core/pica_state.h"
-#include "video_core/regs_pipeline.h"
+#include "video_core/renderer_base.h"
+#include "video_core/video_core.h"
 
 namespace Pica {
 
@@ -24,6 +26,23 @@ void Zero(T& o) {
     memset(&o, 0, sizeof(o));
 }
 
+/// Connects the geometry shader unit and geometry pipeline to this state's primitive assembler
+State::State() : geometry_pipeline(*this) {
+    auto SubmitVertex = [this](const Shader::AttributeBuffer& vertex) {
+        using Pica::Shader::OutputVertex;
+        auto AddTriangle = [](const OutputVertex& v0, const OutputVertex& v1,
+                              const OutputVertex& v2) {
+            VideoCore::g_renderer->Rasterizer()->AddTriangle(v0, v1, v2);
+        };
+        primitive_assembler.SubmitVertex(
+            Shader::OutputVertex::FromAttributeBuffer(regs.rasterizer, vertex), AddTriangle);
+    };
+    auto SetWinding = [this]() { primitive_assembler.SetWinding(); };
+    // Register on this instance's own members instead of reaching through the global g_state
+    gs_unit.SetVertexHandler(SubmitVertex, SetWinding);
+    geometry_pipeline.SetVertexHandler(SubmitVertex);
+}
+
 void State::Reset() {
     Zero(regs);
     Zero(vs);
diff --git a/src/video_core/pica_state.h b/src/video_core/pica_state.h
index 2d23d34e6..c6634a0bc 100644
--- a/src/video_core/pica_state.h
+++ b/src/video_core/pica_state.h
@@ -8,6 +8,7 @@
 #include "common/bit_field.h"
 #include "common/common_types.h"
 #include "common/vector_math.h"
+#include "video_core/geometry_pipeline.h"
 #include "video_core/primitive_assembly.h"
 #include "video_core/regs.h"
 #include "video_core/shader/shader.h"
@@ -16,6 +17,7 @@ namespace Pica {
 
 /// Struct used to describe current Pica state
 struct State {
+    State();
     void Reset();
 
     /// Pica registers
@@ -79,7 +81,7 @@ struct State {
         std::array<ColorDifferenceEntry, 256> color_diff_table;
     } proctex;
 
-    struct {
+    struct Lighting {
         union LutEntry {
             // Used for raw access
             u32 raw;
@@ -137,8 +139,17 @@ struct State {
         Shader::AttributeBuffer input_vertex;
         // Index of the next attribute to be loaded into `input_vertex`.
         u32 current_attribute = 0;
+        // Indicates the immediate mode just started and the geometry pipeline needs to reconfigure
+        bool reset_geometry_pipeline = true;
     } immediate;
 
+    // The geometry shader unit needs to be kept in the global state because some shaders rely on
+    // register values being preserved across shader invocations.
+    // TODO: also bring the three vertex shader units here and implement the shader scheduler.
+    Shader::GSUnitState gs_unit;
+
+    GeometryPipeline geometry_pipeline;
+
     // This is constructed with a dummy triangle topology
     PrimitiveAssembler<Shader::OutputVertex> primitive_assembler;
 };
diff --git a/src/video_core/pica_types.h b/src/video_core/pica_types.h
index 5d7e10066..2eafa7e9e 100644
--- a/src/video_core/pica_types.h
+++ b/src/video_core/pica_types.h
@@ -58,11 +58,12 @@ public:
     }
 
     Float<M, E> operator*(const Float<M, E>& flt) const {
-        if ((this->value == 0.f && !std::isnan(flt.value)) ||
-            (flt.value == 0.f && !std::isnan(this->value)))
-            // PICA gives 0 instead of NaN when multiplying by inf
-            return Zero();
-        return Float<M, E>::FromFloat32(ToFloat32() * flt.ToFloat32());
+        const float rhs = flt.ToFloat32();
+        float result = value * rhs;
+        // PICA gives 0 instead of NaN when multiplying by inf; NaN operands still propagate
+        if (std::isnan(result) && !std::isnan(value) && !std::isnan(rhs))
+            result = 0.f;
+        return Float<M, E>::FromFloat32(result);
     }
 
     Float<M, E> operator/(const Float<M, E>& flt) const {
@@ -78,12 +79,7 @@ public:
     }
 
     Float<M, E>& operator*=(const Float<M, E>& flt) {
-        if ((this->value == 0.f && !std::isnan(flt.value)) ||
-            (flt.value == 0.f && !std::isnan(this->value)))
-            // PICA gives 0 instead of NaN when multiplying by inf
-            *this = Zero();
-        else
-            value *= flt.ToFloat32();
+        value = operator*(flt).value;
         return *this;
     }
 
diff --git a/src/video_core/primitive_assembly.cpp b/src/video_core/primitive_assembly.cpp
index acd2ac5e2..9c3dd4cab 100644
--- a/src/video_core/primitive_assembly.cpp
+++ b/src/video_core/primitive_assembly.cpp
@@ -17,15 +17,18 @@ template <typename VertexType>
 void PrimitiveAssembler<VertexType>::SubmitVertex(const VertexType& vtx,
                                                   TriangleHandler triangle_handler) {
     switch (topology) {
-    // TODO: Figure out what's different with TriangleTopology::Shader.
     case PipelineRegs::TriangleTopology::List:
     case PipelineRegs::TriangleTopology::Shader:
         if (buffer_index < 2) {
             buffer[buffer_index++] = vtx;
         } else {
             buffer_index = 0;
-
-            triangle_handler(buffer[0], buffer[1], vtx);
+            if (topology == PipelineRegs::TriangleTopology::Shader && winding) {
+                triangle_handler(buffer[1], buffer[0], vtx);
+                winding = false;
+            } else {
+                triangle_handler(buffer[0], buffer[1], vtx);
+            }
         }
         break;
 
@@ -51,9 +54,15 @@ void PrimitiveAssembler<VertexType>::SubmitVertex(const VertexType& vtx,
 }
 
 template <typename VertexType>
+void PrimitiveAssembler<VertexType>::SetWinding() {
+    winding = true;
+}
+
+template <typename VertexType>
 void PrimitiveAssembler<VertexType>::Reset() {
     buffer_index = 0;
     strip_ready = false;
+    winding = false;
 }
 
 template <typename VertexType>
diff --git a/src/video_core/primitive_assembly.h b/src/video_core/primitive_assembly.h
index e8eccdf27..12de8e3b9 100644
--- a/src/video_core/primitive_assembly.h
+++ b/src/video_core/primitive_assembly.h
@@ -30,6 +30,12 @@ struct PrimitiveAssembler {
     void SubmitVertex(const VertexType& vtx, TriangleHandler triangle_handler);
 
     /**
+     * Invert the vertex order of the next triangle. Called by geometry shader emitter.
+     * This only takes effect for TriangleTopology::Shader.
+     */
+    void SetWinding();
+
+    /**
      * Resets the internal state of the PrimitiveAssembler.
      */
     void Reset();
@@ -45,6 +51,7 @@ private:
     int buffer_index;
     VertexType buffer[2];
     bool strip_ready = false;
+    bool winding = false;
 };
 
 } // namespace
diff --git a/src/video_core/regs_framebuffer.h b/src/video_core/regs_framebuffer.h
index a50bd4111..7b565f911 100644
--- a/src/video_core/regs_framebuffer.h
+++ b/src/video_core/regs_framebuffer.h
@@ -256,10 +256,9 @@ struct FramebufferRegs {
             return 3;
         case DepthFormat::D24S8:
             return 4;
-        default:
-            LOG_CRITICAL(HW_GPU, "Unknown depth format %u", format);
-            UNIMPLEMENTED();
         }
+
+        ASSERT_MSG(false, "Unknown depth format %u", format);
     }
 
     // Returns the number of bits per depth component of the specified depth format
@@ -270,10 +269,9 @@ struct FramebufferRegs {
         case DepthFormat::D24:
         case DepthFormat::D24S8:
             return 24;
-        default:
-            LOG_CRITICAL(HW_GPU, "Unknown depth format %u", format);
-            UNIMPLEMENTED();
         }
+
+        ASSERT_MSG(false, "Unknown depth format %u", format);
     }
 
     INSERT_PADDING_WORDS(0x20);
diff --git a/src/video_core/regs_pipeline.h b/src/video_core/regs_pipeline.h
index 31c747d77..e78c3e331 100644
--- a/src/video_core/regs_pipeline.h
+++ b/src/video_core/regs_pipeline.h
@@ -147,7 +147,15 @@ struct PipelineRegs {
     // Number of vertices to render
     u32 num_vertices;
 
-    INSERT_PADDING_WORDS(0x1);
+    enum class UseGS : u32 {
+        No = 0,
+        Yes = 2,
+    };
+
+    union {
+        BitField<0, 2, UseGS> use_gs;
+        BitField<31, 1, u32> variable_primitive;
+    };
 
     // The index of the first vertex to render
     u32 vertex_offset;
@@ -202,7 +210,14 @@ struct PipelineRegs {
     /// Number of input attributes to the vertex shader minus 1
     BitField<0, 4, u32> max_input_attrib_index;
 
-    INSERT_PADDING_WORDS(2);
+    INSERT_PADDING_WORDS(1);
+
+    // The shader unit 3, which can be used for both vertex and geometry shader, gets its
+    // configuration depending on this register. If this is not set, unit 3 will share some
+    // configuration with other units. It is known that program code and swizzle pattern uploaded
+    // via regs.vs will be also uploaded to unit 3 if this is not set. Although very likely, it is
+    // still unclear whether uniforms and other configuration can be also shared.
+    BitField<0, 1, u32> gs_unit_exclusive_configuration;
 
     enum class GPUMode : u32 {
         Drawing = 0,
@@ -211,7 +226,29 @@ struct PipelineRegs {
 
     GPUMode gpu_mode;
 
-    INSERT_PADDING_WORDS(0x18);
+    INSERT_PADDING_WORDS(0x4);
+    BitField<0, 4, u32> vs_outmap_total_minus_1_a;
+    INSERT_PADDING_WORDS(0x6);
+    BitField<0, 4, u32> vs_outmap_total_minus_1_b;
+
+    enum class GSMode : u32 {
+        Point = 0,
+        VariablePrimitive = 1,
+        FixedPrimitive = 2,
+    };
+
+    union {
+        BitField<0, 8, GSMode> mode;
+        BitField<8, 4, u32> fixed_vertex_num_minus_1;
+        BitField<12, 4, u32> stride_minus_1;
+        BitField<16, 4, u32> start_index;
+    } gs_config;
+
+    INSERT_PADDING_WORDS(0x1);
+
+    u32 variable_vertex_main_num_minus_1;
+
+    INSERT_PADDING_WORDS(0x9);
 
     enum class TriangleTopology : u32 {
         List = 0,
diff --git a/src/video_core/regs_rasterizer.h b/src/video_core/regs_rasterizer.h
index 2874fd127..4fef00d76 100644
--- a/src/video_core/regs_rasterizer.h
+++ b/src/video_core/regs_rasterizer.h
@@ -5,10 +5,10 @@
 #pragma once
 
 #include <array>
-
 #include "common/bit_field.h"
 #include "common/common_funcs.h"
 #include "common/common_types.h"
+#include "video_core/pica_types.h"
 
 namespace Pica {
 
@@ -31,7 +31,17 @@ struct RasterizerRegs {
 
     BitField<0, 24, u32> viewport_size_y;
 
-    INSERT_PADDING_WORDS(0x9);
+    INSERT_PADDING_WORDS(0x3);
+
+    BitField<0, 1, u32> clip_enable;
+    BitField<0, 24, u32> clip_coef[4]; // float24
+
+    Math::Vec4<float24> GetClipCoef() const {
+        return {float24::FromRaw(clip_coef[0]), float24::FromRaw(clip_coef[1]),
+                float24::FromRaw(clip_coef[2]), float24::FromRaw(clip_coef[3])};
+    }
+
+    INSERT_PADDING_WORDS(0x1);
 
     BitField<0, 24, u32> viewport_depth_range;      // float24
     BitField<0, 24, u32> viewport_depth_near_plane; // float24
diff --git a/src/video_core/regs_shader.h b/src/video_core/regs_shader.h
index ddb1ee451..c15d4d162 100644
--- a/src/video_core/regs_shader.h
+++ b/src/video_core/regs_shader.h
@@ -24,9 +24,16 @@ struct ShaderRegs {
 
     INSERT_PADDING_WORDS(0x4);
 
+    enum ShaderMode {
+        GS = 0x08,
+        VS = 0xA0,
+    };
+
     union {
         // Number of input attributes to shader unit - 1
         BitField<0, 4, u32> max_input_attribute_index;
+        BitField<8, 8, u32> input_to_uniform;
+        BitField<24, 8, ShaderMode> shader_mode;
     };
 
     // Offset to shader program entry point (in words)
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 1c6c15a58..7e09e4712 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -28,6 +28,9 @@ MICROPROFILE_DEFINE(OpenGL_Blits, "OpenGL", "Blits", MP_RGB(100, 100, 255));
 MICROPROFILE_DEFINE(OpenGL_CacheManagement, "OpenGL", "Cache Mgmt", MP_RGB(100, 255, 100));
 
 RasterizerOpenGL::RasterizerOpenGL() : shader_dirty(true) {
+    // Clipping plane 0 is always enabled for PICA fixed clip plane z <= 0
+    state.clip_distance[0] = true;
+
     // Create sampler objects
     for (size_t i = 0; i < texture_samplers.size(); ++i) {
         texture_samplers[i].Create();
@@ -166,6 +169,8 @@ RasterizerOpenGL::RasterizerOpenGL() : shader_dirty(true) {
     glTexBuffer(GL_TEXTURE_BUFFER, GL_RGBA32F, proctex_diff_lut_buffer.handle);
 
     // Sync fixed function OpenGL state
+    SyncClipEnabled();
+    SyncClipCoef();
     SyncCullMode();
     SyncBlendEnabled();
     SyncBlendFuncs();
@@ -232,13 +237,24 @@ void RasterizerOpenGL::DrawTriangles() {
 
     glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D,
                            color_surface != nullptr ? color_surface->texture.handle : 0, 0);
-    glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D,
-                           depth_surface != nullptr ? depth_surface->texture.handle : 0, 0);
-    bool has_stencil =
-        regs.framebuffer.framebuffer.depth_format == Pica::FramebufferRegs::DepthFormat::D24S8;
-    glFramebufferTexture2D(
-        GL_DRAW_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D,
-        (has_stencil && depth_surface != nullptr) ? depth_surface->texture.handle : 0, 0);
+    if (depth_surface != nullptr) {
+        if (regs.framebuffer.framebuffer.depth_format ==
+            Pica::FramebufferRegs::DepthFormat::D24S8) {
+            // attach both depth and stencil
+            glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D,
+                                   depth_surface->texture.handle, 0);
+        } else {
+            // attach depth
+            glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D,
+                                   depth_surface->texture.handle, 0);
+            // clear stencil attachment
+            glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0);
+        }
+    } else {
+        // clear both depth and stencil attachment
+        glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0,
+                               0);
+    }
 
     // Sync the viewport
     // These registers hold half-width and half-height, so must be multiplied by 2
@@ -398,6 +414,18 @@ void RasterizerOpenGL::NotifyPicaRegisterChanged(u32 id) {
         SyncCullMode();
         break;
 
+    // Clipping plane
+    case PICA_REG_INDEX(rasterizer.clip_enable):
+        SyncClipEnabled();
+        break;
+
+    case PICA_REG_INDEX_WORKAROUND(rasterizer.clip_coef[0], 0x48):
+    case PICA_REG_INDEX_WORKAROUND(rasterizer.clip_coef[1], 0x49):
+    case PICA_REG_INDEX_WORKAROUND(rasterizer.clip_coef[2], 0x4a):
+    case PICA_REG_INDEX_WORKAROUND(rasterizer.clip_coef[3], 0x4b):
+        SyncClipCoef();
+        break;
+
     // Depth modifiers
     case PICA_REG_INDEX(rasterizer.viewport_depth_range):
         SyncDepthScale();
@@ -1277,6 +1305,20 @@ void RasterizerOpenGL::SetShader() {
     }
 }
 
+void RasterizerOpenGL::SyncClipEnabled() {
+    state.clip_distance[1] = Pica::g_state.regs.rasterizer.clip_enable != 0;
+}
+
+void RasterizerOpenGL::SyncClipCoef() {
+    // Only mark the uniform block dirty when the converted clip coefficients actually changed
+    const auto c = Pica::g_state.regs.rasterizer.GetClipCoef();
+    const GLvec4 updated = {c.x.ToFloat32(), c.y.ToFloat32(), c.z.ToFloat32(), c.w.ToFloat32()};
+    if (updated != uniform_block_data.data.clip_coef) {
+        uniform_block_data.data.clip_coef = updated;
+        uniform_block_data.dirty = true;
+    }
+}
+
 void RasterizerOpenGL::SyncCullMode() {
     const auto& regs = Pica::g_state.regs;
 
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index 78e218efe..46c62961c 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -151,14 +151,21 @@ private:
         LightSrc light_src[8];
         alignas(16) GLvec4 const_color[6]; // A vec4 color for each of the six tev stages
         alignas(16) GLvec4 tev_combiner_buffer_color;
+        alignas(16) GLvec4 clip_coef;
     };
 
     static_assert(
-        sizeof(UniformData) == 0x460,
+        sizeof(UniformData) == 0x470,
         "The size of the UniformData structure has changed, update the structure in the shader");
     static_assert(sizeof(UniformData) < 16384,
                   "UniformData structure must be less than 16kb as per the OpenGL spec");
 
+    /// Syncs the clip enabled status to match the PICA register
+    void SyncClipEnabled();
+
+    /// Syncs the clip coefficients to match the PICA register
+    void SyncClipCoef();
+
     /// Sets the OpenGL shader in accordance with the current PICA register state
     void SetShader();
 
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp
index bb192affd..9fe183944 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp
@@ -8,6 +8,7 @@
 #include "common/assert.h"
 #include "common/bit_field.h"
 #include "common/logging/log.h"
+#include "core/core.h"
 #include "video_core/regs_framebuffer.h"
 #include "video_core/regs_lighting.h"
 #include "video_core/regs_rasterizer.h"
@@ -24,6 +25,42 @@ using TevStageConfig = TexturingRegs::TevStageConfig;
 
 namespace GLShader {
 
+static const std::string UniformBlockDef = R"(
+#define NUM_TEV_STAGES 6
+#define NUM_LIGHTS 8
+
+struct LightSrc {
+    vec3 specular_0;
+    vec3 specular_1;
+    vec3 diffuse;
+    vec3 ambient;
+    vec3 position;
+    vec3 spot_direction;
+    float dist_atten_bias;
+    float dist_atten_scale;
+};
+
+layout (std140) uniform shader_data {
+    vec2 framebuffer_scale;
+    int alphatest_ref;
+    float depth_scale;
+    float depth_offset;
+    int scissor_x1;
+    int scissor_y1;
+    int scissor_x2;
+    int scissor_y2;
+    vec3 fog_color;
+    vec2 proctex_noise_f;
+    vec2 proctex_noise_a;
+    vec2 proctex_noise_p;
+    vec3 lighting_global_ambient;
+    LightSrc light_src[NUM_LIGHTS];
+    vec4 const_color[NUM_TEV_STAGES];
+    vec4 tev_combiner_buffer_color;
+    vec4 clip_coef;
+};
+)";
+
 PicaShaderConfig PicaShaderConfig::BuildFromRegs(const Pica::Regs& regs) {
     PicaShaderConfig res;
 
@@ -525,11 +562,12 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
            "float geo_factor = 1.0;\n";
 
     // Compute fragment normals and tangents
-    const std::string pertubation =
-        "2.0 * (" + SampleTexture(config, lighting.bump_selector) + ").rgb - 1.0";
+    auto Perturbation = [&]() {
+        return "2.0 * (" + SampleTexture(config, lighting.bump_selector) + ").rgb - 1.0";
+    };
     if (lighting.bump_mode == LightingRegs::LightingBumpMode::NormalMap) {
         // Bump mapping is enabled using a normal map
-        out += "vec3 surface_normal = " + pertubation + ";\n";
+        out += "vec3 surface_normal = " + Perturbation() + ";\n";
 
         // Recompute Z-component of perturbation if 'renorm' is enabled, this provides a higher
         // precision result
@@ -543,7 +581,7 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
         out += "vec3 surface_tangent = vec3(1.0, 0.0, 0.0);\n";
     } else if (lighting.bump_mode == LightingRegs::LightingBumpMode::TangentMap) {
         // Bump mapping is enabled using a tangent map
-        out += "vec3 surface_tangent = " + pertubation + ";\n";
+        out += "vec3 surface_tangent = " + Perturbation() + ";\n";
         // Mathematically, recomputing Z-component of the tangent vector won't affect the relevant
         // computation below, which is also confirmed on 3DS. So we don't bother recomputing here
         // even if 'renorm' is enabled.
@@ -593,8 +631,8 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
                 // Note: even if the normal vector is modified by normal map, which is not the
                 // normal of the tangent plane anymore, the half angle vector is still projected
                 // using the modified normal vector.
-                std::string half_angle_proj = "normalize(half_vector) - normal / dot(normal, "
-                                              "normal) * dot(normal, normalize(half_vector))";
+                std::string half_angle_proj =
+                    "normalize(half_vector) - normal * dot(normal, normalize(half_vector))";
                 // Note: the half angle vector projection is confirmed not normalized before the dot
                 // product. The result is in fact not cos(phi) as the name suggested.
                 index = "dot(" + half_angle_proj + ", tangent)";
@@ -749,7 +787,8 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
         }
 
         // Fresnel
-        if (lighting.lut_fr.enable &&
+        // Note: only the last entry in the light slots applies the Fresnel factor
+        if (light_index == lighting.src_num - 1 && lighting.lut_fr.enable &&
             LightingRegs::IsLightingSamplerSupported(lighting.config,
                                                      LightingRegs::LightingSampler::Fresnel)) {
             // Lookup fresnel LUT value
@@ -758,17 +797,17 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
                             lighting.lut_fr.type, lighting.lut_fr.abs_input);
             value = "(" + std::to_string(lighting.lut_fr.scale) + " * " + value + ")";
 
-            // Enabled for difffuse lighting alpha component
+            // Enabled for diffuse lighting alpha component
             if (lighting.fresnel_selector == LightingRegs::LightingFresnelSelector::PrimaryAlpha ||
                 lighting.fresnel_selector == LightingRegs::LightingFresnelSelector::Both) {
-                out += "diffuse_sum.a  *= " + value + ";\n";
+                out += "diffuse_sum.a = " + value + ";\n";
             }
 
             // Enabled for the specular lighting alpha component
             if (lighting.fresnel_selector ==
                     LightingRegs::LightingFresnelSelector::SecondaryAlpha ||
                 lighting.fresnel_selector == LightingRegs::LightingFresnelSelector::Both) {
-                out += "specular_sum.a *= " + value + ";\n";
+                out += "specular_sum.a = " + value + ";\n";
             }
         }
 
@@ -1007,8 +1046,6 @@ std::string GenerateFragmentShader(const PicaShaderConfig& config) {
 
     std::string out = R"(
 #version 330 core
-#define NUM_TEV_STAGES 6
-#define NUM_LIGHTS 8
 
 in vec4 primary_color;
 in vec2 texcoord[3];
@@ -1020,36 +1057,6 @@ in vec4 gl_FragCoord;
 
 out vec4 color;
 
-struct LightSrc {
-    vec3 specular_0;
-    vec3 specular_1;
-    vec3 diffuse;
-    vec3 ambient;
-    vec3 position;
-    vec3 spot_direction;
-    float dist_atten_bias;
-    float dist_atten_scale;
-};
-
-layout (std140) uniform shader_data {
-    vec2 framebuffer_scale;
-    int alphatest_ref;
-    float depth_scale;
-    float depth_offset;
-    int scissor_x1;
-    int scissor_y1;
-    int scissor_x2;
-    int scissor_y2;
-    vec3 fog_color;
-    vec2 proctex_noise_f;
-    vec2 proctex_noise_a;
-    vec2 proctex_noise_p;
-    vec3 lighting_global_ambient;
-    LightSrc light_src[NUM_LIGHTS];
-    vec4 const_color[NUM_TEV_STAGES];
-    vec4 tev_combiner_buffer_color;
-};
-
 uniform sampler2D tex[3];
 uniform samplerBuffer lighting_lut;
 uniform samplerBuffer fog_lut;
@@ -1058,7 +1065,11 @@ uniform samplerBuffer proctex_color_map;
 uniform samplerBuffer proctex_alpha_map;
 uniform samplerBuffer proctex_lut;
 uniform samplerBuffer proctex_diff_lut;
+)";
+
+    out += UniformBlockDef;
 
+    out += R"(
 // Rotate the vector v by the quaternion q
 vec3 quaternion_rotate(vec4 q, vec3 v) {
     return v + 2.0 * cross(q.xyz, cross(q.xyz, v) + q.w * v);
@@ -1111,7 +1122,10 @@ vec4 secondary_fragment_color = vec4(0.0);
                "gl_FragCoord.y < scissor_y2)) discard;\n";
     }
 
-    out += "float z_over_w = 1.0 - gl_FragCoord.z * 2.0;\n";
+    // After the perspective divide, OpenGL transforms z_over_w from [-1, 1] to [near, far]. Here we
+    // use the default near = 0 and far = 1, undo that transformation to recover the original
+    // z_over_w, and then apply our own transformation according to the PICA specification.
+    out += "float z_over_w = 2.0 * gl_FragCoord.z - 1.0;\n";
     out += "float depth = z_over_w * depth_scale + depth_offset;\n";
     if (state.depthmap_enable == RasterizerRegs::DepthBuffering::WBuffering) {
         out += "depth /= gl_FragCoord.w;\n";
@@ -1151,6 +1165,11 @@ vec4 secondary_fragment_color = vec4(0.0);
 
         // Blend the fog
         out += "last_tex_env_out.rgb = mix(fog_color.rgb, last_tex_env_out.rgb, fog_factor);\n";
+    } else if (state.fog_mode == TexturingRegs::FogMode::Gas) {
+        Core::Telemetry().AddField(Telemetry::FieldType::Session, "VideoCore_Pica_UseGasMode",
+                                   true);
+        LOG_CRITICAL(Render_OpenGL, "Unimplemented gas mode");
+        UNIMPLEMENTED();
     }
 
     out += "gl_FragDepth = depth;\n";
@@ -1186,6 +1205,12 @@ out float texcoord0_w;
 out vec4 normquat;
 out vec3 view;
 
+)";
+
+    out += UniformBlockDef;
+
+    out += R"(
+
 void main() {
     primary_color = vert_color;
     texcoord[0] = vert_texcoord0;
@@ -1194,7 +1219,9 @@ void main() {
     texcoord0_w = vert_texcoord0_w;
     normquat = vert_normquat;
     view = vert_view;
-    gl_Position = vec4(vert_position.x, vert_position.y, -vert_position.z, vert_position.w);
+    gl_Position = vert_position;
+    gl_ClipDistance[0] = -vert_position.z; // fixed PICA clipping plane z <= 0
+    gl_ClipDistance[1] = dot(clip_coef, vert_position);
 }
 )";
 
diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp
index bc9d34b84..5770ae08f 100644
--- a/src/video_core/renderer_opengl/gl_state.cpp
+++ b/src/video_core/renderer_opengl/gl_state.cpp
@@ -68,6 +68,8 @@ OpenGLState::OpenGLState() {
     draw.vertex_buffer = 0;
     draw.uniform_buffer = 0;
     draw.shader_program = 0;
+
+    clip_distance = {};
 }
 
 void OpenGLState::Apply() const {
@@ -261,6 +263,17 @@ void OpenGLState::Apply() const {
         glUseProgram(draw.shader_program);
     }
 
+    // Clip distance
+    for (size_t i = 0; i < clip_distance.size(); ++i) {
+        if (clip_distance[i] != cur_state.clip_distance[i]) {
+            if (clip_distance[i]) {
+                glEnable(GL_CLIP_DISTANCE0 + static_cast<GLenum>(i));
+            } else {
+                glDisable(GL_CLIP_DISTANCE0 + static_cast<GLenum>(i));
+            }
+        }
+    }
+
     cur_state = *this;
 }
 
diff --git a/src/video_core/renderer_opengl/gl_state.h b/src/video_core/renderer_opengl/gl_state.h
index 745a74479..437fe34c4 100644
--- a/src/video_core/renderer_opengl/gl_state.h
+++ b/src/video_core/renderer_opengl/gl_state.h
@@ -4,6 +4,7 @@
 
 #pragma once
 
+#include <array>
 #include <glad/glad.h>
 
 namespace TextureUnits {
@@ -123,6 +124,8 @@ public:
         GLuint shader_program;   // GL_CURRENT_PROGRAM
     } draw;
 
+    std::array<bool, 2> clip_distance; // GL_CLIP_DISTANCE
+
     OpenGLState();
 
     /// Get the currently active OpenGL state
diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp
index 67ed19ba8..2857d2829 100644
--- a/src/video_core/shader/shader.cpp
+++ b/src/video_core/shader/shader.cpp
@@ -21,7 +21,8 @@ namespace Pica {
 
 namespace Shader {
 
-OutputVertex OutputVertex::FromAttributeBuffer(const RasterizerRegs& regs, AttributeBuffer& input) {
+OutputVertex OutputVertex::FromAttributeBuffer(const RasterizerRegs& regs,
+                                               const AttributeBuffer& input) {
     // Setup output data
     union {
         OutputVertex ret{};
@@ -51,7 +52,8 @@ OutputVertex OutputVertex::FromAttributeBuffer(const RasterizerRegs& regs, Attri
     // The hardware takes the absolute and saturates vertex colors like this, *before* doing
     // interpolation
     for (unsigned i = 0; i < 4; ++i) {
-        ret.color[i] = float24::FromFloat32(std::fmin(std::fabs(ret.color[i].ToFloat32()), 1.0f));
+        float c = std::fabs(ret.color[i].ToFloat32());
+        ret.color[i] = float24::FromFloat32(c < 1.0f ? c : 1.0f);
     }
 
     LOG_TRACE(HW_GPU, "Output vertex: pos(%.2f, %.2f, %.2f, %.2f), quat(%.2f, %.2f, %.2f, %.2f), "
@@ -82,6 +84,44 @@ void UnitState::WriteOutput(const ShaderRegs& config, AttributeBuffer& output) {
     }
 }
 
+UnitState::UnitState(GSEmitter* emitter) : emitter_ptr(emitter) {}
+
+GSEmitter::GSEmitter() {
+    handlers = new Handlers;
+}
+
+GSEmitter::~GSEmitter() {
+    delete handlers;
+}
+
+void GSEmitter::Emit(Math::Vec4<float24> (&vertex)[16]) {
+    ASSERT(vertex_id < 3);
+    std::copy(std::begin(vertex), std::end(vertex), buffer[vertex_id].begin());
+    if (prim_emit) {
+        if (winding)
+            handlers->winding_setter();
+        for (size_t i = 0; i < buffer.size(); ++i) {
+            AttributeBuffer output;
+            unsigned int output_i = 0;
+            for (unsigned int reg : Common::BitSet<u32>(output_mask)) {
+                output.attr[output_i++] = buffer[i][reg];
+            }
+            handlers->vertex_handler(output);
+        }
+    }
+}
+
+GSUnitState::GSUnitState() : UnitState(&emitter) {}
+
+void GSUnitState::SetVertexHandler(VertexHandler vertex_handler, WindingSetter winding_setter) {
+    emitter.handlers->vertex_handler = std::move(vertex_handler);
+    emitter.handlers->winding_setter = std::move(winding_setter);
+}
+
+void GSUnitState::ConfigOutput(const ShaderRegs& config) {
+    emitter.output_mask = config.output_mask;
+}
+
 MICROPROFILE_DEFINE(GPU_Shader, "GPU", "Shader", MP_RGB(50, 50, 240));
 
 #ifdef ARCHITECTURE_x86_64
diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h
index e156f6aef..a3789da01 100644
--- a/src/video_core/shader/shader.h
+++ b/src/video_core/shader/shader.h
@@ -6,6 +6,7 @@
 
 #include <array>
 #include <cstddef>
+#include <functional>
 #include <type_traits>
 #include <nihstro/shader_bytecode.h>
 #include "common/assert.h"
@@ -31,6 +32,12 @@ struct AttributeBuffer {
     alignas(16) Math::Vec4<float24> attr[16];
 };
 
+/// Handler type for receiving vertex outputs from vertex shader or geometry shader
+using VertexHandler = std::function<void(const AttributeBuffer&)>;
+
+/// Handler type for signaling to invert the vertex order of the next triangle
+using WindingSetter = std::function<void()>;
+
 struct OutputVertex {
     Math::Vec4<float24> pos;
     Math::Vec4<float24> quat;
@@ -43,7 +50,8 @@ struct OutputVertex {
     INSERT_PADDING_WORDS(1);
     Math::Vec2<float24> tc2;
 
-    static OutputVertex FromAttributeBuffer(const RasterizerRegs& regs, AttributeBuffer& output);
+    static OutputVertex FromAttributeBuffer(const RasterizerRegs& regs,
+                                            const AttributeBuffer& output);
 };
 #define ASSERT_POS(var, pos)                                                                       \
     static_assert(offsetof(OutputVertex, var) == pos * sizeof(float24), "Semantic at wrong "       \
@@ -61,12 +69,36 @@ static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD");
 static_assert(sizeof(OutputVertex) == 24 * sizeof(float), "OutputVertex has invalid size");
 
 /**
+ * This structure contains state information for primitive emitting in geometry shader.
+ */
+struct GSEmitter {
+    std::array<std::array<Math::Vec4<float24>, 16>, 3> buffer;
+    u8 vertex_id;
+    bool prim_emit;
+    bool winding;
+    u32 output_mask;
+
+    // The function objects are hidden behind a raw pointer so that the structure remains a
+    // standard-layout type, which lets the JIT use offsetof to access the other members.
+    struct Handlers {
+        VertexHandler vertex_handler;
+        WindingSetter winding_setter;
+    } * handlers;
+
+    GSEmitter();
+    ~GSEmitter();
+    void Emit(Math::Vec4<float24> (&vertex)[16]);
+};
+static_assert(std::is_standard_layout<GSEmitter>::value, "GSEmitter is not standard layout type");
+
+/**
  * This structure contains the state information that needs to be unique for a shader unit. The 3DS
  * has four shader units that process shaders in parallel. At the present, Citra only implements a
  * single shader unit that processes all shaders serially. Putting the state information in a struct
  * here will make it easier for us to parallelize the shader processing later.
  */
 struct UnitState {
+    explicit UnitState(GSEmitter* emitter = nullptr);
     struct Registers {
         // The registers are accessed by the shader JIT using SSE instructions, and are therefore
         // required to be 16-byte aligned.
@@ -82,6 +114,8 @@ struct UnitState {
     // TODO: How many bits do these actually have?
     s32 address_registers[3];
 
+    GSEmitter* emitter_ptr;
+
     static size_t InputOffset(const SourceRegister& reg) {
         switch (reg.GetRegisterType()) {
         case RegisterType::Input:
@@ -125,6 +159,19 @@ struct UnitState {
     void WriteOutput(const ShaderRegs& config, AttributeBuffer& output);
 };
 
+/**
+ * This is an extended shader unit state that represents the special unit that can run both vertex
+ * shader and geometry shader. It contains an additional primitive emitter and utilities for
+ * geometry shader.
+ */
+struct GSUnitState : public UnitState {
+    GSUnitState();
+    void SetVertexHandler(VertexHandler vertex_handler, WindingSetter winding_setter);
+    void ConfigOutput(const ShaderRegs& config);
+
+    GSEmitter emitter;
+};
+
 struct ShaderSetup {
     struct {
         // The float uniforms are accessed by the shader JIT using SSE instructions, and are
diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp
index aa1cec81f..9d4da4904 100644
--- a/src/video_core/shader/shader_interpreter.cpp
+++ b/src/video_core/shader/shader_interpreter.cpp
@@ -631,11 +631,27 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData
                 state.address_registers[2] = loop_param.y;
 
                 Record<DebugDataRecord::LOOP_INT_IN>(debug_data, iteration, loop_param);
-                call(program_counter + 1, instr.flow_control.dest_offset - program_counter + 1,
+                call(program_counter + 1, instr.flow_control.dest_offset - program_counter,
                      instr.flow_control.dest_offset + 1, loop_param.x, loop_param.z);
                 break;
             }
 
+            case OpCode::Id::EMIT: {
+                GSEmitter* emitter = state.emitter_ptr;
+                ASSERT_MSG(emitter, "Execute EMIT on VS");
+                emitter->Emit(state.registers.output);
+                break;
+            }
+
+            case OpCode::Id::SETEMIT: {
+                GSEmitter* emitter = state.emitter_ptr;
+                ASSERT_MSG(emitter, "Execute SETEMIT on VS");
+                emitter->vertex_id = instr.setemit.vertex_id;
+                emitter->prim_emit = instr.setemit.prim_emit != 0;
+                emitter->winding = instr.setemit.winding != 0;
+                break;
+            }
+
             default:
                 LOG_ERROR(HW_GPU, "Unhandled instruction: 0x%02x (%s): 0x%08x",
                           (int)instr.opcode.Value().EffectiveOpCode(),
diff --git a/src/video_core/shader/shader_jit_x64_compiler.cpp b/src/video_core/shader/shader_jit_x64_compiler.cpp
index 42a57aab1..1b31623bd 100644
--- a/src/video_core/shader/shader_jit_x64_compiler.cpp
+++ b/src/video_core/shader/shader_jit_x64_compiler.cpp
@@ -75,8 +75,8 @@ const JitFunction instr_table[64] = {
     &JitShader::Compile_IF,    // ifu
     &JitShader::Compile_IF,    // ifc
     &JitShader::Compile_LOOP,  // loop
-    nullptr,                   // emit
-    nullptr,                   // sete
+    &JitShader::Compile_EMIT,  // emit
+    &JitShader::Compile_SETE,  // sete
     &JitShader::Compile_JMP,   // jmpc
     &JitShader::Compile_JMP,   // jmpu
     &JitShader::Compile_CMP,   // cmp
@@ -772,6 +772,51 @@ void JitShader::Compile_JMP(Instruction instr) {
     }
 }
 
+static void Emit(GSEmitter* emitter, Math::Vec4<float24> (*output)[16]) {
+    emitter->Emit(*output);
+}
+
+void JitShader::Compile_EMIT(Instruction instr) {
+    Label have_emitter, end;
+    mov(rax, qword[STATE + offsetof(UnitState, emitter_ptr)]);
+    test(rax, rax);
+    jnz(have_emitter);
+
+    ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
+    mov(ABI_PARAM1, reinterpret_cast<size_t>("Execute EMIT on VS"));
+    CallFarFunction(*this, LogCritical);
+    ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
+    jmp(end);
+
+    L(have_emitter);
+    ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
+    mov(ABI_PARAM1, rax);
+    mov(ABI_PARAM2, STATE);
+    add(ABI_PARAM2, static_cast<Xbyak::uint32>(offsetof(UnitState, registers.output)));
+    CallFarFunction(*this, Emit);
+    ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
+    L(end);
+}
+
+void JitShader::Compile_SETE(Instruction instr) {
+    Label have_emitter, end;
+    mov(rax, qword[STATE + offsetof(UnitState, emitter_ptr)]);
+    test(rax, rax);
+    jnz(have_emitter);
+
+    ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
+    mov(ABI_PARAM1, reinterpret_cast<size_t>("Execute SETEMIT on VS"));
+    CallFarFunction(*this, LogCritical);
+    ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
+    jmp(end);
+
+    L(have_emitter);
+    mov(byte[rax + offsetof(GSEmitter, vertex_id)], instr.setemit.vertex_id);
+    mov(byte[rax + offsetof(GSEmitter, prim_emit)], instr.setemit.prim_emit);
+    mov(byte[rax + offsetof(GSEmitter, winding)], instr.setemit.winding);
+    L(end);
+}
+
 void JitShader::Compile_Block(unsigned end) {
     while (program_counter < end) {
         Compile_NextInstr();
diff --git a/src/video_core/shader/shader_jit_x64_compiler.h b/src/video_core/shader/shader_jit_x64_compiler.h
index 31af0ca48..4aee56b1d 100644
--- a/src/video_core/shader/shader_jit_x64_compiler.h
+++ b/src/video_core/shader/shader_jit_x64_compiler.h
@@ -66,6 +66,8 @@ public:
     void Compile_JMP(Instruction instr);
     void Compile_CMP(Instruction instr);
     void Compile_MAD(Instruction instr);
+    void Compile_EMIT(Instruction instr);
+    void Compile_SETE(Instruction instr);
 
 private:
     void Compile_Block(unsigned end);
diff --git a/src/video_core/swrasterizer/clipper.cpp b/src/video_core/swrasterizer/clipper.cpp
index 6fb923756..c1ed48398 100644
--- a/src/video_core/swrasterizer/clipper.cpp
+++ b/src/video_core/swrasterizer/clipper.cpp
@@ -31,7 +31,7 @@ public:
         : coeffs(coeffs), bias(bias) {}
 
     bool IsInside(const Vertex& vertex) const {
-        return Math::Dot(vertex.pos + bias, coeffs) <= float24::FromFloat32(0);
+        return Math::Dot(vertex.pos + bias, coeffs) >= float24::FromFloat32(0);
     }
 
     bool IsOutSide(const Vertex& vertex) const {
@@ -95,6 +95,17 @@ void ProcessTriangle(const OutputVertex& v0, const OutputVertex& v1, const Outpu
     static const size_t MAX_VERTICES = 9;
     static_vector<Vertex, MAX_VERTICES> buffer_a = {v0, v1, v2};
     static_vector<Vertex, MAX_VERTICES> buffer_b;
+
+    auto FlipQuaternionIfOpposite = [](auto& a, const auto& b) {
+        if (Math::Dot(a, b) < float24::Zero())
+            a = a * float24::FromFloat32(-1.0f);
+    };
+
+    // Flip the quaternions if they are opposite to prevent interpolating them over the wrong
+    // direction.
+    FlipQuaternionIfOpposite(buffer_a[1].quat, buffer_a[0].quat);
+    FlipQuaternionIfOpposite(buffer_a[2].quat, buffer_a[0].quat);
+
     auto* output_list = &buffer_a;
     auto* input_list = &buffer_b;
 
@@ -105,23 +116,18 @@ void ProcessTriangle(const OutputVertex& v0, const OutputVertex& v1, const Outpu
     static const float24 f0 = float24::FromFloat32(0.0);
     static const float24 f1 = float24::FromFloat32(1.0);
     static const std::array<ClippingEdge, 7> clipping_edges = {{
-        {Math::MakeVec(f1, f0, f0, -f1)},                                           // x = +w
-        {Math::MakeVec(-f1, f0, f0, -f1)},                                          // x = -w
-        {Math::MakeVec(f0, f1, f0, -f1)},                                           // y = +w
-        {Math::MakeVec(f0, -f1, f0, -f1)},                                          // y = -w
-        {Math::MakeVec(f0, f0, f1, f0)},                                            // z =  0
-        {Math::MakeVec(f0, f0, -f1, -f1)},                                          // z = -w
-        {Math::MakeVec(f0, f0, f0, -f1), Math::Vec4<float24>(f0, f0, f0, EPSILON)}, // w = EPSILON
+        {Math::MakeVec(-f1, f0, f0, f1)},                                          // x = +w
+        {Math::MakeVec(f1, f0, f0, f1)},                                           // x = -w
+        {Math::MakeVec(f0, -f1, f0, f1)},                                          // y = +w
+        {Math::MakeVec(f0, f1, f0, f1)},                                           // y = -w
+        {Math::MakeVec(f0, f0, -f1, f0)},                                          // z =  0
+        {Math::MakeVec(f0, f0, f1, f1)},                                           // z = -w
+        {Math::MakeVec(f0, f0, f0, f1), Math::Vec4<float24>(f0, f0, f0, EPSILON)}, // w = EPSILON
     }};
 
-    // TODO: If one vertex lies outside one of the depth clipping planes, some platforms (e.g. Wii)
-    //       drop the whole primitive instead of clipping the primitive properly. We should test if
-    //       this happens on the 3DS, too.
-
     // Simple implementation of the Sutherland-Hodgman clipping algorithm.
     // TODO: Make this less inefficient (currently lots of useless buffering overhead happens here)
-    for (auto edge : clipping_edges) {
-
+    auto Clip = [&](const ClippingEdge& edge) {
         std::swap(input_list, output_list);
         output_list->clear();
 
@@ -140,12 +146,24 @@ void ProcessTriangle(const OutputVertex& v0, const OutputVertex& v1, const Outpu
             }
             reference_vertex = &vertex;
         }
+    };
+
+    for (auto edge : clipping_edges) {
+        Clip(edge);
 
         // Need to have at least a full triangle to continue...
         if (output_list->size() < 3)
             return;
     }
 
+    if (g_state.regs.rasterizer.clip_enable) {
+        ClippingEdge custom_edge{g_state.regs.rasterizer.GetClipCoef()};
+        Clip(custom_edge);
+
+        if (output_list->size() < 3)
+            return;
+    }
+
     InitScreenCoordinates((*output_list)[0]);
     InitScreenCoordinates((*output_list)[1]);
 
diff --git a/src/video_core/swrasterizer/framebuffer.cpp b/src/video_core/swrasterizer/framebuffer.cpp
index 7de3aac75..f34eab6cf 100644
--- a/src/video_core/swrasterizer/framebuffer.cpp
+++ b/src/video_core/swrasterizer/framebuffer.cpp
@@ -352,6 +352,8 @@ u8 LogicOp(u8 src, u8 dest, FramebufferRegs::LogicOp op) {
     case FramebufferRegs::LogicOp::OrInverted:
         return ~src | dest;
     }
+
+    UNREACHABLE();
 };
 
 } // namespace Rasterizer
diff --git a/src/video_core/swrasterizer/lighting.cpp b/src/video_core/swrasterizer/lighting.cpp
new file mode 100644
index 000000000..5fa748611
--- /dev/null
+++ b/src/video_core/swrasterizer/lighting.cpp
@@ -0,0 +1,308 @@
+// Copyright 2017 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/math_util.h"
+#include "video_core/swrasterizer/lighting.h"
+
+namespace Pica {
+
+static float LookupLightingLut(const Pica::State::Lighting& lighting, size_t lut_index, u8 index,
+                               float delta) {
+    ASSERT_MSG(lut_index < lighting.luts.size(), "Out of range lut");
+    ASSERT_MSG(index < lighting.luts[lut_index].size(), "Out of range index");
+
+    const auto& lut = lighting.luts[lut_index][index];
+
+    float lut_value = lut.ToFloat();
+    float lut_diff = lut.DiffToFloat();
+
+    return lut_value + lut_diff * delta;
+}
+
+std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
+    const Pica::LightingRegs& lighting, const Pica::State::Lighting& lighting_state,
+    const Math::Quaternion<float>& normquat, const Math::Vec3<float>& view,
+    const Math::Vec4<u8> (&texture_color)[4]) {
+
+    Math::Vec3<float> surface_normal;
+    Math::Vec3<float> surface_tangent;
+
+    if (lighting.config0.bump_mode != LightingRegs::LightingBumpMode::None) {
+        Math::Vec3<float> perturbation =
+            texture_color[lighting.config0.bump_selector].xyz().Cast<float>() / 127.5f -
+            Math::MakeVec(1.0f, 1.0f, 1.0f);
+        if (lighting.config0.bump_mode == LightingRegs::LightingBumpMode::NormalMap) {
+            if (!lighting.config0.disable_bump_renorm) {
+                const float z_square = 1 - perturbation.xy().Length2();
+                perturbation.z = std::sqrt(std::max(z_square, 0.0f));
+            }
+            surface_normal = perturbation;
+            surface_tangent = Math::MakeVec(1.0f, 0.0f, 0.0f);
+        } else if (lighting.config0.bump_mode == LightingRegs::LightingBumpMode::TangentMap) {
+            surface_normal = Math::MakeVec(0.0f, 0.0f, 1.0f);
+            surface_tangent = perturbation;
+        } else {
+            LOG_ERROR(HW_GPU, "Unknown bump mode %u", lighting.config0.bump_mode.Value());
+        }
+    } else {
+        surface_normal = Math::MakeVec(0.0f, 0.0f, 1.0f);
+        surface_tangent = Math::MakeVec(1.0f, 0.0f, 0.0f);
+    }
+
+    // Use the normalized quaternion when performing the rotation
+    auto normal = Math::QuaternionRotate(normquat, surface_normal);
+    auto tangent = Math::QuaternionRotate(normquat, surface_tangent);
+
+    Math::Vec4<float> diffuse_sum = {0.0f, 0.0f, 0.0f, 1.0f};
+    Math::Vec4<float> specular_sum = {0.0f, 0.0f, 0.0f, 1.0f};
+
+    for (unsigned light_index = 0; light_index <= lighting.max_light_index; ++light_index) {
+        unsigned num = lighting.light_enable.GetNum(light_index);
+        const auto& light_config = lighting.light[num];
+
+        Math::Vec3<float> refl_value = {};
+        Math::Vec3<float> position = {float16::FromRaw(light_config.x).ToFloat32(),
+                                      float16::FromRaw(light_config.y).ToFloat32(),
+                                      float16::FromRaw(light_config.z).ToFloat32()};
+        Math::Vec3<float> light_vector;
+
+        if (light_config.config.directional)
+            light_vector = position;
+        else
+            light_vector = position + view;
+
+        light_vector.Normalize();
+
+        Math::Vec3<float> norm_view = view.Normalized();
+        Math::Vec3<float> half_vector = norm_view + light_vector;
+
+        float dist_atten = 1.0f;
+        if (!lighting.IsDistAttenDisabled(num)) {
+            auto distance = (-view - position).Length();
+            float scale = Pica::float20::FromRaw(light_config.dist_atten_scale).ToFloat32();
+            float bias = Pica::float20::FromRaw(light_config.dist_atten_bias).ToFloat32();
+            size_t lut =
+                static_cast<size_t>(LightingRegs::LightingSampler::DistanceAttenuation) + num;
+
+            float sample_loc = MathUtil::Clamp(scale * distance + bias, 0.0f, 1.0f);
+
+            u8 lutindex =
+                static_cast<u8>(MathUtil::Clamp(std::floor(sample_loc * 256.0f), 0.0f, 255.0f));
+            float delta = sample_loc * 256 - lutindex;
+            dist_atten = LookupLightingLut(lighting_state, lut, lutindex, delta);
+        }
+
+        auto GetLutValue = [&](LightingRegs::LightingLutInput input, bool abs,
+                               LightingRegs::LightingScale scale_enum,
+                               LightingRegs::LightingSampler sampler) {
+            float result = 0.0f;
+
+            switch (input) {
+            case LightingRegs::LightingLutInput::NH:
+                result = Math::Dot(normal, half_vector.Normalized());
+                break;
+
+            case LightingRegs::LightingLutInput::VH:
+                result = Math::Dot(norm_view, half_vector.Normalized());
+                break;
+
+            case LightingRegs::LightingLutInput::NV:
+                result = Math::Dot(normal, norm_view);
+                break;
+
+            case LightingRegs::LightingLutInput::LN:
+                result = Math::Dot(light_vector, normal);
+                break;
+
+            case LightingRegs::LightingLutInput::SP: {
+                Math::Vec3<s32> spot_dir{light_config.spot_x.Value(), light_config.spot_y.Value(),
+                                         light_config.spot_z.Value()};
+                result = Math::Dot(light_vector, spot_dir.Cast<float>() / 2047.0f);
+                break;
+            }
+            case LightingRegs::LightingLutInput::CP:
+                if (lighting.config0.config == LightingRegs::LightingConfig::Config7) {
+                    const Math::Vec3<float> norm_half_vector = half_vector.Normalized();
+                    const Math::Vec3<float> half_vector_proj =
+                        norm_half_vector - normal * Math::Dot(normal, norm_half_vector);
+                    result = Math::Dot(half_vector_proj, tangent);
+                } else {
+                    result = 0.0f;
+                }
+                break;
+            default:
+                LOG_CRITICAL(HW_GPU, "Unknown lighting LUT input %u\n", static_cast<u32>(input));
+                UNIMPLEMENTED();
+                result = 0.0f;
+            }
+
+            u8 index;
+            float delta;
+
+            if (abs) {
+                if (light_config.config.two_sided_diffuse)
+                    result = std::abs(result);
+                else
+                    result = std::max(result, 0.0f);
+
+                float flr = std::floor(result * 256.0f);
+                index = static_cast<u8>(MathUtil::Clamp(flr, 0.0f, 255.0f));
+                delta = result * 256 - index;
+            } else {
+                float flr = std::floor(result * 128.0f);
+                s8 signed_index = static_cast<s8>(MathUtil::Clamp(flr, -128.0f, 127.0f));
+                delta = result * 128.0f - signed_index;
+                index = static_cast<u8>(signed_index);
+            }
+
+            float scale = lighting.lut_scale.GetScale(scale_enum);
+            return scale *
+                   LookupLightingLut(lighting_state, static_cast<size_t>(sampler), index, delta);
+        };
+
+        // If enabled, compute spot light attenuation value
+        float spot_atten = 1.0f;
+        if (!lighting.IsSpotAttenDisabled(num) &&
+            LightingRegs::IsLightingSamplerSupported(
+                lighting.config0.config, LightingRegs::LightingSampler::SpotlightAttenuation)) {
+            auto lut = LightingRegs::SpotlightAttenuationSampler(num);
+            spot_atten = GetLutValue(lighting.lut_input.sp, lighting.abs_lut_input.disable_sp == 0,
+                                     lighting.lut_scale.sp, lut);
+        }
+
+        // Specular 0 component
+        float d0_lut_value = 1.0f;
+        if (lighting.config1.disable_lut_d0 == 0 &&
+            LightingRegs::IsLightingSamplerSupported(
+                lighting.config0.config, LightingRegs::LightingSampler::Distribution0)) {
+            d0_lut_value =
+                GetLutValue(lighting.lut_input.d0, lighting.abs_lut_input.disable_d0 == 0,
+                            lighting.lut_scale.d0, LightingRegs::LightingSampler::Distribution0);
+        }
+
+        Math::Vec3<float> specular_0 = d0_lut_value * light_config.specular_0.ToVec3f();
+
+        // If enabled, lookup ReflectRed value, otherwise, 1.0 is used
+        if (lighting.config1.disable_lut_rr == 0 &&
+            LightingRegs::IsLightingSamplerSupported(lighting.config0.config,
+                                                     LightingRegs::LightingSampler::ReflectRed)) {
+            refl_value.x =
+                GetLutValue(lighting.lut_input.rr, lighting.abs_lut_input.disable_rr == 0,
+                            lighting.lut_scale.rr, LightingRegs::LightingSampler::ReflectRed);
+        } else {
+            refl_value.x = 1.0f;
+        }
+
+        // If enabled, lookup ReflectGreen value, otherwise, ReflectRed value is used
+        if (lighting.config1.disable_lut_rg == 0 &&
+            LightingRegs::IsLightingSamplerSupported(lighting.config0.config,
+                                                     LightingRegs::LightingSampler::ReflectGreen)) {
+            refl_value.y =
+                GetLutValue(lighting.lut_input.rg, lighting.abs_lut_input.disable_rg == 0,
+                            lighting.lut_scale.rg, LightingRegs::LightingSampler::ReflectGreen);
+        } else {
+            refl_value.y = refl_value.x;
+        }
+
+        // If enabled, lookup ReflectBlue value, otherwise, ReflectRed value is used
+        if (lighting.config1.disable_lut_rb == 0 &&
+            LightingRegs::IsLightingSamplerSupported(lighting.config0.config,
+                                                     LightingRegs::LightingSampler::ReflectBlue)) {
+            refl_value.z =
+                GetLutValue(lighting.lut_input.rb, lighting.abs_lut_input.disable_rb == 0,
+                            lighting.lut_scale.rb, LightingRegs::LightingSampler::ReflectBlue);
+        } else {
+            refl_value.z = refl_value.x;
+        }
+
+        // Specular 1 component
+        float d1_lut_value = 1.0f;
+        if (lighting.config1.disable_lut_d1 == 0 &&
+            LightingRegs::IsLightingSamplerSupported(
+                lighting.config0.config, LightingRegs::LightingSampler::Distribution1)) {
+            d1_lut_value =
+                GetLutValue(lighting.lut_input.d1, lighting.abs_lut_input.disable_d1 == 0,
+                            lighting.lut_scale.d1, LightingRegs::LightingSampler::Distribution1);
+        }
+
+        Math::Vec3<float> specular_1 =
+            d1_lut_value * refl_value * light_config.specular_1.ToVec3f();
+
+        // Fresnel
+        // Note: only the last entry in the light slots applies the Fresnel factor
+        if (light_index == lighting.max_light_index && lighting.config1.disable_lut_fr == 0 &&
+            LightingRegs::IsLightingSamplerSupported(lighting.config0.config,
+                                                     LightingRegs::LightingSampler::Fresnel)) {
+
+            float lut_value =
+                GetLutValue(lighting.lut_input.fr, lighting.abs_lut_input.disable_fr == 0,
+                            lighting.lut_scale.fr, LightingRegs::LightingSampler::Fresnel);
+
+            // Enabled for diffuse lighting alpha component
+            if (lighting.config0.fresnel_selector ==
+                    LightingRegs::LightingFresnelSelector::PrimaryAlpha ||
+                lighting.config0.fresnel_selector == LightingRegs::LightingFresnelSelector::Both) {
+                diffuse_sum.a() = lut_value;
+            }
+
+            // Enabled for the specular lighting alpha component
+            if (lighting.config0.fresnel_selector ==
+                    LightingRegs::LightingFresnelSelector::SecondaryAlpha ||
+                lighting.config0.fresnel_selector == LightingRegs::LightingFresnelSelector::Both) {
+                specular_sum.a() = lut_value;
+            }
+        }
+
+        auto dot_product = Math::Dot(light_vector, normal);
+
+        // Calculate clamp highlights before applying the two-sided diffuse configuration to the dot
+        // product.
+        float clamp_highlights = 1.0f;
+        if (lighting.config0.clamp_highlights) {
+            if (dot_product <= 0.0f)
+                clamp_highlights = 0.0f;
+            else
+                clamp_highlights = 1.0f;
+        }
+
+        if (light_config.config.two_sided_diffuse)
+            dot_product = std::abs(dot_product);
+        else
+            dot_product = std::max(dot_product, 0.0f);
+
+        if (light_config.config.geometric_factor_0 || light_config.config.geometric_factor_1) {
+            float geo_factor = half_vector.Length2();
+            geo_factor = geo_factor == 0.0f ? 0.0f : std::min(dot_product / geo_factor, 1.0f);
+            if (light_config.config.geometric_factor_0) {
+                specular_0 *= geo_factor;
+            }
+            if (light_config.config.geometric_factor_1) {
+                specular_1 *= geo_factor;
+            }
+        }
+
+        auto diffuse =
+            light_config.diffuse.ToVec3f() * dot_product + light_config.ambient.ToVec3f();
+        diffuse_sum += Math::MakeVec(diffuse * dist_atten * spot_atten, 0.0f);
+
+        specular_sum += Math::MakeVec(
+            (specular_0 + specular_1) * clamp_highlights * dist_atten * spot_atten, 0.0f);
+    }
+
+    diffuse_sum += Math::MakeVec(lighting.global_ambient.ToVec3f(), 0.0f);
+
+    auto diffuse = Math::MakeVec<float>(MathUtil::Clamp(diffuse_sum.x, 0.0f, 1.0f) * 255,
+                                        MathUtil::Clamp(diffuse_sum.y, 0.0f, 1.0f) * 255,
+                                        MathUtil::Clamp(diffuse_sum.z, 0.0f, 1.0f) * 255,
+                                        MathUtil::Clamp(diffuse_sum.w, 0.0f, 1.0f) * 255)
+                       .Cast<u8>();
+    auto specular = Math::MakeVec<float>(MathUtil::Clamp(specular_sum.x, 0.0f, 1.0f) * 255,
+                                         MathUtil::Clamp(specular_sum.y, 0.0f, 1.0f) * 255,
+                                         MathUtil::Clamp(specular_sum.z, 0.0f, 1.0f) * 255,
+                                         MathUtil::Clamp(specular_sum.w, 0.0f, 1.0f) * 255)
+                        .Cast<u8>();
+    return std::make_tuple(diffuse, specular);
+}
+
+} // namespace Pica
diff --git a/src/video_core/swrasterizer/lighting.h b/src/video_core/swrasterizer/lighting.h
new file mode 100644
index 000000000..d807a3d94
--- /dev/null
+++ b/src/video_core/swrasterizer/lighting.h
@@ -0,0 +1,19 @@
+// Copyright 2017 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <tuple>
+#include "common/quaternion.h"
+#include "common/vector_math.h"
+#include "video_core/pica_state.h"
+
+namespace Pica {
+// Returns (diffuse, specular); each channel is clamped to [0, 1] and then scaled to 0..255
+std::tuple<Math::Vec4<u8>, Math::Vec4<u8>> ComputeFragmentsColors(
+    const Pica::LightingRegs& lighting, const Pica::State::Lighting& lighting_state,
+    const Math::Quaternion<float>& normquat, const Math::Vec3<float>& view,
+    const Math::Vec4<u8> (&texture_color)[4]);
+
+} // namespace Pica
diff --git a/src/video_core/swrasterizer/rasterizer.cpp b/src/video_core/swrasterizer/rasterizer.cpp
index 512e81c08..862135614 100644
--- a/src/video_core/swrasterizer/rasterizer.cpp
+++ b/src/video_core/swrasterizer/rasterizer.cpp
@@ -13,6 +13,7 @@
 #include "common/logging/log.h"
 #include "common/math_util.h"
 #include "common/microprofile.h"
+#include "common/quaternion.h"
 #include "common/vector_math.h"
 #include "core/hw/gpu.h"
 #include "core/memory.h"
@@ -24,6 +25,7 @@
 #include "video_core/regs_texturing.h"
 #include "video_core/shader/shader.h"
 #include "video_core/swrasterizer/framebuffer.h"
+#include "video_core/swrasterizer/lighting.h"
 #include "video_core/swrasterizer/proctex.h"
 #include "video_core/swrasterizer/rasterizer.h"
 #include "video_core/swrasterizer/texturing.h"
@@ -419,6 +421,26 @@ static void ProcessTriangleInternal(const Vertex& v0, const Vertex& v1, const Ve
                 regs.texturing.tev_combiner_buffer_color.a,
             };
 
+            Math::Vec4<u8> primary_fragment_color = {0, 0, 0, 0};
+            Math::Vec4<u8> secondary_fragment_color = {0, 0, 0, 0};
+
+            if (!g_state.regs.lighting.disable) {
+                Math::Quaternion<float> normquat = Math::Quaternion<float>{
+                    {GetInterpolatedAttribute(v0.quat.x, v1.quat.x, v2.quat.x).ToFloat32(),
+                     GetInterpolatedAttribute(v0.quat.y, v1.quat.y, v2.quat.y).ToFloat32(),
+                     GetInterpolatedAttribute(v0.quat.z, v1.quat.z, v2.quat.z).ToFloat32()},
+                    GetInterpolatedAttribute(v0.quat.w, v1.quat.w, v2.quat.w).ToFloat32(),
+                }.Normalized();
+
+                Math::Vec3<float> view{
+                    GetInterpolatedAttribute(v0.view.x, v1.view.x, v2.view.x).ToFloat32(),
+                    GetInterpolatedAttribute(v0.view.y, v1.view.y, v2.view.y).ToFloat32(),
+                    GetInterpolatedAttribute(v0.view.z, v1.view.z, v2.view.z).ToFloat32(),
+                };
+                std::tie(primary_fragment_color, secondary_fragment_color) = ComputeFragmentsColors(
+                    g_state.regs.lighting, g_state.lighting, normquat, view, texture_color);
+            }
+
             for (unsigned tev_stage_index = 0; tev_stage_index < tev_stages.size();
                  ++tev_stage_index) {
                 const auto& tev_stage = tev_stages[tev_stage_index];
@@ -427,14 +449,13 @@ static void ProcessTriangleInternal(const Vertex& v0, const Vertex& v1, const Ve
                 auto GetSource = [&](Source source) -> Math::Vec4<u8> {
                     switch (source) {
                     case Source::PrimaryColor:
+                        return primary_color;
 
-                    // HACK: Until we implement fragment lighting, use primary_color
                     case Source::PrimaryFragmentColor:
-                        return primary_color;
+                        return primary_fragment_color;
 
-                    // HACK: Until we implement fragment lighting, use zero
                     case Source::SecondaryFragmentColor:
-                        return {0, 0, 0, 0};
+                        return secondary_fragment_color;
 
                     case Source::Texture0:
                         return texture_color[0];
diff --git a/src/video_core/swrasterizer/rasterizer.h b/src/video_core/swrasterizer/rasterizer.h
index 2f0877581..66cd6cfd4 100644
--- a/src/video_core/swrasterizer/rasterizer.h
+++ b/src/video_core/swrasterizer/rasterizer.h
@@ -19,10 +19,9 @@ struct Vertex : Shader::OutputVertex {
 
     // Linear interpolation
     // factor: 0=this, 1=vtx
+    // Note: This function cannot be called after perspective divide
     void Lerp(float24 factor, const Vertex& vtx) {
         pos = pos * factor + vtx.pos * (float24::FromFloat32(1) - factor);
-
-        // TODO: Should perform perspective correct interpolation here...
         quat = quat * factor + vtx.quat * (float24::FromFloat32(1) - factor);
         color = color * factor + vtx.color * (float24::FromFloat32(1) - factor);
         tc0 = tc0 * factor + vtx.tc0 * (float24::FromFloat32(1) - factor);
@@ -30,12 +29,11 @@ struct Vertex : Shader::OutputVertex {
         tc0_w = tc0_w * factor + vtx.tc0_w * (float24::FromFloat32(1) - factor);
         view = view * factor + vtx.view * (float24::FromFloat32(1) - factor);
         tc2 = tc2 * factor + vtx.tc2 * (float24::FromFloat32(1) - factor);
-
-        screenpos = screenpos * factor + vtx.screenpos * (float24::FromFloat32(1) - factor);
     }
 
     // Linear interpolation
     // factor: 0=v0, 1=v1
+    // Note: This function cannot be called after perspective divide
     static Vertex Lerp(float24 factor, const Vertex& v0, const Vertex& v1) {
         Vertex ret = v0;
         ret.Lerp(factor, v1);
diff --git a/src/video_core/swrasterizer/texturing.cpp b/src/video_core/swrasterizer/texturing.cpp
index 4f02b93f2..79b1ce841 100644
--- a/src/video_core/swrasterizer/texturing.cpp
+++ b/src/video_core/swrasterizer/texturing.cpp
@@ -89,6 +89,8 @@ Math::Vec3<u8> GetColorModifier(TevStageConfig::ColorModifier factor,
     case ColorModifier::OneMinusSourceBlue:
         return (Math::Vec3<u8>(255, 255, 255) - values.bbb()).Cast<u8>();
     }
+
+    UNREACHABLE();
 };
 
 u8 GetAlphaModifier(TevStageConfig::AlphaModifier factor, const Math::Vec4<u8>& values) {
@@ -119,6 +121,8 @@ u8 GetAlphaModifier(TevStageConfig::AlphaModifier factor, const Math::Vec4<u8>&
     case AlphaModifier::OneMinusSourceBlue:
         return 255 - values.b();
     }
+
+    UNREACHABLE();
 };
 
 Math::Vec3<u8> ColorCombine(TevStageConfig::Operation op, const Math::Vec3<u8> input[3]) {
diff --git a/src/video_core/utils.h b/src/video_core/utils.h
index 7ce83a055..d8567f314 100644
--- a/src/video_core/utils.h
+++ b/src/video_core/utils.h
@@ -8,17 +8,11 @@
 
 namespace VideoCore {
 
-/**
- * Interleave the lower 3 bits of each coordinate to get the intra-block offsets, which are
- * arranged in a Z-order curve. More details on the bit manipulation at:
- * https://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/
- */
+// Z-order (Morton) offset within an 8x8 block: x's low 3 bits go to bits 0/2/4, y's to bits 1/3/5
 static inline u32 MortonInterleave(u32 x, u32 y) {
-    u32 i = (x & 7) | ((y & 7) << 8); // ---- -210
-    i = (i ^ (i << 2)) & 0x1313;      // ---2 --10
-    i = (i ^ (i << 1)) & 0x1515;      // ---2 -1-0
-    i = (i | (i >> 7)) & 0x3F;
-    return i;
+    static const u32 xlut[] = {0x00, 0x01, 0x04, 0x05, 0x10, 0x11, 0x14, 0x15};
+    static const u32 ylut[] = {0x00, 0x02, 0x08, 0x0a, 0x20, 0x22, 0x28, 0x2a};
+    return xlut[x % 8] + ylut[y % 8];
 }
 
 /**