25 files changed, 1144 insertions, 609 deletions
diff --git a/src/video_core/clipper.cpp b/src/video_core/clipper.cpp
index fbe4047ce..1744066ba 100644
--- a/src/video_core/clipper.cpp
+++ b/src/video_core/clipper.cpp
@@ -1,8 +1,8 @@
 // Copyright 2014 Citra Emulator Project
-// Licensed under GPLv2
+// Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
-#include <vector>
+#include <boost/container/static_vector.hpp>
 
 #include "clipper.h"
 #include "pica.h"
@@ -91,25 +91,31 @@ static void InitScreenCoordinates(OutputVertex& vtx)
     viewport.zscale     = float24::FromRawFloat24(registers.viewport_depth_range);
     viewport.offset_z   = float24::FromRawFloat24(registers.viewport_depth_far_plane);
 
+    float24 inv_w = float24::FromFloat32(1.f) / vtx.pos.w;
+    vtx.color *= inv_w;
+    vtx.tc0 *= inv_w;
+    vtx.tc1 *= inv_w;
+    vtx.tc2 *= inv_w;
+    vtx.pos.w = inv_w;
+
     // TODO: Not sure why the viewport width needs to be divided by 2 but the viewport height does not
-    vtx.screenpos[0] = (vtx.pos.x / vtx.pos.w + float24::FromFloat32(1.0)) * viewport.halfsize_x + viewport.offset_x;
-    vtx.screenpos[1] = (vtx.pos.y / vtx.pos.w + float24::FromFloat32(1.0)) * viewport.halfsize_y + viewport.offset_y;
-    vtx.screenpos[2] = viewport.offset_z - vtx.pos.z / vtx.pos.w * viewport.zscale;
+    vtx.screenpos[0] = (vtx.pos.x * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_x + viewport.offset_x;
+    vtx.screenpos[1] = (vtx.pos.y * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_y + viewport.offset_y;
+    vtx.screenpos[2] = viewport.offset_z - vtx.pos.z * inv_w * viewport.zscale;
 }
 
 void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) {
-
-    // TODO (neobrain):
-    // The list of output vertices has some fixed maximum size,
-    // however I haven't taken the time to figure out what it is exactly.
-    // For now, we hence just assume a maximal size of 1000 vertices.
-    const size_t max_vertices = 1000;
-    std::vector<OutputVertex> buffer_vertices;
-    std::vector<OutputVertex*> output_list{ &v0, &v1, &v2 };
-
-    // Make sure to reserve space for all vertices.
-    // Without this, buffer reallocation would invalidate references.
-    buffer_vertices.reserve(max_vertices);
+    using boost::container::static_vector;
+
+    // Clipping a planar n-gon against a plane removes at least 1 vertex and introduces at most
+    // 2 at the new edge (fewer in degenerate cases). As such, each clipping plane introduces
+    // at most 1 new vertex to the polygon. Since we start with a triangle and have a fixed
+    // 6 clipping planes, the maximum number of vertices of the clipped polygon is 3 + 6 = 9.
+    static const size_t MAX_VERTICES = 9;
+    static_vector<OutputVertex, MAX_VERTICES> buffer_a = { v0, v1, v2 };
+    static_vector<OutputVertex, MAX_VERTICES> buffer_b;
+    auto* output_list = &buffer_a;
+    auto* input_list  = &buffer_b;
 
     // Simple implementation of the Sutherland-Hodgman clipping algorithm.
     // TODO: Make this less inefficient (currently lots of useless buffering overhead happens here)
@@ -120,48 +126,45 @@ void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) {
                        ClippingEdge(ClippingEdge::POS_Z, float24::FromFloat32(+1.0)),
                        ClippingEdge(ClippingEdge::NEG_Z, float24::FromFloat32(-1.0)) }) {
 
-        const std::vector<OutputVertex*> input_list = output_list;
-        output_list.clear();
+        std::swap(input_list, output_list);
+        output_list->clear();
 
-        const OutputVertex* reference_vertex = input_list.back();
+        const OutputVertex* reference_vertex = &input_list->back();
 
-        for (const auto& vertex : input_list) {
+        for (const auto& vertex : *input_list) {
             // NOTE: This algorithm changes vertex order in some cases!
-            if (edge.IsInside(*vertex)) {
+            if (edge.IsInside(vertex)) {
                 if (edge.IsOutSide(*reference_vertex)) {
-                    buffer_vertices.push_back(edge.GetIntersection(*vertex, *reference_vertex));
-                    output_list.push_back(&(buffer_vertices.back()));
+                    output_list->push_back(edge.GetIntersection(vertex, *reference_vertex));
                 }
 
-                output_list.push_back(vertex);
+                output_list->push_back(vertex);
             } else if (edge.IsInside(*reference_vertex)) {
-                buffer_vertices.push_back(edge.GetIntersection(*vertex, *reference_vertex));
-                output_list.push_back(&(buffer_vertices.back()));
+                output_list->push_back(edge.GetIntersection(vertex, *reference_vertex));
             }
-
-            reference_vertex = vertex;
+            reference_vertex = &vertex;
         }
 
         // Need to have at least a full triangle to continue...
-        if (output_list.size() < 3)
+        if (output_list->size() < 3)
             return;
     }
 
-    InitScreenCoordinates(*(output_list[0]));
-    InitScreenCoordinates(*(output_list[1]));
+    InitScreenCoordinates((*output_list)[0]);
+    InitScreenCoordinates((*output_list)[1]);
 
-    for (size_t i = 0; i < output_list.size() - 2; i ++) {
-        OutputVertex& vtx0 = *(output_list[0]);
-        OutputVertex& vtx1 = *(output_list[i+1]);
-        OutputVertex& vtx2 = *(output_list[i+2]);
+    for (size_t i = 0; i < output_list->size() - 2; i ++) {
+        OutputVertex& vtx0 = (*output_list)[0];
+        OutputVertex& vtx1 = (*output_list)[i+1];
+        OutputVertex& vtx2 = (*output_list)[i+2];
 
         InitScreenCoordinates(vtx2);
 
-        DEBUG_LOG(GPU,
-                  "Triangle %lu/%lu (%lu buffer vertices) at position (%.3f, %.3f, %.3f, %.3f), "
+        LOG_TRACE(Render_Software,
+                  "Triangle %lu/%lu at position (%.3f, %.3f, %.3f, %.3f), "
                   "(%.3f, %.3f, %.3f, %.3f), (%.3f, %.3f, %.3f, %.3f) and "
                   "screen position (%.2f, %.2f, %.2f), (%.2f, %.2f, %.2f), (%.2f, %.2f, %.2f)",
-                  i,output_list.size(), buffer_vertices.size(),
+                  i, output_list->size(),
                   vtx0.pos.x.ToFloat32(), vtx0.pos.y.ToFloat32(), vtx0.pos.z.ToFloat32(), vtx0.pos.w.ToFloat32(),
                   vtx1.pos.x.ToFloat32(), vtx1.pos.y.ToFloat32(), vtx1.pos.z.ToFloat32(), vtx1.pos.w.ToFloat32(),
                   vtx2.pos.x.ToFloat32(), vtx2.pos.y.ToFloat32(), vtx2.pos.z.ToFloat32(), vtx2.pos.w.ToFloat32(),
diff --git a/src/video_core/clipper.h b/src/video_core/clipper.h
index 14d31ca1e..19ce8e140 100644
--- a/src/video_core/clipper.h
+++ b/src/video_core/clipper.h
@@ -1,5 +1,5 @@
 // Copyright 2014 Citra Emulator Project
-// Licensed under GPLv2
+// Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
 #pragma once
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index 1ec727698..9602779f4 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -1,5 +1,5 @@
 // Copyright 2014 Citra Emulator Project
-// Licensed under GPLv2
+// Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
 #include "clipper.h"
@@ -8,6 +8,8 @@
 #include "pica.h"
 #include "primitive_assembly.h"
 #include "vertex_shader.h"
+#include "core/hle/service/gsp_gpu.h"
+#include "core/hw/gpu.h"
 
 #include "debug_utils/debug_utils.h"
 
@@ -30,24 +32,40 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
     if (id >= registers.NumIds())
         return;
 
+    // If we're skipping this frame, only allow trigger IRQ
+    if (GPU::g_skip_frame && id != PICA_REG_INDEX(trigger_irq))
+        return;
+
     // TODO: Figure out how register masking acts on e.g. vs_uniform_setup.set_value
     u32 old_value = registers[id];
     registers[id] = (old_value & ~mask) | (value & mask);
 
+    if (g_debug_context)
+        g_debug_context->OnEvent(DebugContext::Event::CommandLoaded, reinterpret_cast<void*>(&id));
+
     DebugUtils::OnPicaRegWrite(id, registers[id]);
 
     switch(id) {
+        // Trigger IRQ
+        case PICA_REG_INDEX(trigger_irq):
+            GSP_GPU::SignalInterrupt(GSP_GPU::InterruptId::P3D);
+            return;
+
         // It seems like these trigger vertex rendering
         case PICA_REG_INDEX(trigger_draw):
         case PICA_REG_INDEX(trigger_draw_indexed):
         {
             DebugUtils::DumpTevStageConfig(registers.GetTevStages());
 
+            if (g_debug_context)
+                g_debug_context->OnEvent(DebugContext::Event::IncomingPrimitiveBatch, nullptr);
+
             const auto& attribute_config = registers.vertex_attributes;
-            const u8* const base_address = Memory::GetPointer(attribute_config.GetBaseAddress());
+            const u32 base_address = attribute_config.GetPhysicalBaseAddress();
 
             // Information about internal vertex attributes
-            const u8* vertex_attribute_sources[16];
+            u32 vertex_attribute_sources[16];
+            std::fill(vertex_attribute_sources, &vertex_attribute_sources[16], 0xdeadbeef);
             u32 vertex_attribute_strides[16];
             u32 vertex_attribute_formats[16];
             u32 vertex_attribute_elements[16];
@@ -57,10 +75,10 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
             for (int loader = 0; loader < 12; ++loader) {
                 const auto& loader_config = attribute_config.attribute_loaders[loader];
 
-                const u8* load_address = base_address + loader_config.data_offset;
+                u32 load_address = base_address + loader_config.data_offset;
 
                 // TODO: What happens if a loader overwrites a previous one's data?
-                for (int component = 0; component < loader_config.component_count; ++component) {
+                for (unsigned component = 0; component < loader_config.component_count; ++component) {
                     u32 attribute_index = loader_config.GetComponent(component);
                     vertex_attribute_sources[attribute_index] = load_address;
                     vertex_attribute_strides[attribute_index] = static_cast<u32>(loader_config.byte_count);
@@ -75,9 +93,9 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
             bool is_indexed = (id == PICA_REG_INDEX(trigger_draw_indexed));
 
             const auto& index_info = registers.index_array;
-            const u8* index_address_8 = (u8*)base_address + index_info.offset;
+            const u8* index_address_8 = Memory::GetPointer(PAddrToVAddr(base_address + index_info.offset));
             const u16* index_address_16 = (u16*)index_address_8;
-            bool index_u16 = (bool)index_info.format;
+            bool index_u16 = index_info.format != 0;
 
             DebugUtils::GeometryDumper geometry_dumper;
             PrimitiveAssembler<VertexShader::OutputVertex> clipper_primitive_assembler(registers.triangle_topology.Value());
@@ -96,21 +114,31 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
 
                 for (int i = 0; i < attribute_config.GetNumTotalAttributes(); ++i) {
                     for (unsigned int comp = 0; comp < vertex_attribute_elements[i]; ++comp) {
-                        const u8* srcdata = vertex_attribute_sources[i] + vertex_attribute_strides[i] * vertex + comp * vertex_attribute_element_size[i];
+                        const u8* srcdata = Memory::GetPointer(PAddrToVAddr(vertex_attribute_sources[i] + vertex_attribute_strides[i] * vertex + comp * vertex_attribute_element_size[i]));
+
+                        // TODO(neobrain): Ocarina of Time 3D has GetNumTotalAttributes return 8,
+                        // yet only provides 2 valid source data addresses. Need to figure out
+                        // what's wrong there, until then we just continue when address lookup fails
+                        if (srcdata == nullptr)
+                            continue;
+
                         const float srcval = (vertex_attribute_formats[i] == 0) ? *(s8*)srcdata :
                                              (vertex_attribute_formats[i] == 1) ? *(u8*)srcdata :
                                              (vertex_attribute_formats[i] == 2) ? *(s16*)srcdata :
                                                                                   *(float*)srcdata;
                         input.attr[i][comp] = float24::FromFloat32(srcval);
-                        DEBUG_LOG(GPU, "Loaded component %x of attribute %x for vertex %x (index %x) from 0x%08x + 0x%08lx + 0x%04lx: %f",
+                        LOG_TRACE(HW_GPU, "Loaded component %x of attribute %x for vertex %x (index %x) from 0x%08x + 0x%08lx + 0x%04lx: %f",
                                   comp, i, vertex, index,
-                                  attribute_config.GetBaseAddress(),
+                                  attribute_config.GetPhysicalBaseAddress(),
                                   vertex_attribute_sources[i] - base_address,
-                                  srcdata - vertex_attribute_sources[i],
+                                  vertex_attribute_strides[i] * vertex + comp * vertex_attribute_element_size[i],
                                   input.attr[i][comp].ToFloat32());
                     }
                 }
 
+                if (g_debug_context)
+                    g_debug_context->OnEvent(DebugContext::Event::VertexLoaded, (void*)&input);
+
                 // NOTE: When dumping geometry, we simply assume that the first input attribute
                 //       corresponds to the position for now.
                 DebugUtils::GeometryDumper::Vertex dumped_vertex = {
@@ -132,9 +160,19 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
                 clipper_primitive_assembler.SubmitVertex(output, Clipper::ProcessTriangle);
             }
             geometry_dumper.Dump();
+
+            if (g_debug_context)
+                g_debug_context->OnEvent(DebugContext::Event::FinishedPrimitiveBatch, nullptr);
+
             break;
         }
 
+        case PICA_REG_INDEX(vs_bool_uniforms):
+            for (unsigned i = 0; i < 16; ++i)
+                VertexShader::GetBoolUniform(i) = (registers.vs_bool_uniforms.Value() & (1 << i)) != 0;
+
+            break;
+
         case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[0], 0x2c1):
         case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[1], 0x2c2):
         case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[2], 0x2c3):
@@ -160,7 +198,7 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
                 auto& uniform = VertexShader::GetFloatUniform(uniform_setup.index);
 
                 if (uniform_setup.index > 95) {
-                    ERROR_LOG(GPU, "Invalid VS uniform index %d", (int)uniform_setup.index);
+                    LOG_ERROR(HW_GPU, "Invalid VS uniform index %d", (int)uniform_setup.index);
                     break;
                 }
 
@@ -176,7 +214,7 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
                     uniform.x = float24::FromRawFloat24(uniform_write_buffer[2] & 0xFFFFFF);
                 }
 
-                DEBUG_LOG(GPU, "Set uniform %x to (%f %f %f %f)", (int)uniform_setup.index,
+                LOG_TRACE(HW_GPU, "Set uniform %x to (%f %f %f %f)", (int)uniform_setup.index,
                           uniform.x.ToFloat32(), uniform.y.ToFloat32(), uniform.z.ToFloat32(),
                           uniform.w.ToFloat32());
 
@@ -229,6 +267,9 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
         default:
             break;
     }
+
+    if (g_debug_context)
+        g_debug_context->OnEvent(DebugContext::Event::CommandProcessed, reinterpret_cast<void*>(&id));
 }
 
 static std::ptrdiff_t ExecuteCommandBlock(const u32* first_command_word) {
@@ -259,8 +300,9 @@ static std::ptrdiff_t ExecuteCommandBlock(const u32* first_command_word) {
 
 void ProcessCommandList(const u32* list, u32 size) {
     u32* read_pointer = (u32*)list;
+    u32 list_length = size / sizeof(u32);
 
-    while (read_pointer < list + size) {
+    while (read_pointer < list + list_length) {
         read_pointer += ExecuteCommandBlock(read_pointer);
     }
 }
diff --git a/src/video_core/command_processor.h b/src/video_core/command_processor.h
index 955f9daec..bb3d4150f 100644
--- a/src/video_core/command_processor.h
+++ b/src/video_core/command_processor.h
@@ -1,5 +1,5 @@
 // Copyright 2014 Citra Emulator Project
-// Licensed under GPLv2
+// Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
 #pragma once
diff --git a/src/video_core/debug_utils/debug_utils.cpp b/src/video_core/debug_utils/debug_utils.cpp
index 275b06b7c..5921185a6 100644
--- a/src/video_core/debug_utils/debug_utils.cpp
+++ b/src/video_core/debug_utils/debug_utils.cpp
@@ -3,6 +3,8 @@
 // Refer to the license.txt file included.
 
 #include <algorithm>
+#include <condition_variable>
+#include <list>
 #include <map>
 #include <fstream>
 #include <mutex>
@@ -12,14 +14,62 @@
 #include <png.h>
 #endif
 
+#include <nihstro/shader_binary.h>
+
+#include "common/log.h"
 #include "common/file_util.h"
 
+#include "video_core/math.h"
 #include "video_core/pica.h"
 
 #include "debug_utils.h"
 
+using nihstro::DVLBHeader;
+using nihstro::DVLEHeader;
+using nihstro::DVLPHeader;
+
 namespace Pica {
 
+void DebugContext::OnEvent(Event event, void* data) {
+    if (!breakpoints[event].enabled) // no enabled breakpoint for this event: nothing to do
+        return;
+    // Breakpoint hit: record it, notify the observers and block this thread until Resume().
+    {
+        std::unique_lock<std::mutex> lock(breakpoint_mutex);
+        // NOTE(review): "breakpoints" is read above without holding this mutex - confirm that is safe.
+        // TODO: Should stop the CPU thread here once we multithread emulation.
+
+        active_breakpoint = event;
+        at_breakpoint = true;
+
+        // Tell all observers that we hit a breakpoint
+        for (auto& breakpoint_observer : breakpoint_observers) {
+            breakpoint_observer->OnPicaBreakPointHit(event, data);
+        }
+        // NOTE(review): observers run with breakpoint_mutex held; one calling Resume() from this thread would relock it - confirm none do.
+        // Wait until another thread tells us to Resume()
+        resume_from_breakpoint.wait(lock, [&]{ return !at_breakpoint; });
+    }
+}
+
+void DebugContext::Resume() { // Lets the thread blocked in OnEvent() continue.
+    {
+        std::unique_lock<std::mutex> lock(breakpoint_mutex);
+        // Observers are notified and the flag is cleared while breakpoint_mutex is held.
+        // Tell all observers that we are about to resume
+        for (auto& breakpoint_observer : breakpoint_observers) {
+            breakpoint_observer->OnPicaResume();
+        }
+
+        // Resume the waiting thread (i.e. OnEvent())
+        at_breakpoint = false;
+    }
+    // The lock scope above has ended, so the mutex is already released when the waiter is notified.
+    resume_from_breakpoint.notify_one();
+}
+
+std::shared_ptr<DebugContext> g_debug_context; // TODO: Get rid of this global
+
 namespace DebugUtils {
 
 void GeometryDumper::AddTriangle(Vertex& v0, Vertex& v1, Vertex& v2) {
@@ -54,65 +104,6 @@ void GeometryDumper::Dump() {
     }
 }
 
-#pragma pack(1)
-struct DVLBHeader {
-    enum : u32 {
-        MAGIC_WORD = 0x424C5644, // "DVLB"
-    };
-
-    u32 magic_word;
-    u32 num_programs;
-//    u32 dvle_offset_table[];
-};
-static_assert(sizeof(DVLBHeader) == 0x8, "Incorrect structure size");
-
-struct DVLPHeader {
-    enum : u32 {
-        MAGIC_WORD = 0x504C5644, // "DVLP"
-    };
-
-    u32 magic_word;
-    u32 version;
-    u32 binary_offset;  // relative to DVLP start
-    u32 binary_size_words;
-    u32 swizzle_patterns_offset;
-    u32 swizzle_patterns_num_entries;
-    u32 unk2;
-};
-static_assert(sizeof(DVLPHeader) == 0x1C, "Incorrect structure size");
-
-struct DVLEHeader {
-    enum : u32 {
-        MAGIC_WORD = 0x454c5644, // "DVLE"
-    };
-
-    enum class ShaderType : u8 {
-        VERTEX = 0,
-        GEOMETRY = 1,
-    };
-
-    u32 magic_word;
-    u16 pad1;
-    ShaderType type;
-    u8 pad2;
-    u32 main_offset_words; // offset within binary blob
-    u32 endmain_offset_words;
-    u32 pad3;
-    u32 pad4;
-    u32 constant_table_offset;
-    u32 constant_table_size; // number of entries
-    u32 label_table_offset;
-    u32 label_table_size;
-    u32 output_register_table_offset;
-    u32 output_register_table_size;
-    u32 uniform_table_offset;
-    u32 uniform_table_size;
-    u32 symbol_table_offset;
-    u32 symbol_table_size;
-
-};
-static_assert(sizeof(DVLEHeader) == 0x40, "Incorrect structure size");
-#pragma pack()
 
 void DumpShader(const u32* binary_data, u32 binary_size, const u32* swizzle_data, u32 swizzle_size,
                 u32 main_offset, const Regs::VSOutputAttributes* output_attributes)
@@ -155,7 +146,7 @@ void DumpShader(const u32* binary_data, u32 binary_size, const u32* swizzle_data
 
     // This is put into a try-catch block to make sure we notice unknown configurations.
     std::vector<OutputRegisterInfo> output_info_table;
-        for (int i = 0; i < 7; ++i) {
+        for (unsigned i = 0; i < 7; ++i) {
             using OutputAttributes = Pica::Regs::VSOutputAttributes;
 
             // TODO: It's still unclear how the attribute components map to the register!
@@ -204,8 +195,8 @@ void DumpShader(const u32* binary_data, u32 binary_size, const u32* swizzle_data
                         it->component_mask = it->component_mask | component_mask;
                     }
                 } catch (const std::out_of_range& ) {
-                    _dbg_assert_msg_(GPU, 0, "Unknown output attribute mapping");
-                    ERROR_LOG(GPU, "Unknown output attribute mapping: %03x, %03x, %03x, %03x",
+                    _dbg_assert_msg_(HW_GPU, 0, "Unknown output attribute mapping");
+                    LOG_ERROR(HW_GPU, "Unknown output attribute mapping: %03x, %03x, %03x, %03x",
                               (int)output_attributes[i].map_x.Value(),
                               (int)output_attributes[i].map_y.Value(),
                               (int)output_attributes[i].map_z.Value(),
@@ -232,8 +223,8 @@ void DumpShader(const u32* binary_data, u32 binary_size, const u32* swizzle_data
     dvlp.binary_size_words = binary_size;
     QueueForWriting((u8*)binary_data, binary_size * sizeof(u32));
 
-    dvlp.swizzle_patterns_offset = write_offset - dvlp_offset;
-    dvlp.swizzle_patterns_num_entries = swizzle_size;
+    dvlp.swizzle_info_offset = write_offset - dvlp_offset;
+    dvlp.swizzle_info_num_entries = swizzle_size;
     u32 dummy = 0;
     for (unsigned int i = 0; i < swizzle_size; ++i) {
         QueueForWriting((u8*)&swizzle_data[i], sizeof(swizzle_data[i]));
@@ -265,7 +256,7 @@ static int is_pica_tracing = false;
 void StartPicaTracing()
 {
     if (is_pica_tracing) {
-        ERROR_LOG(GPU, "StartPicaTracing called even though tracing already running!");
+        LOG_WARNING(HW_GPU, "StartPicaTracing called even though tracing already running!");
         return;
     }
 
@@ -298,7 +289,7 @@ void OnPicaRegWrite(u32 id, u32 value)
 std::unique_ptr<PicaTrace> FinishPicaTracing()
 {
     if (!is_pica_tracing) {
-        ERROR_LOG(GPU, "FinishPicaTracing called even though tracing already running!");
+        LOG_WARNING(HW_GPU, "FinishPicaTracing called even though tracing isn't running!");
         return {};
     }
 
@@ -312,6 +303,173 @@ std::unique_ptr<PicaTrace> FinishPicaTracing()
     return std::move(ret);
 }
 
+const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const TextureInfo& info, bool disable_alpha) {
+    // Returns the color of texel (x, y) of the tiled texture at "source" as RGBA, decoded
+    // according to info.format. If disable_alpha is set, the returned alpha is always 255.
+    //
+    // Images are split into 8x8 tiles. Each tile is composed of four 4x4 subtiles, each of which
+    // is composed of four 2x2 subtiles of four texels each. Each structure is embedded into the
+    // next-bigger one in a diagonal pattern, e.g. texels are laid out in a 2x2 subtile like this:
+    // 2 3
+    // 0 1
+    // The full 8x8 tile has the texels arranged like this:
+    // 42 43 46 47 58 59 62 63
+    // 40 41 44 45 56 57 60 61
+    // 34 35 38 39 50 51 54 55
+    // 32 33 36 37 48 49 52 53
+    // 10 11 14 15 26 27 30 31
+    // 08 09 12 13 24 25 28 29
+    // 02 03 06 07 18 19 22 23
+    // 00 01 04 05 16 17 20 21
+
+    const unsigned int block_width = 8;
+    const unsigned int block_height = 8;
+
+    const unsigned int coarse_x = x & ~7;
+    const unsigned int coarse_y = y & ~7;
+
+    // Interleave the lower 3 bits of each coordinate to get the intra-block offsets, which are
+    // arranged in a Z-order curve. More details on the bit manipulation at:
+    // https://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/
+    unsigned int i = (x | (y << 8)) & 0x0707; // ---- -210
+    i = (i ^ (i << 2)) & 0x1313;              // ---2 --10
+    i = (i ^ (i << 1)) & 0x1515;              // ---2 -1-0
+    i = (i | (i >> 7)) & 0x3F;
+
+    source += coarse_y * info.stride;         // skip the rows of tiles above the one holding (x, y)
+    const unsigned int offset = coarse_x * block_height + i; // texel index within that row of tiles
+
+    switch (info.format) {
+    case Regs::TextureFormat::RGBA8:
+    {
+        const u8* source_ptr = source + offset * 4;
+        return { source_ptr[3], source_ptr[2], source_ptr[1], disable_alpha ? (u8)255 : source_ptr[0] };
+    }
+
+    case Regs::TextureFormat::RGB8:
+    {
+        const u8* source_ptr = source + offset * 3;
+        return { source_ptr[2], source_ptr[1], source_ptr[0], 255 };
+    }
+
+    case Regs::TextureFormat::RGBA5551:
+    {
+        const u16 source_ptr = *(const u16*)(source + offset * 2);
+        u8 r = (source_ptr >> 11) & 0x1F;
+        u8 g = ((source_ptr) >> 6) & 0x1F;
+        u8 b = (source_ptr >> 1) & 0x1F;
+        u8 a = source_ptr & 1;
+        return Math::MakeVec<u8>((r << 3) | (r >> 2), (g << 3) | (g >> 2), (b << 3) | (b >> 2), disable_alpha ? 255 : (a * 255));
+    }
+
+    case Regs::TextureFormat::RGB565:
+    {
+        const u16 source_ptr = *(const u16*)(source + offset * 2);
+        u8 r = (source_ptr >> 11) & 0x1F;
+        u8 g = ((source_ptr) >> 5) & 0x3F;
+        u8 b = (source_ptr) & 0x1F;
+        return Math::MakeVec<u8>((r << 3) | (r >> 2), (g << 2) | (g >> 4), (b << 3) | (b >> 2), 255);
+    }
+
+    case Regs::TextureFormat::RGBA4:
+    {
+        const u8* source_ptr = source + offset * 2;
+        u8 r = source_ptr[1] >> 4;
+        u8 g = source_ptr[1] & 0xF; // low nibble only; masking with 0xFF would keep the red bits
+        u8 b = source_ptr[0] >> 4;
+        u8 a = source_ptr[0] & 0xF; // low nibble only; masking with 0xFF would keep the blue bits
+        r = (r << 4) | r;
+        g = (g << 4) | g;
+        b = (b << 4) | b;
+        a = (a << 4) | a;
+        return { r, g, b, disable_alpha ? (u8)255 : a };
+    }
+
+    case Regs::TextureFormat::IA8:
+    {
+        const u8* source_ptr = source + offset * 2;
+
+        // TODO: component order not verified
+
+        if (disable_alpha) {
+            // Show intensity as red, alpha as green
+            return { source_ptr[0], source_ptr[1], 0, 255 };
+        } else {
+            return { source_ptr[0], source_ptr[0], source_ptr[0], source_ptr[1]};
+        }
+    }
+
+    case Regs::TextureFormat::I8:
+    {
+        const u8* source_ptr = source + offset;
+        return { *source_ptr, *source_ptr, *source_ptr, 255 };
+    }
+
+    case Regs::TextureFormat::A8:
+    {
+        const u8* source_ptr = source + offset;
+
+        if (disable_alpha) {
+            return { *source_ptr, *source_ptr, *source_ptr, 255 };
+        } else {
+            return { 0, 0, 0, *source_ptr };
+        }
+    }
+
+    case Regs::TextureFormat::IA4:
+    {
+        const u8* source_ptr = source + offset;
+        // Each texel occupies a full byte here (one nibble intensity, one nibble alpha).
+        // TODO: component order not verified
+
+        u8 i = (*source_ptr) & 0xF;
+        u8 a = ((*source_ptr) & 0xF0) >> 4;
+        a |= a << 4;
+        i |= i << 4;
+
+        if (disable_alpha) {
+            // Show intensity as red, alpha as green
+            return { i, a, 0, 255 };
+        } else {
+            return { i, i, i, a };
+        }
+    }
+
+    case Regs::TextureFormat::A4:
+    {
+        const u8* source_ptr = source + offset / 2;
+        // Two texels share one byte; the lowest bit of the texel offset selects the nibble.
+        // TODO: component order not verified
+
+        u8 a = (offset % 2) ? ((*source_ptr)&0xF) : (((*source_ptr) & 0xF0) >> 4);
+        a |= a << 4;
+
+        if (disable_alpha) {
+            return { a, a, a, 255 };
+        } else {
+            return { 0, 0, 0, a };
+        }
+    }
+
+    default:
+        LOG_ERROR(HW_GPU, "Unknown texture format: %x", (u32)info.format);
+        _dbg_assert_(HW_GPU, 0);
+        return {};
+    }
+}
+
+TextureInfo TextureInfo::FromPicaRegister(const Regs::TextureConfig& config,
+                                          const Regs::TextureFormat& format)
+{ // Builds a TextureInfo from the given texture config and texture format.
+    TextureInfo info;
+    info.physical_address = config.GetPhysicalAddress();
+    info.width = config.width;
+    info.height = config.height;
+    info.format = format;
+    info.stride = Pica::Regs::NibblesPerPixel(info.format) * info.width / 2; // nibbles per texel row / 2, i.e. presumably bytes per row - confirm
+    return info;
+}
+
 void DumpTexture(const Pica::Regs::TextureConfig& texture_config, u8* data) {
     // NOTE: Permanently enabling this just trashes hard disks for no reason.
     //       Hence, this is currently disabled.
@@ -341,7 +499,7 @@ void DumpTexture(const Pica::Regs::TextureConfig& texture_config, u8* data) {
     // Initialize write structure
     png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, nullptr, nullptr, nullptr);
     if (png_ptr == nullptr) {
-        ERROR_LOG(GPU, "Could not allocate write struct\n");
+        LOG_ERROR(Debug_GPU, "Could not allocate write struct\n");
         goto finalise;
 
     }
@@ -349,13 +507,13 @@ void DumpTexture(const Pica::Regs::TextureConfig& texture_config, u8* data) {
     // Initialize info structure
     info_ptr = png_create_info_struct(png_ptr);
     if (info_ptr == nullptr) {
-        ERROR_LOG(GPU, "Could not allocate info struct\n");
+        LOG_ERROR(Debug_GPU, "Could not allocate info struct\n");
         goto finalise;
     }
 
     // Setup Exception handling
     if (setjmp(png_jmpbuf(png_ptr))) {
-        ERROR_LOG(GPU, "Error during png creation\n");
+        LOG_ERROR(Debug_GPU, "Error during png creation\n");
         goto finalise;
     }
 
@@ -375,34 +533,22 @@ void DumpTexture(const Pica::Regs::TextureConfig& texture_config, u8* data) {
     png_write_info(png_ptr, info_ptr);
 
     buf = new u8[row_stride * texture_config.height];
-    for (int y = 0; y < texture_config.height; ++y) {
-        for (int x = 0; x < texture_config.width; ++x) {
-            // Cf. rasterizer code for an explanation of this algorithm.
-            int texel_index_within_tile = 0;
-            for (int block_size_index = 0; block_size_index < 3; ++block_size_index) {
-                int sub_tile_width = 1 << block_size_index;
-                int sub_tile_height = 1 << block_size_index;
-
-                int sub_tile_index = (x & sub_tile_width) << block_size_index;
-                sub_tile_index += 2 * ((y & sub_tile_height) << block_size_index);
-                texel_index_within_tile += sub_tile_index;
-            }
-
-            const int block_width = 8;
-            const int block_height = 8;
-
-            int coarse_x = (x / block_width) * block_width;
-            int coarse_y = (y / block_height) * block_height;
-
-            u8* source_ptr = (u8*)data + coarse_x * block_height * 3 + coarse_y * row_stride + texel_index_within_tile * 3;
-            buf[3 * x + y * row_stride    ] = source_ptr[2];
-            buf[3 * x + y * row_stride + 1] = source_ptr[1];
-            buf[3 * x + y * row_stride + 2] = source_ptr[0];
+    for (unsigned y = 0; y < texture_config.height; ++y) {
+        for (unsigned x = 0; x < texture_config.width; ++x) {
+            TextureInfo info;
+            info.width = texture_config.width;
+            info.height = texture_config.height;
+            info.stride = row_stride;
+            info.format = registers.texture0_format;
+            Math::Vec4<u8> texture_color = LookupTexture(data, x, y, info);
+            buf[3 * x + y * row_stride    ] = texture_color.r();
+            buf[3 * x + y * row_stride + 1] = texture_color.g();
+            buf[3 * x + y * row_stride + 2] = texture_color.b();
         }
     }
 
     // Write image data
-    for (auto y = 0; y < texture_config.height; ++y)
+    for (unsigned y = 0; y < texture_config.height; ++y)
     {
         u8* row_ptr = (u8*)buf + y * row_stride;
         u8* ptr = row_ptr;
@@ -431,26 +577,32 @@ void DumpTevStageConfig(const std::array<Pica::Regs::TevStageConfig,6>& stages)
     for (size_t index = 0; index < stages.size(); ++index) {
         const auto& tev_stage = stages[index];
 
-        const std::map<Source, std::string> source_map = {
+        static const std::map<Source, std::string> source_map = {
             { Source::PrimaryColor, "PrimaryColor" },
             { Source::Texture0, "Texture0" },
+            { Source::Texture1, "Texture1" },
+            { Source::Texture2, "Texture2" },
             { Source::Constant, "Constant" },
             { Source::Previous, "Previous" },
         };
 
-        const std::map<ColorModifier, std::string> color_modifier_map = {
-            { ColorModifier::SourceColor, { "%source.rgb" } }
+        static const std::map<ColorModifier, std::string> color_modifier_map = {
+            { ColorModifier::SourceColor, { "%source.rgb" } },
+            { ColorModifier::SourceAlpha, { "%source.aaa" } },
         };
-        const std::map<AlphaModifier, std::string> alpha_modifier_map = {
-            { AlphaModifier::SourceAlpha, "%source.a" }
+        static const std::map<AlphaModifier, std::string> alpha_modifier_map = {
+            { AlphaModifier::SourceAlpha, "%source.a" },
+            { AlphaModifier::OneMinusSourceAlpha, "(255 - %source.a)" },
         };
 
-        std::map<Operation, std::string> combiner_map = {
+        static const std::map<Operation, std::string> combiner_map = {
             { Operation::Replace, "%source1" },
             { Operation::Modulate, "(%source1 * %source2) / 255" },
+            { Operation::Add, "(%source1 + %source2)" },
+            { Operation::Lerp, "lerp(%source1, %source2, %source3)" },
         };
 
-        auto ReplacePattern =
+        static auto ReplacePattern =
                 [](const std::string& input, const std::string& pattern, const std::string& replacement) -> std::string {
                     size_t start = input.find(pattern);
                     if (start == std::string::npos)
@@ -460,8 +612,8 @@ void DumpTevStageConfig(const std::array<Pica::Regs::TevStageConfig,6>& stages)
                     ret.replace(start, pattern.length(), replacement);
                     return ret;
                 };
-        auto GetColorSourceStr =
-                [&source_map,&color_modifier_map,&ReplacePattern](const Source& src, const ColorModifier& modifier) {
+        static auto GetColorSourceStr =
+                [](const Source& src, const ColorModifier& modifier) {
                     auto src_it = source_map.find(src);
                     std::string src_str = "Unknown";
                     if (src_it != source_map.end())
@@ -474,8 +626,8 @@ void DumpTevStageConfig(const std::array<Pica::Regs::TevStageConfig,6>& stages)
 
                     return ReplacePattern(modifier_str, "%source", src_str);
                 };
-        auto GetColorCombinerStr =
-                [&](const Regs::TevStageConfig& tev_stage) {
+        static auto GetColorCombinerStr =
+                [](const Regs::TevStageConfig& tev_stage) {
                     auto op_it = combiner_map.find(tev_stage.color_op);
                     std::string op_str = "Unknown op (%source1, %source2, %source3)";
                     if (op_it != combiner_map.end())
@@ -485,8 +637,8 @@ void DumpTevStageConfig(const std::array<Pica::Regs::TevStageConfig,6>& stages)
                     op_str = ReplacePattern(op_str, "%source2", GetColorSourceStr(tev_stage.color_source2, tev_stage.color_modifier2));
                     return   ReplacePattern(op_str, "%source3", GetColorSourceStr(tev_stage.color_source3, tev_stage.color_modifier3));
                 };
-        auto GetAlphaSourceStr =
-                [&source_map,&alpha_modifier_map,&ReplacePattern](const Source& src, const AlphaModifier& modifier) {
+        static auto GetAlphaSourceStr =
+                [](const Source& src, const AlphaModifier& modifier) {
                     auto src_it = source_map.find(src);
                     std::string src_str = "Unknown";
                     if (src_it != source_map.end())
@@ -499,8 +651,8 @@ void DumpTevStageConfig(const std::array<Pica::Regs::TevStageConfig,6>& stages)
 
                     return ReplacePattern(modifier_str, "%source", src_str);
                 };
-        auto GetAlphaCombinerStr =
-                [&](const Regs::TevStageConfig& tev_stage) {
+        static auto GetAlphaCombinerStr =
+                [](const Regs::TevStageConfig& tev_stage) {
                     auto op_it = combiner_map.find(tev_stage.alpha_op);
                     std::string op_str = "Unknown op (%source1, %source2, %source3)";
                     if (op_it != combiner_map.end())
@@ -514,7 +666,7 @@ void DumpTevStageConfig(const std::array<Pica::Regs::TevStageConfig,6>& stages)
         stage_info += "Stage " + std::to_string(index) + ": " + GetColorCombinerStr(tev_stage) + "   " + GetAlphaCombinerStr(tev_stage) + "\n";
     }
 
-    DEBUG_LOG(GPU, "%s", stage_info.c_str());
+    LOG_TRACE(HW_GPU, "%s", stage_info.c_str());
 }
 
 } // namespace
diff --git a/src/video_core/debug_utils/debug_utils.h b/src/video_core/debug_utils/debug_utils.h
index b1558cfae..f361a5385 100644
--- a/src/video_core/debug_utils/debug_utils.h
+++ b/src/video_core/debug_utils/debug_utils.h
@@ -5,13 +5,148 @@
 #pragma once
 
 #include <array>
+#include <condition_variable>
+#include <list>
+#include <map>
 #include <memory>
+#include <mutex>
 #include <vector>
 
+#include "video_core/math.h"
 #include "video_core/pica.h"
 
 namespace Pica {
 
+/// Holds the PICA breakpoint configuration together with the observers to notify when a
+/// breakpoint is hit. Instances are created through Construct() and handed around as
+/// std::shared_ptr; observers only keep a weak reference back to the context.
+class DebugContext {
+public:
+    enum class Event {
+        FirstEvent = 0,
+
+        CommandLoaded = FirstEvent,
+        CommandProcessed,
+        IncomingPrimitiveBatch,
+        FinishedPrimitiveBatch,
+        VertexLoaded,
+
+        NumEvents
+    };
+
+    /**
+     * Inherit from this class to be notified of events registered to some debug context.
+     * Most importantly this is used for our debugger GUI.
+     *
+     * To implement event handling, override the OnPicaBreakPointHit and OnPicaResume methods.
+     * @warning All BreakPointObservers need to be on the same thread to guarantee thread-safe state access
+     * @todo Evaluate an alternative interface, in which there is only one managing observer and multiple child observers running (by design) on the same thread.
+     */
+    class BreakPointObserver {
+    public:
+        /// Constructs the object such that it observes events of the given DebugContext.
+        BreakPointObserver(std::shared_ptr<DebugContext> debug_context) : context_weak(debug_context) {
+            // A null context has nothing to register with; the observer then simply stays inert.
+            if (!debug_context)
+                return;
+            std::unique_lock<std::mutex> lock(debug_context->breakpoint_mutex);
+            debug_context->breakpoint_observers.push_back(this);
+        }
+
+        virtual ~BreakPointObserver() {
+            auto context = context_weak.lock();
+            if (context) {
+                std::unique_lock<std::mutex> lock(context->breakpoint_mutex);
+                context->breakpoint_observers.remove(this);
+
+                // If we are the last observer to be destroyed, tell the debugger context that
+                // it is free to continue. In particular, this is required for a proper Citra
+                // shutdown, when the emulation thread is waiting at a breakpoint.
+                if (context->breakpoint_observers.empty())
+                    context->Resume();
+            }
+        }
+
+        /**
+         * Action to perform when a breakpoint was reached.
+         * @param event Type of event which triggered the breakpoint
+         * @param data Optional data pointer (if unused, this is a nullptr)
+         * @note This function will perform nothing unless it is overridden in the child class.
+         */
+        virtual void OnPicaBreakPointHit(Event, void*) {}
+
+        /**
+         * Action to perform when emulation is resumed from a breakpoint.
+         * @note This function will perform nothing unless it is overridden in the child class.
+         */
+        virtual void OnPicaResume() {}
+
+    protected:
+        /**
+         * Weak context pointer. This need not be valid, so when requesting a shared_ptr via
+         * context_weak.lock(), always compare the result against nullptr.
+         */
+        std::weak_ptr<DebugContext> context_weak;
+    };
+
+    /// Simple structure defining a breakpoint state
+    struct BreakPoint {
+        bool enabled = false;
+    };
+
+    /// Static constructor used to create a shared_ptr of a DebugContext (the default constructor is private).
+    static std::shared_ptr<DebugContext> Construct() {
+        return std::shared_ptr<DebugContext>(new DebugContext);
+    }
+
+    /**
+     * Used by the emulation core when a given event has happened. If a breakpoint has been set
+     * for this event, OnEvent calls the event handlers of the registered breakpoint observers.
+     * The current thread then is halted until Resume() is called from another thread (or until
+     * emulation is stopped).
+     * @param event Event which has happened
+     * @param data Optional data pointer (pass nullptr if unused). Needs to remain valid until Resume() is called.
+     */
+    void OnEvent(Event event, void* data);
+
+    /**
+     * Resume from the current breakpoint.
+     * @warning Calling this from the same thread that OnEvent was called in will cause a deadlock. Calling from any other thread is safe.
+     */
+    void Resume();
+
+    /**
+     * Delete all set breakpoints and resume emulation.
+     */
+    void ClearBreakpoints() {
+        breakpoints.clear();
+        Resume();
+    }
+
+    // TODO: Evaluate if access to these members should be hidden behind a public interface.
+    std::map<Event, BreakPoint> breakpoints;
+    Event active_breakpoint = Event::FirstEvent; // initialized so reads before the first breakpoint are well-defined
+    bool at_breakpoint = false;
+
+private:
+    /**
+     * Private default constructor to make sure people always construct this through Construct()
+     * instead.
+     */
+    DebugContext() = default;
+
+    /// Mutex protecting current breakpoint state and the observer list.
+    std::mutex breakpoint_mutex;
+
+    /// Used by OnEvent to wait for resumption.
+    std::condition_variable resume_from_breakpoint;
+
+    /// List of registered observers
+    std::list<BreakPointObserver*> breakpoint_observers;
+};
+
+extern std::shared_ptr<DebugContext> g_debug_context; // TODO: Get rid of this global
+
 namespace DebugUtils {
 
 // Simple utility class for dumping geometry data to an OBJ file
@@ -57,6 +192,28 @@ bool IsPicaTracing();
 void OnPicaRegWrite(u32 id, u32 value);
 std::unique_ptr<PicaTrace> FinishPicaTracing();
 
+struct TextureInfo { // Texture description passed to LookupTexture()
+    PAddr physical_address;
+    int width;
+    int height;
+    int stride; // NOTE(review): presumably the row pitch in bytes - confirm against LookupTexture()
+    Pica::Regs::TextureFormat format;
+    // No member has a default initializer: callers must set every field they rely on.
+    static TextureInfo FromPicaRegister(const Pica::Regs::TextureConfig& config,
+                                        const Pica::Regs::TextureFormat& format);
+};
+
+/**
+ * Lookup texel located at the given coordinates and return an RGBA vector of its color.
+ * @param source Source pointer to read data from
+ * @param s,t Texture coordinates to read from
+ * @param info TextureInfo object describing the texture setup
+ * @param disable_alpha This is used for debug widgets which use this method to display textures without providing a good way to visualize alpha by themselves. If true, this will return 255 for the alpha component, and either drop the information entirely or store it in an "unused" color channel.
+ * @todo Eventually we should get rid of the disable_alpha parameter.
+ */
+const Math::Vec4<u8> LookupTexture(const u8* source, int s, int t, const TextureInfo& info,
+                                   bool disable_alpha = false);
+
 void DumpTexture(const Pica::Regs::TextureConfig& texture_config, u8* data);
 
 void DumpTevStageConfig(const std::array<Pica::Regs::TevStageConfig,6>& stages);
diff --git a/src/video_core/gpu_debugger.h b/src/video_core/gpu_debugger.h
index 1242eb58f..a51d49c92 100644
--- a/src/video_core/gpu_debugger.h
+++ b/src/video_core/gpu_debugger.h
@@ -1,5 +1,5 @@
 // Copyright 2014 Citra Emulator Project
-// Licensed under GPLv2
+// Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
 #pragma once
@@ -39,7 +39,7 @@ public:
         virtual void GXCommandProcessed(int total_command_count)
         {
             const GSP_GPU::Command& cmd = observed->ReadGXCommandHistory(total_command_count-1);
-            ERROR_LOG(GSP, "Received command: id=%x", (int)cmd.id.Value());
+            LOG_TRACE(Debug_GPU, "Received command: id=%x", (int)cmd.id.Value());
         }
 
     protected:
@@ -85,7 +85,7 @@ public:
 
     void UnregisterObserver(DebuggerObserver* observer)
     {
-        std::remove(observers.begin(), observers.end(), observer);
+        observers.erase(std::remove(observers.begin(), observers.end(), observer), observers.end());
         observer->observed = nullptr;
     }
 
diff --git a/src/video_core/math.h b/src/video_core/math.h
index 83ba81235..9622e7614 100644
--- a/src/video_core/math.h
+++ b/src/video_core/math.h
@@ -1,4 +1,4 @@
-// Licensed under GPLv2
+// Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
 
diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index 5fe15a218..38bac748c 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -1,5 +1,5 @@
 // Copyright 2014 Citra Emulator Project
-// Licensed under GPLv2
+// Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
 #pragma once
@@ -8,6 +8,7 @@
 #include <cstddef>
 #include <initializer_list>
 #include <map>
+#include <vector>
 
 #include "common/bit_field.h"
 #include "common/common_types.h"
@@ -45,10 +46,16 @@ struct Regs {
 #define INSERT_PADDING_WORDS_HELPER2(x, y) INSERT_PADDING_WORDS_HELPER1(x, y)
 #define INSERT_PADDING_WORDS(num_words) u32 INSERT_PADDING_WORDS_HELPER2(pad, __LINE__)[(num_words)];
 
-    INSERT_PADDING_WORDS(0x41);
+    INSERT_PADDING_WORDS(0x10);
+
+    u32 trigger_irq;
+
+    INSERT_PADDING_WORDS(0x30);
 
     BitField<0, 24, u32> viewport_size_x;
+
     INSERT_PADDING_WORDS(0x1);
+
     BitField<0, 24, u32> viewport_size_y;
 
     INSERT_PADDING_WORDS(0x9);
@@ -98,6 +105,11 @@ struct Regs {
     INSERT_PADDING_WORDS(0x17);
 
     struct TextureConfig {
+        enum WrapMode : u32 {
+            ClampToEdge = 0,
+            Repeat      = 2,
+        };
+
         INSERT_PADDING_WORDS(0x1);
 
         union {
@@ -105,12 +117,17 @@ struct Regs {
             BitField<16, 16, u32> width;
         };
 
-        INSERT_PADDING_WORDS(0x2);
+        union {
+            BitField< 8, 2, WrapMode> wrap_s;
+            BitField<11, 2, WrapMode> wrap_t;
+        };
+
+        INSERT_PADDING_WORDS(0x1);
 
         u32 address;
 
-        u32 GetPhysicalAddress() {
-            return DecodeAddressRegister(address) - Memory::FCRAM_PADDR + Memory::HEAP_GSP_VADDR;
+        u32 GetPhysicalAddress() const {
+            return DecodeAddressRegister(address);
         }
 
         // texture1 and texture2 store the texture format directly after the address
@@ -125,17 +142,70 @@ struct Regs {
         RGBA5551     =  2,
         RGB565       =  3,
         RGBA4        =  4,
+        IA8          =  5,
+
+        I8           =  7,
+        A8           =  8,
+        IA4          =  9,
 
+        A4           = 11,
         // TODO: Support for the other formats is not implemented, yet.
         // Seems like they are luminance formats and compressed textures.
     };
 
-    BitField<0, 1, u32> texturing_enable;
+    /// Size of one texel of the given format, expressed in 4-bit nibbles.
+    static unsigned NibblesPerPixel(TextureFormat format) {
+        unsigned nibbles = 2;  // I8, A8, IA4 and every format not handled explicitly below
+        switch (format) {
+        case TextureFormat::RGBA8:
+            nibbles = 8;
+            break;
+        case TextureFormat::RGB8:
+            nibbles = 6;
+            break;
+        case TextureFormat::RGBA5551:
+        case TextureFormat::RGB565:
+        case TextureFormat::RGBA4:
+        case TextureFormat::IA8:
+            nibbles = 4;
+            break;
+        case TextureFormat::A4:
+            nibbles = 1;
+            break;
+        default:  // placeholder for yet unknown formats
+            break;
+        }
+        return nibbles;
+    }
+
+    union {
+        BitField< 0, 1, u32> texture0_enable;
+        BitField< 1, 1, u32> texture1_enable;
+        BitField< 2, 1, u32> texture2_enable;
+    };
     TextureConfig texture0;
     INSERT_PADDING_WORDS(0x8);
     BitField<0, 4, TextureFormat> texture0_format;
-
-    INSERT_PADDING_WORDS(0x31);
+    INSERT_PADDING_WORDS(0x2);
+    TextureConfig texture1;
+    BitField<0, 4, TextureFormat> texture1_format;
+    INSERT_PADDING_WORDS(0x2);
+    TextureConfig texture2;
+    BitField<0, 4, TextureFormat> texture2_format;
+    INSERT_PADDING_WORDS(0x21);
+
+    /// Enable bit, register block and texel format describing one texture unit.
+    struct FullTextureConfig {
+        const bool enabled;
+        const TextureConfig config;
+        const TextureFormat format;
+    };
+    /// Gathers the state of texture units 0, 1 and 2 (in that order) into a single array.
+    const std::array<FullTextureConfig, 3> GetTextures() const {
+        return {{ { texture0_enable.ToBool(), texture0, texture0_format },
+                  { texture1_enable.ToBool(), texture1, texture1_format },
+                  { texture2_enable.ToBool(), texture2, texture2_format } }};
+    }
 
     // 0xc0-0xff: Texture Combiner (akin to glTexEnv)
     struct TevStageConfig {
@@ -257,11 +327,11 @@ struct Regs {
 
         INSERT_PADDING_WORDS(0x1);
 
-        inline u32 GetColorBufferAddress() const {
-            return Memory::PhysicalToVirtualAddress(DecodeAddressRegister(color_buffer_address));
+        inline u32 GetColorBufferPhysicalAddress() const {
+            return DecodeAddressRegister(color_buffer_address);
         }
-        inline u32 GetDepthBufferAddress() const {
-            return Memory::PhysicalToVirtualAddress(DecodeAddressRegister(depth_buffer_address));
+        inline u32 GetDepthBufferPhysicalAddress() const {
+            return DecodeAddressRegister(depth_buffer_address);
         }
 
         inline u32 GetWidth() const {
@@ -285,9 +355,8 @@ struct Regs {
 
         BitField<0, 29, u32> base_address;
 
-        inline u32 GetBaseAddress() const {
-            // TODO: Ugly, should fix PhysicalToVirtualAddress instead
-            return DecodeAddressRegister(base_address) - Memory::FCRAM_PADDR + Memory::HEAP_GSP_VADDR;
+        u32 GetPhysicalBaseAddress() const {
+            return DecodeAddressRegister(base_address);
         }
 
         // Descriptor for internal vertex attributes
@@ -423,7 +492,11 @@ struct Regs {
 
     BitField<8, 2, TriangleTopology> triangle_topology;
 
-    INSERT_PADDING_WORDS(0x5b);
+    INSERT_PADDING_WORDS(0x51);
+
+    BitField<0, 16, u32> vs_bool_uniforms;
+
+    INSERT_PADDING_WORDS(0x9);
 
     // Offset to shader program entry point (in words)
     BitField<0, 16, u32> vs_main_offset;
@@ -517,26 +590,27 @@ struct Regs {
     static std::string GetCommandName(int index) {
         std::map<u32, std::string> map;
 
-        // TODO: MSVC does not support using offsetof() on non-static data members even though this
-        //       is technically allowed since C++11. Hence, this functionality is disabled until
-        //       MSVC properly supports it.
-        #ifndef _MSC_VER
-        Regs regs;
         #define ADD_FIELD(name)                                                                               \
             do {                                                                                              \
                 map.insert({PICA_REG_INDEX(name), #name});                                                    \
-                for (u32 i = PICA_REG_INDEX(name) + 1; i < PICA_REG_INDEX(name) + sizeof(regs.name) / 4; ++i) \
+                /* TODO: change to Regs::name when VS2015 and other compilers support it  */                   \
+                for (u32 i = PICA_REG_INDEX(name) + 1; i < PICA_REG_INDEX(name) + sizeof(Regs().name) / 4; ++i) \
                     map.insert({i, #name + std::string("+") + std::to_string(i-PICA_REG_INDEX(name))});       \
             } while(false)
 
+        ADD_FIELD(trigger_irq);
         ADD_FIELD(viewport_size_x);
         ADD_FIELD(viewport_size_y);
         ADD_FIELD(viewport_depth_range);
         ADD_FIELD(viewport_depth_far_plane);
         ADD_FIELD(viewport_corner);
-        ADD_FIELD(texturing_enable);
+        ADD_FIELD(texture0_enable);
         ADD_FIELD(texture0);
         ADD_FIELD(texture0_format);
+        ADD_FIELD(texture1);
+        ADD_FIELD(texture1_format);
+        ADD_FIELD(texture2);
+        ADD_FIELD(texture2_format);
         ADD_FIELD(tev_stage0);
         ADD_FIELD(tev_stage1);
         ADD_FIELD(tev_stage2);
@@ -550,6 +624,7 @@ struct Regs {
         ADD_FIELD(trigger_draw);
         ADD_FIELD(trigger_draw_indexed);
         ADD_FIELD(triangle_topology);
+        ADD_FIELD(vs_bool_uniforms);
         ADD_FIELD(vs_main_offset);
         ADD_FIELD(vs_input_register_map);
         ADD_FIELD(vs_uniform_setup);
@@ -557,7 +632,6 @@ struct Regs {
         ADD_FIELD(vs_swizzle_patterns);
 
         #undef ADD_FIELD
-        #endif // _MSC_VER
 
         // Return empty string if no match is found
         return map[index];
@@ -593,6 +667,7 @@ private:
 #ifndef _MSC_VER
 #define ASSERT_REG_POSITION(field_name, position) static_assert(offsetof(Regs, field_name) == position * 4, "Field "#field_name" has invalid position")
 
+ASSERT_REG_POSITION(trigger_irq, 0x10);
 ASSERT_REG_POSITION(viewport_size_x, 0x41);
 ASSERT_REG_POSITION(viewport_size_y, 0x43);
 ASSERT_REG_POSITION(viewport_depth_range, 0x4d);
@@ -600,9 +675,13 @@ ASSERT_REG_POSITION(viewport_depth_far_plane, 0x4e);
 ASSERT_REG_POSITION(vs_output_attributes[0], 0x50);
 ASSERT_REG_POSITION(vs_output_attributes[1], 0x51);
 ASSERT_REG_POSITION(viewport_corner, 0x68);
-ASSERT_REG_POSITION(texturing_enable, 0x80);
+ASSERT_REG_POSITION(texture0_enable, 0x80);
 ASSERT_REG_POSITION(texture0, 0x81);
 ASSERT_REG_POSITION(texture0_format, 0x8e);
+ASSERT_REG_POSITION(texture1, 0x91);
+ASSERT_REG_POSITION(texture1_format, 0x96);
+ASSERT_REG_POSITION(texture2, 0x99);
+ASSERT_REG_POSITION(texture2_format, 0x9e);
 ASSERT_REG_POSITION(tev_stage0, 0xc0);
 ASSERT_REG_POSITION(tev_stage1, 0xc8);
 ASSERT_REG_POSITION(tev_stage2, 0xd0);
@@ -616,6 +695,7 @@ ASSERT_REG_POSITION(num_vertices, 0x228);
 ASSERT_REG_POSITION(trigger_draw, 0x22e);
 ASSERT_REG_POSITION(trigger_draw_indexed, 0x22f);
 ASSERT_REG_POSITION(triangle_topology, 0x25e);
+ASSERT_REG_POSITION(vs_bool_uniforms, 0x2b0);
 ASSERT_REG_POSITION(vs_main_offset, 0x2ba);
 ASSERT_REG_POSITION(vs_input_register_map, 0x2bb);
 ASSERT_REG_POSITION(vs_uniform_setup, 0x2c0);
@@ -677,6 +757,26 @@ struct float24 {
         return float24::FromFloat32(ToFloat32() - flt.ToFloat32());
     }
 
+    float24& operator *= (const float24& flt) { // in-place multiply, applied to the stored float
+        value = value * flt.ToFloat32();
+        return *this;
+    }
+
+    float24& operator /= (const float24& flt) { // in-place divide, applied to the stored float
+        value = value / flt.ToFloat32();
+        return *this;
+    }
+
+    float24& operator += (const float24& flt) { // in-place add, applied to the stored float
+        value = value + flt.ToFloat32();
+        return *this;
+    }
+
+    float24& operator -= (const float24& flt) { // in-place subtract, applied to the stored float
+        value = value - flt.ToFloat32();
+        return *this;
+    }
+
     float24 operator - () const {
         return float24::FromFloat32(-ToFloat32());
     }
@@ -697,6 +797,14 @@ struct float24 {
         return ToFloat32() <= flt.ToFloat32();
     }
 
+    bool operator == (const float24& flt) const { // exact comparison of the float32 values
+        return flt.ToFloat32() == ToFloat32();
+    }
+
+    bool operator != (const float24& flt) const { // exact comparison of the float32 values
+        return flt.ToFloat32() != ToFloat32();
+    }
+
 private:
     // Stored as a regular float, merely for convenience
     // TODO: Perform proper arithmetic on this!
@@ -714,5 +822,15 @@ union CommandHeader {
     BitField<31,  1, u32> group_commands;
 };
 
+// TODO: Ugly, should fix PhysicalToVirtualAddress instead
+// Maps a physical address inside VRAM or FCRAM to its virtual address; anything else yields 0.
+inline static u32 PAddrToVAddr(u32 addr) {
+    if (addr >= Memory::VRAM_PADDR && addr < Memory::VRAM_PADDR + Memory::VRAM_SIZE)
+        return addr - Memory::VRAM_PADDR + Memory::VRAM_VADDR;
+    if (addr >= Memory::FCRAM_PADDR && addr < Memory::FCRAM_PADDR + Memory::FCRAM_SIZE)
+        return addr - Memory::FCRAM_PADDR + Memory::HEAP_LINEAR_VADDR;
+
+    return 0;
+}
 
 } // namespace
diff --git a/src/video_core/primitive_assembly.cpp b/src/video_core/primitive_assembly.cpp
index dabf2d1a3..242a07e26 100644
--- a/src/video_core/primitive_assembly.cpp
+++ b/src/video_core/primitive_assembly.cpp
@@ -1,5 +1,5 @@
 // Copyright 2014 Citra Emulator Project
-// Licensed under GPLv2
+// Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
 #include "pica.h"
@@ -30,20 +30,27 @@ void PrimitiveAssembler<VertexType>::SubmitVertex(VertexType& vtx, TriangleHandl
             }
             break;
 
+        case Regs::TriangleTopology::Strip:
         case Regs::TriangleTopology::Fan:
-            if (buffer_index == 2) {
-                buffer_index = 0;
-
-                triangle_handler(buffer[0], buffer[1], vtx);
+            if (strip_ready) {
+                // TODO: Should be "buffer[0], buffer[1], vtx" instead!
+                // Not quite sure why we need this order for things to show up properly.
+                // Maybe a bug in the rasterizer?
+                triangle_handler(buffer[1], buffer[0], vtx);
+            }
+            buffer[buffer_index] = vtx;
 
-                buffer[1] = vtx;
-            } else {
-                buffer[buffer_index++] = vtx;
+            if (topology == Regs::TriangleTopology::Strip) {
+                strip_ready |= (buffer_index == 1);
+                buffer_index = !buffer_index;
+            } else if (topology == Regs::TriangleTopology::Fan) {
+                strip_ready |= (buffer_index == 1); // ready only once the hub (buffer[0]) and one rim vertex (buffer[1]) are stored
+                buffer_index = 1;                   // buffer[0] keeps the hub; every later vertex overwrites buffer[1]
             }
             break;
 
         default:
-            ERROR_LOG(GPU, "Unknown triangle topology %x:", (int)topology);
+            LOG_ERROR(HW_GPU, "Unknown triangle topology %x:", (int)topology);
             break;
     }
 }
diff --git a/src/video_core/primitive_assembly.h b/src/video_core/primitive_assembly.h
index ea2e2f61e..52ff4cd89 100644
--- a/src/video_core/primitive_assembly.h
+++ b/src/video_core/primitive_assembly.h
@@ -1,5 +1,5 @@
 // Copyright 2014 Citra Emulator Project
-// Licensed under GPLv2
+// Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
 #pragma once
@@ -37,6 +37,7 @@ private:
 
     int buffer_index;
     VertexType buffer[2];
+    bool strip_ready = false;
 };
 
 
diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index a35f0c0d8..a80148872 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -1,5 +1,5 @@
 // Copyright 2014 Citra Emulator Project
-// Licensed under GPLv2
+// Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
 #include <algorithm>
@@ -18,7 +18,7 @@ namespace Pica {
 namespace Rasterizer {
 
 static void DrawPixel(int x, int y, const Math::Vec4<u8>& color) {
-    u32* color_buffer = (u32*)Memory::GetPointer(registers.framebuffer.GetColorBufferAddress());
+    u32* color_buffer = reinterpret_cast<u32*>(Memory::GetPointer(PAddrToVAddr(registers.framebuffer.GetColorBufferPhysicalAddress())));
     u32 value = (color.a() << 24) | (color.r() << 16) | (color.g() << 8) | color.b();
 
     // Assuming RGBA8 format until actual framebuffer format handling is implemented
@@ -26,14 +26,14 @@ static void DrawPixel(int x, int y, const Math::Vec4<u8>& color) {
 }
 
 static u32 GetDepth(int x, int y) {
-    u16* depth_buffer = (u16*)Memory::GetPointer(registers.framebuffer.GetDepthBufferAddress());
+    u16* depth_buffer = reinterpret_cast<u16*>(Memory::GetPointer(PAddrToVAddr(registers.framebuffer.GetDepthBufferPhysicalAddress())));
 
     // Assuming 16-bit depth buffer format until actual format handling is implemented
     return *(depth_buffer + x + y * registers.framebuffer.GetWidth());
 }
 
 static void SetDepth(int x, int y, u16 value) {
-    u16* depth_buffer = (u16*)Memory::GetPointer(registers.framebuffer.GetDepthBufferAddress());
+    u16* depth_buffer = reinterpret_cast<u16*>(Memory::GetPointer(PAddrToVAddr(registers.framebuffer.GetDepthBufferPhysicalAddress())));
 
     // Assuming 16-bit depth buffer format until actual format handling is implemented
     *(depth_buffer + x + y * registers.framebuffer.GetWidth()) = value;
@@ -106,6 +106,11 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
     int bias1 = IsRightSideOrFlatBottomEdge(vtxpos[1].xy(), vtxpos[2].xy(), vtxpos[0].xy()) ? -1 : 0;
     int bias2 = IsRightSideOrFlatBottomEdge(vtxpos[2].xy(), vtxpos[0].xy(), vtxpos[1].xy()) ? -1 : 0;
 
+    auto w_inverse = Math::MakeVec(v0.pos.w, v1.pos.w, v2.pos.w);
+
+    auto textures = registers.GetTextures();
+    auto tev_stages = registers.GetTevStages();
+
     // TODO: Not sure if looping through x first might be faster
     for (u16 y = min_y; y < max_y; y += 0x10) {
         for (u16 x = min_x; x < max_x; x += 0x10) {
@@ -129,6 +134,11 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
             if (w0 < 0 || w1 < 0 || w2 < 0)
                 continue;
 
+            auto baricentric_coordinates = Math::MakeVec(float24::FromFloat32(static_cast<float>(w0)),
+                                                float24::FromFloat32(static_cast<float>(w1)),
+                                                float24::FromFloat32(static_cast<float>(w2)));
+            float24 interpolated_w_inverse = float24::FromFloat32(1.0f) / Math::Dot(w_inverse, baricentric_coordinates);
+
             // Perspective correct attribute interpolation:
             // Attribute values cannot be calculated by simple linear interpolation since
             // they are not linear in screen space. For example, when interpolating a
@@ -145,19 +155,9 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
             //
             // The generalization to three vertices is straightforward in baricentric coordinates.
             auto GetInterpolatedAttribute = [&](float24 attr0, float24 attr1, float24 attr2) {
-                auto attr_over_w = Math::MakeVec(attr0 / v0.pos.w,
-                                                 attr1 / v1.pos.w,
-                                                 attr2 / v2.pos.w);
-                auto w_inverse   = Math::MakeVec(float24::FromFloat32(1.f) / v0.pos.w,
-                                                 float24::FromFloat32(1.f) / v1.pos.w,
-                                                 float24::FromFloat32(1.f) / v2.pos.w);
-                auto baricentric_coordinates = Math::MakeVec(float24::FromFloat32(static_cast<float>(w0)),
-                                                             float24::FromFloat32(static_cast<float>(w1)),
-                                                             float24::FromFloat32(static_cast<float>(w2)));
-
+                auto attr_over_w = Math::MakeVec(attr0, attr1, attr2);
                 float24 interpolated_attr_over_w = Math::Dot(attr_over_w, baricentric_coordinates);
-                float24 interpolated_w_inverse   = Math::Dot(w_inverse,   baricentric_coordinates);
-                return interpolated_attr_over_w / interpolated_w_inverse;
+                return interpolated_attr_over_w * interpolated_w_inverse;
             };
 
             Math::Vec4<u8> primary_color{
@@ -167,60 +167,48 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
                 (u8)(GetInterpolatedAttribute(v0.color.a(), v1.color.a(), v2.color.a()).ToFloat32() * 255)
             };
 
-            Math::Vec4<u8> texture_color{};
-            float24 u = GetInterpolatedAttribute(v0.tc0.u(), v1.tc0.u(), v2.tc0.u());
-            float24 v = GetInterpolatedAttribute(v0.tc0.v(), v1.tc0.v(), v2.tc0.v());
-            if (registers.texturing_enable) {
-                // Images are split into 8x8 tiles. Each tile is composed of four 4x4 subtiles each
-                // of which is composed of four 2x2 subtiles each of which is composed of four texels.
-                // Each structure is embedded into the next-bigger one in a diagonal pattern, e.g.
-                // texels are laid out in a 2x2 subtile like this:
-                // 2 3
-                // 0 1
-                //
-                // The full 8x8 tile has the texels arranged like this:
-                //
-                // 42 43 46 47 58 59 62 63
-                // 40 41 44 45 56 57 60 61
-                // 34 35 38 39 50 51 54 55
-                // 32 33 36 37 48 49 52 53
-                // 10 11 14 15 26 27 30 31
-                // 08 09 12 13 24 25 28 29
-                // 02 03 06 07 18 19 22 23
-                // 00 01 04 05 16 17 20 21
-
-                // TODO: This is currently hardcoded for RGB8
-                u32* texture_data = (u32*)Memory::GetPointer(registers.texture0.GetPhysicalAddress());
-
-                // TODO(neobrain): Not sure if this swizzling pattern is used for all textures.
-                // To be flexible in case different but similar patterns are used, we keep this
-                // somewhat inefficient code around for now.
-                int s = (int)(u * float24::FromFloat32(static_cast<float>(registers.texture0.width))).ToFloat32();
-                int t = (int)(v * float24::FromFloat32(static_cast<float>(registers.texture0.height))).ToFloat32();
-                int texel_index_within_tile = 0;
-                for (int block_size_index = 0; block_size_index < 3; ++block_size_index) {
-                    int sub_tile_width = 1 << block_size_index;
-                    int sub_tile_height = 1 << block_size_index;
-
-                    int sub_tile_index = (s & sub_tile_width) << block_size_index;
-                    sub_tile_index += 2 * ((t & sub_tile_height) << block_size_index);
-                    texel_index_within_tile += sub_tile_index;
-                }
-
-                const int block_width = 8;
-                const int block_height = 8;
-
-                int coarse_s = (s / block_width) * block_width;
-                int coarse_t = (t / block_height) * block_height;
-
-                const int row_stride = registers.texture0.width * 3;
-                u8* source_ptr = (u8*)texture_data + coarse_s * block_height * 3 + coarse_t * row_stride + texel_index_within_tile * 3;
-                texture_color.r() = source_ptr[2];
-                texture_color.g() = source_ptr[1];
-                texture_color.b() = source_ptr[0];
-                texture_color.a() = 0xFF;
-
-                DebugUtils::DumpTexture(registers.texture0, (u8*)texture_data);
+            Math::Vec2<float24> uv[3];
+            uv[0].u() = GetInterpolatedAttribute(v0.tc0.u(), v1.tc0.u(), v2.tc0.u());
+            uv[0].v() = GetInterpolatedAttribute(v0.tc0.v(), v1.tc0.v(), v2.tc0.v());
+            uv[1].u() = GetInterpolatedAttribute(v0.tc1.u(), v1.tc1.u(), v2.tc1.u());
+            uv[1].v() = GetInterpolatedAttribute(v0.tc1.v(), v1.tc1.v(), v2.tc1.v());
+            uv[2].u() = GetInterpolatedAttribute(v0.tc2.u(), v1.tc2.u(), v2.tc2.u());
+            uv[2].v() = GetInterpolatedAttribute(v0.tc2.v(), v1.tc2.v(), v2.tc2.v());
+
+            Math::Vec4<u8> texture_color[3]{};
+            for (int i = 0; i < 3; ++i) { // sample every enabled texture unit at this pixel
+                const auto& texture = textures[i];
+                if (!texture.enabled)
+                    continue; // disabled units keep their value-initialized texture_color entry
+
+                _dbg_assert_(HW_GPU, 0 != texture.config.address);
+
+                int s = (int)(uv[i].u() * float24::FromFloat32(static_cast<float>(texture.config.width))).ToFloat32();
+                int t = (int)(uv[i].v() * float24::FromFloat32(static_cast<float>(texture.config.height))).ToFloat32();
+                auto GetWrappedTexCoord = [](Regs::TextureConfig::WrapMode mode, int val, unsigned size) { // maps val into [0, size) per wrap mode
+                    switch (mode) {
+                        case Regs::TextureConfig::ClampToEdge:
+                            val = std::max(val, 0);
+                            val = std::min(val, (int)size - 1);
+                            return val;
+
+                        case Regs::TextureConfig::Repeat:
+                            return (int)(((unsigned)val) % size);
+
+                        default:
+                            LOG_ERROR(HW_GPU, "Unknown texture coordinate wrapping mode %x\n", (int)mode);
+                            _dbg_assert_(HW_GPU, 0);
+                            return 0;
+                    }
+                };
+                s = GetWrappedTexCoord(texture.config.wrap_s, s, texture.config.width); // use this unit's own wrap mode and size, not texture0's
+                t = GetWrappedTexCoord(texture.config.wrap_t, t, texture.config.height); // same for the vertical coordinate
+
+                u8* texture_data = Memory::GetPointer(PAddrToVAddr(texture.config.GetPhysicalAddress()));
+                auto info = DebugUtils::TextureInfo::FromPicaRegister(texture.config, texture.format);
+
+                texture_color[i] = DebugUtils::LookupTexture(texture_data, s, t, info);
+                DebugUtils::DumpTexture(texture.config, texture_data);
             }
 
             // Texture environment - consists of 6 stages of color and alpha combining.
@@ -231,28 +219,35 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
             // with some basic arithmetic. Alpha combiners can be configured separately but work
             // analogously.
             Math::Vec4<u8> combiner_output;
-            for (auto tev_stage : registers.GetTevStages()) {
+            for (const auto& tev_stage : tev_stages) {
                 using Source = Regs::TevStageConfig::Source;
                 using ColorModifier = Regs::TevStageConfig::ColorModifier;
                 using AlphaModifier = Regs::TevStageConfig::AlphaModifier;
                 using Operation = Regs::TevStageConfig::Operation;
 
-                auto GetColorSource = [&](Source source) -> Math::Vec3<u8> {
+                auto GetColorSource = [&](Source source) -> Math::Vec4<u8> {
                     switch (source) {
                     case Source::PrimaryColor:
-                        return primary_color.rgb();
+                        return primary_color;
 
                     case Source::Texture0:
-                        return texture_color.rgb();
+                        return texture_color[0];
+
+                    case Source::Texture1:
+                        return texture_color[1];
+
+                    case Source::Texture2:
+                        return texture_color[2];
 
                     case Source::Constant:
-                        return {tev_stage.const_r, tev_stage.const_g, tev_stage.const_b};
+                        return {tev_stage.const_r, tev_stage.const_g, tev_stage.const_b, tev_stage.const_a};
 
                     case Source::Previous:
-                        return combiner_output.rgb();
+                        return combiner_output;
 
                     default:
-                        ERROR_LOG(GPU, "Unknown color combiner source %d\n", (int)source);
+                        LOG_ERROR(HW_GPU, "Unknown color combiner source %d\n", (int)source);
+                        _dbg_assert_(HW_GPU, 0);
                         return {};
                     }
                 };
@@ -263,7 +258,13 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
                         return primary_color.a();
 
                     case Source::Texture0:
-                        return texture_color.a();
+                        return texture_color[0].a();
+
+                    case Source::Texture1:
+                        return texture_color[1].a();
+
+                    case Source::Texture2:
+                        return texture_color[2].a();
 
                     case Source::Constant:
                         return tev_stage.const_a;
@@ -272,18 +273,24 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
                         return combiner_output.a();
 
                     default:
-                        ERROR_LOG(GPU, "Unknown alpha combiner source %d\n", (int)source);
+                        LOG_ERROR(HW_GPU, "Unknown alpha combiner source %d\n", (int)source);
+                        _dbg_assert_(HW_GPU, 0);
                         return 0;
                     }
                 };
 
-                auto GetColorModifier = [](ColorModifier factor, const Math::Vec3<u8>& values) -> Math::Vec3<u8> {
+                auto GetColorModifier = [](ColorModifier factor, const Math::Vec4<u8>& values) -> Math::Vec3<u8> {
                     switch (factor)
                     {
                     case ColorModifier::SourceColor:
-                        return values;
+                        return values.rgb();
+
+                    case ColorModifier::SourceAlpha:
+                        return { values.a(), values.a(), values.a() };
+
                     default:
-                        ERROR_LOG(GPU, "Unknown color factor %d\n", (int)factor);
+                        LOG_ERROR(HW_GPU, "Unknown color factor %d\n", (int)factor);
+                        _dbg_assert_(HW_GPU, 0);
                         return {};
                     }
                 };
@@ -292,8 +299,13 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
                     switch (factor) {
                     case AlphaModifier::SourceAlpha:
                         return value;
+
+                    case AlphaModifier::OneMinusSourceAlpha:
+                        return 255 - value;
+
                     default:
-                        ERROR_LOG(GPU, "Unknown color factor %d\n", (int)factor);
+                        LOG_ERROR(HW_GPU, "Unknown alpha factor %d\n", (int)factor);
+                        _dbg_assert_(HW_GPU, 0);
                         return 0;
                     }
                 };
@@ -306,8 +318,21 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
                     case Operation::Modulate:
                         return ((input[0] * input[1]) / 255).Cast<u8>();
 
+                    case Operation::Add:
+                    {
+                        auto result = input[0] + input[1];
+                        result.r() = std::min(255, result.r());
+                        result.g() = std::min(255, result.g());
+                        result.b() = std::min(255, result.b());
+                        return result.Cast<u8>();
+                    }
+
+                    case Operation::Lerp:
+                        return ((input[0] * input[2] + input[1] * (Math::MakeVec<u8>(255, 255, 255) - input[2]).Cast<u8>()) / 255).Cast<u8>();
+
                     default:
-                        ERROR_LOG(GPU, "Unknown color combiner operation %d\n", (int)op);
+                        LOG_ERROR(HW_GPU, "Unknown color combiner operation %d\n", (int)op);
+                        _dbg_assert_(HW_GPU, 0);
                         return {};
                     }
                 };
@@ -320,8 +345,15 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0,
                     case Operation::Modulate:
                         return input[0] * input[1] / 255;
 
+                    case Operation::Add:
+                        return std::min(255, input[0] + input[1]);
+
+                    case Operation::Lerp:
+                        return (input[0] * input[2] + input[1] * (255 - input[2])) / 255;
+
                     default:
-                        ERROR_LOG(GPU, "Unknown alpha combiner operation %d\n", (int)op);
+                        LOG_ERROR(HW_GPU, "Unknown alpha combiner operation %d\n", (int)op);
+                        _dbg_assert_(HW_GPU, 0);
                         return 0;
                     }
                 };
diff --git a/src/video_core/rasterizer.h b/src/video_core/rasterizer.h
index 500be9462..42148f8b1 100644
--- a/src/video_core/rasterizer.h
+++ b/src/video_core/rasterizer.h
@@ -1,5 +1,5 @@
 // Copyright 2014 Citra Emulator Project
-// Licensed under GPLv2
+// Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
 #pragma once
diff --git a/src/video_core/renderer_base.h b/src/video_core/renderer_base.h
index bce402b88..b77f29c11 100644
--- a/src/video_core/renderer_base.h
+++ b/src/video_core/renderer_base.h
@@ -1,5 +1,5 @@
 // Copyright 2014 Citra Emulator Project
-// Licensed under GPLv2
+// Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
 #pragma once
diff --git a/src/video_core/renderer_opengl/gl_shader_util.cpp b/src/video_core/renderer_opengl/gl_shader_util.cpp
index a0eb0418c..e982e3746 100644
--- a/src/video_core/renderer_opengl/gl_shader_util.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_util.cpp
@@ -1,5 +1,5 @@
 // Copyright 2014 Citra Emulator Project
-// Licensed under GPLv2
+// Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
 #include "gl_shader_util.h"
@@ -20,9 +20,9 @@ GLuint LoadShaders(const char* vertex_shader, const char* fragment_shader) {
     int info_log_length;
 
     // Compile Vertex Shader
-    DEBUG_LOG(GPU, "Compiling vertex shader.");
+    LOG_DEBUG(Render_OpenGL, "Compiling vertex shader...");
 
-    glShaderSource(vertex_shader_id, 1, &vertex_shader, NULL);
+    glShaderSource(vertex_shader_id, 1, &vertex_shader, nullptr);
     glCompileShader(vertex_shader_id);
 
     // Check Vertex Shader
@@ -31,14 +31,18 @@ GLuint LoadShaders(const char* vertex_shader, const char* fragment_shader) {
 
     if (info_log_length > 1) {
         std::vector<char> vertex_shader_error(info_log_length);
-        glGetShaderInfoLog(vertex_shader_id, info_log_length, NULL, &vertex_shader_error[0]);
-        DEBUG_LOG(GPU, "%s", &vertex_shader_error[0]);
+        glGetShaderInfoLog(vertex_shader_id, info_log_length, nullptr, &vertex_shader_error[0]);
+        if (result) {
+            LOG_DEBUG(Render_OpenGL, "%s", &vertex_shader_error[0]);
+        } else {
+            LOG_ERROR(Render_OpenGL, "Error compiling vertex shader:\n%s", &vertex_shader_error[0]);
+        }
     }
 
     // Compile Fragment Shader
-    DEBUG_LOG(GPU, "Compiling fragment shader.");
+    LOG_DEBUG(Render_OpenGL, "Compiling fragment shader...");
 
-    glShaderSource(fragment_shader_id, 1, &fragment_shader, NULL);
+    glShaderSource(fragment_shader_id, 1, &fragment_shader, nullptr);
     glCompileShader(fragment_shader_id);
 
     // Check Fragment Shader
@@ -47,12 +51,16 @@ GLuint LoadShaders(const char* vertex_shader, const char* fragment_shader) {
 
     if (info_log_length > 1) {
         std::vector<char> fragment_shader_error(info_log_length);
-        glGetShaderInfoLog(fragment_shader_id, info_log_length, NULL, &fragment_shader_error[0]);
-        DEBUG_LOG(GPU, "%s", &fragment_shader_error[0]);
+        glGetShaderInfoLog(fragment_shader_id, info_log_length, nullptr, &fragment_shader_error[0]);
+        if (result) {
+            LOG_DEBUG(Render_OpenGL, "%s", &fragment_shader_error[0]);
+        } else {
+            LOG_ERROR(Render_OpenGL, "Error compiling fragment shader:\n%s", &fragment_shader_error[0]);
+        }
     }
 
     // Link the program
-    DEBUG_LOG(GPU, "Linking program.");
+    LOG_DEBUG(Render_OpenGL, "Linking program...");
 
     GLuint program_id = glCreateProgram();
     glAttachShader(program_id, vertex_shader_id);
@@ -65,8 +73,12 @@ GLuint LoadShaders(const char* vertex_shader, const char* fragment_shader) {
 
     if (info_log_length > 1) {
         std::vector<char> program_error(info_log_length);
-        glGetProgramInfoLog(program_id, info_log_length, NULL, &program_error[0]);
-        DEBUG_LOG(GPU, "%s", &program_error[0]);
+        glGetProgramInfoLog(program_id, info_log_length, nullptr, &program_error[0]);
+        if (result) {
+            LOG_DEBUG(Render_OpenGL, "%s", &program_error[0]);
+        } else {
+            LOG_ERROR(Render_OpenGL, "Error linking shader:\n%s", &program_error[0]);
+        }
     }
 
     glDeleteShader(vertex_shader_id);
diff --git a/src/video_core/renderer_opengl/gl_shader_util.h b/src/video_core/renderer_opengl/gl_shader_util.h
index 986cbabc0..9b93a8a0c 100644
--- a/src/video_core/renderer_opengl/gl_shader_util.h
+++ b/src/video_core/renderer_opengl/gl_shader_util.h
@@ -1,5 +1,5 @@
 // Copyright 2014 Citra Emulator Project
-// Licensed under GPLv2
+// Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
 #pragma once
diff --git a/src/video_core/renderer_opengl/gl_shaders.h b/src/video_core/renderer_opengl/gl_shaders.h
index 0f88ab802..746a37afe 100644
--- a/src/video_core/renderer_opengl/gl_shaders.h
+++ b/src/video_core/renderer_opengl/gl_shaders.h
@@ -1,5 +1,5 @@
 // Copyright 2014 Citra Emulator Project
-// Licensed under GPLv2
+// Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
 #pragma once
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index abbb4c2cb..4df3a5e25 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -1,5 +1,5 @@
 // Copyright 2014 Citra Emulator Project
-// Licensed under GPLv2
+// Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
 #include "core/hw/gpu.h"
@@ -61,7 +61,7 @@ void RendererOpenGL::SwapBuffers() {
     for(int i : {0, 1}) {
         const auto& framebuffer = GPU::g_regs.framebuffer_config[i];
 
-        if (textures[i].width != framebuffer.width || textures[i].height != framebuffer.height) {
+        if (textures[i].width != (GLsizei)framebuffer.width || textures[i].height != (GLsizei)framebuffer.height) {
             // Reallocate texture if the framebuffer size has changed.
             // This is expected to not happen very often and hence should not be a
             // performance problem.
@@ -90,7 +90,7 @@ void RendererOpenGL::LoadFBToActiveGLTexture(const GPU::Regs::FramebufferConfig&
     const VAddr framebuffer_vaddr = Memory::PhysicalToVirtualAddress(
         framebuffer.active_fb == 1 ? framebuffer.address_left2 : framebuffer.address_left1);
 
-    DEBUG_LOG(GPU, "0x%08x bytes from 0x%08x(%dx%d), fmt %x",
+    LOG_TRACE(Render_OpenGL, "0x%08x bytes from 0x%08x(%dx%d), fmt %x",
         framebuffer.stride * framebuffer.height,
         framebuffer_vaddr, (int)framebuffer.width,
         (int)framebuffer.height, (int)framebuffer.format);
@@ -98,15 +98,15 @@ void RendererOpenGL::LoadFBToActiveGLTexture(const GPU::Regs::FramebufferConfig&
     const u8* framebuffer_data = Memory::GetPointer(framebuffer_vaddr);
 
     // TODO: Handle other pixel formats
-    _dbg_assert_msg_(RENDER, framebuffer.color_format == GPU::Regs::PixelFormat::RGB8,
+    _dbg_assert_msg_(Render_OpenGL, framebuffer.color_format == GPU::Regs::PixelFormat::RGB8,
                      "Unsupported 3DS pixel format.");
 
     size_t pixel_stride = framebuffer.stride / 3;
     // OpenGL only supports specifying a stride in units of pixels, not bytes, unfortunately
-    _dbg_assert_(RENDER, pixel_stride * 3 == framebuffer.stride);
+    _dbg_assert_(Render_OpenGL, pixel_stride * 3 == framebuffer.stride);
     // Ensure no bad interactions with GL_UNPACK_ALIGNMENT, which by default
     // only allows rows to have a memory alignement of 4.
-    _dbg_assert_(RENDER, pixel_stride % 4 == 0);
+    _dbg_assert_(Render_OpenGL, pixel_stride % 4 == 0);
 
     glBindTexture(GL_TEXTURE_2D, texture.handle);
     glPixelStorei(GL_UNPACK_ROW_LENGTH, (GLint)pixel_stride);
@@ -240,14 +240,14 @@ MathUtil::Rectangle<unsigned> RendererOpenGL::GetViewportExtent() {
     MathUtil::Rectangle<unsigned> viewport_extent;
     if (window_aspect_ratio > emulation_aspect_ratio) {
         // Window is narrower than the emulation content => apply borders to the top and bottom
-        unsigned viewport_height = emulation_aspect_ratio * framebuffer_width;
+        unsigned viewport_height = static_cast<unsigned>(std::round(emulation_aspect_ratio * framebuffer_width));
         viewport_extent.left = 0;
         viewport_extent.top = (framebuffer_height - viewport_height) / 2;
         viewport_extent.right = viewport_extent.left + framebuffer_width;
         viewport_extent.bottom = viewport_extent.top + viewport_height;
     } else {
         // Otherwise, apply borders to the left and right sides of the window.
-        unsigned viewport_width = framebuffer_height / emulation_aspect_ratio;
+        unsigned viewport_width = static_cast<unsigned>(std::round(framebuffer_height / emulation_aspect_ratio));
         viewport_extent.left = (framebuffer_width - viewport_width) / 2;
         viewport_extent.top = 0;
         viewport_extent.right = viewport_extent.left + viewport_width;
@@ -263,11 +263,11 @@ void RendererOpenGL::Init() {
 
     int err = ogl_LoadFunctions();
     if (ogl_LOAD_SUCCEEDED != err) {
-        ERROR_LOG(RENDER, "Failed to initialize GL functions! Exiting...");
+        LOG_CRITICAL(Render_OpenGL, "Failed to initialize GL functions! Exiting...");
         exit(-1);
     }
 
-    NOTICE_LOG(RENDER, "GL_VERSION: %s\n", glGetString(GL_VERSION));
+    LOG_INFO(Render_OpenGL, "GL_VERSION: %s", glGetString(GL_VERSION));
     InitOpenGLObjects();
 }
 
diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h
index 7fdcec731..cf78c1e77 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.h
+++ b/src/video_core/renderer_opengl/renderer_opengl.h
@@ -1,5 +1,5 @@
 // Copyright 2014 Citra Emulator Project
-// Licensed under GPLv2
+// Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
 #pragma once
diff --git a/src/video_core/utils.cpp b/src/video_core/utils.cpp
index f1156a493..c7cc93cea 100644
--- a/src/video_core/utils.cpp
+++ b/src/video_core/utils.cpp
@@ -1,5 +1,5 @@
 // Copyright 2014 Citra Emulator Project
-// Licensed under GPLv2
+// Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
 #include <stdio.h>
diff --git a/src/video_core/utils.h b/src/video_core/utils.h
index 21380a908..63ebccbde 100644
--- a/src/video_core/utils.h
+++ b/src/video_core/utils.h
@@ -1,5 +1,5 @@
 // Copyright 2014 Citra Emulator Project
-// Licensed under GPLv2
+// Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
 #pragma once
diff --git a/src/video_core/vertex_shader.cpp b/src/video_core/vertex_shader.cpp
index 96625791c..bed5081a0 100644
--- a/src/video_core/vertex_shader.cpp
+++ b/src/video_core/vertex_shader.cpp
@@ -1,12 +1,26 @@
 // Copyright 2014 Citra Emulator Project
-// Licensed under GPLv2
+// Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include <stack>
+
+#include <boost/range/algorithm.hpp>
+
+#include <common/file_util.h>
+
+#include <core/mem_map.h>
+
+#include <nihstro/shader_bytecode.h>
+
+
 #include "pica.h"
 #include "vertex_shader.h"
 #include "debug_utils/debug_utils.h"
-#include <core/mem_map.h>
-#include <common/file_util.h>
+
+using nihstro::Instruction;
+using nihstro::RegisterType;
+using nihstro::SourceRegister;
+using nihstro::SwizzlePattern;
 
 namespace Pica {
 
@@ -14,13 +28,14 @@ namespace VertexShader {
 
 static struct {
     Math::Vec4<float24> f[96];
-} shader_uniforms;
 
+    std::array<bool,16> b;
+} shader_uniforms;
 
 // TODO: Not sure where the shader binary and swizzle patterns are supposed to be loaded to!
 // For now, we just keep these local arrays around.
-static u32 shader_memory[1024];
-static u32 swizzle_data[1024];
+static std::array<u32, 1024> shader_memory;
+static std::array<u32, 1024> swizzle_data;
 
 void SubmitShaderMemoryChange(u32 addr, u32 value)
 {
@@ -37,6 +52,21 @@ Math::Vec4<float24>& GetFloatUniform(u32 index)
     return shader_uniforms.f[index];
 }
 
+bool& GetBoolUniform(u32 index) // Mutable reference to boolean uniform `index`; index is not range-checked (storage holds 16 entries).
+{
+    return shader_uniforms.b[index];
+}
+
+const std::array<u32, 1024>& GetShaderBinary() // Read-only view of the file-local shader_memory array.
+{
+    return shader_memory;
+}
+
+const std::array<u32, 1024>& GetSwizzlePatterns() // Read-only view of the file-local swizzle_data array.
+{
+    return swizzle_data;
+}
+
 struct VertexShaderState {
     u32* program_counter;
 
@@ -44,13 +74,23 @@ struct VertexShaderState {
     float24* output_register_table[7*4];
 
     Math::Vec4<float24> temporary_registers[16];
-    bool status_registers[2];
+    bool conditional_code[2];
+
+    // Two Address registers and one loop counter
+    // TODO: How many bits do these actually have?
+    s32 address_registers[3];
 
     enum {
         INVALID_ADDRESS = 0xFFFFFFFF
     };
-    u32 call_stack[8]; // TODO: What is the maximal call stack depth?
-    u32* call_stack_pointer;
+
+    struct CallStackElement {
+        u32 final_address;
+        u32 return_address;
+    };
+
+    // TODO: Is there a maximal size for this?
+    std::stack<CallStackElement> call_stack;
 
     struct {
         u32 max_offset; // maximum program counter ever reached
@@ -59,49 +99,105 @@ struct VertexShaderState {
 };
 
 static void ProcessShaderCode(VertexShaderState& state) {
+
+    // Placeholder for invalid inputs
+    static float24 dummy_vec4_float24[4];
+
     while (true) {
-        bool increment_pc = true;
+        if (!state.call_stack.empty()) {
+            if (state.program_counter - shader_memory.data() == state.call_stack.top().final_address) {
+                state.program_counter = &shader_memory[state.call_stack.top().return_address];
+                state.call_stack.pop();
+
+                // TODO: Is "trying again" accurate to hardware?
+                continue;
+            }
+        }
+
         bool exit_loop = false;
         const Instruction& instr = *(const Instruction*)state.program_counter;
-        state.debug.max_offset = std::max<u32>(state.debug.max_offset, 1 + (state.program_counter - shader_memory));
-
-        const float24* src1_ = (instr.common.src1 < 0x10) ? state.input_register_table[instr.common.src1.GetIndex()]
-                             : (instr.common.src1 < 0x20) ? &state.temporary_registers[instr.common.src1.GetIndex()].x
-                             : (instr.common.src1 < 0x80) ? &shader_uniforms.f[instr.common.src1.GetIndex()].x
-                             : nullptr;
-        const float24* src2_ = (instr.common.src2 < 0x10) ? state.input_register_table[instr.common.src2.GetIndex()]
-                             : &state.temporary_registers[instr.common.src2.GetIndex()].x;
-        float24* dest = (instr.common.dest < 0x08) ? state.output_register_table[4*instr.common.dest.GetIndex()]
-                      : (instr.common.dest < 0x10) ? nullptr
-                      : (instr.common.dest < 0x20) ? &state.temporary_registers[instr.common.dest.GetIndex()][0]
-                      : nullptr;
-
         const SwizzlePattern& swizzle = *(SwizzlePattern*)&swizzle_data[instr.common.operand_desc_id];
-        const bool negate_src1 = (swizzle.negate != 0);
 
-        float24 src1[4] = {
-            src1_[(int)swizzle.GetSelectorSrc1(0)],
-            src1_[(int)swizzle.GetSelectorSrc1(1)],
-            src1_[(int)swizzle.GetSelectorSrc1(2)],
-            src1_[(int)swizzle.GetSelectorSrc1(3)],
+        auto call = [&](VertexShaderState& state, u32 offset, u32 num_instructions, u32 return_offset) {
+            state.program_counter = &shader_memory[offset] - 1; // -1 to make sure when incrementing the PC we end up at the correct offset
+            state.call_stack.push({ offset + num_instructions, return_offset });
         };
-        if (negate_src1) {
-            src1[0] = src1[0] * float24::FromFloat32(-1);
-            src1[1] = src1[1] * float24::FromFloat32(-1);
-            src1[2] = src1[2] * float24::FromFloat32(-1);
-            src1[3] = src1[3] * float24::FromFloat32(-1);
-        }
-        const float24 src2[4] = {
-            src2_[(int)swizzle.GetSelectorSrc2(0)],
-            src2_[(int)swizzle.GetSelectorSrc2(1)],
-            src2_[(int)swizzle.GetSelectorSrc2(2)],
-            src2_[(int)swizzle.GetSelectorSrc2(3)],
+        u32 binary_offset = state.program_counter - shader_memory.data();
+
+        state.debug.max_offset = std::max<u32>(state.debug.max_offset, 1 + binary_offset);
+
+        auto LookupSourceRegister = [&](const SourceRegister& source_reg) -> const float24* {
+            switch (source_reg.GetRegisterType()) {
+            case RegisterType::Input:
+                return state.input_register_table[source_reg.GetIndex()];
+
+            case RegisterType::Temporary:
+                return &state.temporary_registers[source_reg.GetIndex()].x;
+
+            case RegisterType::FloatUniform:
+                return &shader_uniforms.f[source_reg.GetIndex()].x;
+
+            default:
+                return dummy_vec4_float24;
+            }
         };
 
-        switch (instr.opcode) {
+        switch (instr.opcode.GetInfo().type) {
+        case Instruction::OpCodeType::Arithmetic:
+        {
+            bool is_inverted = 0 != (instr.opcode.GetInfo().subtype & Instruction::OpCodeInfo::SrcInversed);
+            if (is_inverted) {
+                // TODO: We don't really support this properly: For instance, the address register
+                //       offset needs to be applied to SRC2 instead, etc.
+                //       For now, we just abort in this situation.
+                LOG_CRITICAL(HW_GPU, "Bad condition...");
+                exit(-1); // non-zero status: this is a fatal error path, not a clean shutdown
+            }
+
+            const int address_offset = (instr.common.address_register_index == 0)
+                                       ? 0 : state.address_registers[instr.common.address_register_index - 1];
+
+            const float24* src1_ = LookupSourceRegister(instr.common.GetSrc1(is_inverted) + address_offset);
+            const float24* src2_ = LookupSourceRegister(instr.common.GetSrc2(is_inverted));
+
+            const bool negate_src1 = ((bool)swizzle.negate_src1 != false);
+            const bool negate_src2 = ((bool)swizzle.negate_src2 != false);
+
+            float24 src1[4] = {
+                src1_[(int)swizzle.GetSelectorSrc1(0)],
+                src1_[(int)swizzle.GetSelectorSrc1(1)],
+                src1_[(int)swizzle.GetSelectorSrc1(2)],
+                src1_[(int)swizzle.GetSelectorSrc1(3)],
+            };
+            if (negate_src1) {
+                src1[0] = src1[0] * float24::FromFloat32(-1);
+                src1[1] = src1[1] * float24::FromFloat32(-1);
+                src1[2] = src1[2] * float24::FromFloat32(-1);
+                src1[3] = src1[3] * float24::FromFloat32(-1);
+            }
+            float24 src2[4] = {
+                src2_[(int)swizzle.GetSelectorSrc2(0)],
+                src2_[(int)swizzle.GetSelectorSrc2(1)],
+                src2_[(int)swizzle.GetSelectorSrc2(2)],
+                src2_[(int)swizzle.GetSelectorSrc2(3)],
+            };
+            if (negate_src2) {
+                src2[0] = src2[0] * float24::FromFloat32(-1);
+                src2[1] = src2[1] * float24::FromFloat32(-1);
+                src2[2] = src2[2] * float24::FromFloat32(-1);
+                src2[3] = src2[3] * float24::FromFloat32(-1);
+            }
+
+            float24* dest = (instr.common.dest < 0x08) ? state.output_register_table[4*instr.common.dest.GetIndex()]
+                        : (instr.common.dest < 0x10) ? dummy_vec4_float24
+                        : (instr.common.dest < 0x20) ? &state.temporary_registers[instr.common.dest.GetIndex()][0]
+                        : dummy_vec4_float24;
+
+            state.debug.max_opdesc_id = std::max<u32>(state.debug.max_opdesc_id, 1+instr.common.operand_desc_id);
+
+            switch (instr.opcode.EffectiveOpCode()) {
             case Instruction::OpCode::ADD:
             {
-                state.debug.max_opdesc_id = std::max<u32>(state.debug.max_opdesc_id, 1+instr.common.operand_desc_id);
                 for (int i = 0; i < 4; ++i) {
                     if (!swizzle.DestComponentEnabled(i))
                         continue;
@@ -114,7 +210,6 @@ static void ProcessShaderCode(VertexShaderState& state) {
 
             case Instruction::OpCode::MUL:
             {
-                state.debug.max_opdesc_id = std::max<u32>(state.debug.max_opdesc_id, 1+instr.common.operand_desc_id);
                 for (int i = 0; i < 4; ++i) {
                     if (!swizzle.DestComponentEnabled(i))
                         continue;
@@ -125,10 +220,18 @@ static void ProcessShaderCode(VertexShaderState& state) {
                 break;
             }
 
+            case Instruction::OpCode::MAX:
+                for (int i = 0; i < 4; ++i) {
+                    if (!swizzle.DestComponentEnabled(i))
+                        continue;
+
+                    dest[i] = std::max(src1[i], src2[i]);
+                }
+                break;
+
             case Instruction::OpCode::DP3:
             case Instruction::OpCode::DP4:
             {
-                state.debug.max_opdesc_id = std::max<u32>(state.debug.max_opdesc_id, 1+instr.common.operand_desc_id);
                 float24 dot = float24::FromFloat32(0.f);
                 int num_components = (instr.opcode == Instruction::OpCode::DP3) ? 3 : 4;
                 for (int i = 0; i < num_components; ++i)
@@ -146,7 +249,6 @@ static void ProcessShaderCode(VertexShaderState& state) {
             // Reciprocal
             case Instruction::OpCode::RCP:
             {
-                state.debug.max_opdesc_id = std::max<u32>(state.debug.max_opdesc_id, 1+instr.common.operand_desc_id);
                 for (int i = 0; i < 4; ++i) {
                     if (!swizzle.DestComponentEnabled(i))
                         continue;
@@ -162,7 +264,6 @@ static void ProcessShaderCode(VertexShaderState& state) {
             // Reciprocal Square Root
             case Instruction::OpCode::RSQ:
             {
-                state.debug.max_opdesc_id = std::max<u32>(state.debug.max_opdesc_id, 1+instr.common.operand_desc_id);
                 for (int i = 0; i < 4; ++i) {
                     if (!swizzle.DestComponentEnabled(i))
                         continue;
@@ -175,9 +276,21 @@ static void ProcessShaderCode(VertexShaderState& state) {
                 break;
             }
 
+            case Instruction::OpCode::MOVA:
+            {
+                for (int i = 0; i < 2; ++i) {
+                    if (!swizzle.DestComponentEnabled(i))
+                        continue;
+
+                    // TODO: Figure out how the rounding is done on hardware
+                    state.address_registers[i] = static_cast<s32>(src1[i].ToFloat32());
+                }
+
+                break;
+            }
+
             case Instruction::OpCode::MOV:
             {
-                state.debug.max_opdesc_id = std::max<u32>(state.debug.max_opdesc_id, 1+instr.common.operand_desc_id);
                 for (int i = 0; i < 4; ++i) {
                     if (!swizzle.DestComponentEnabled(i))
                         continue;
@@ -187,39 +300,137 @@ static void ProcessShaderCode(VertexShaderState& state) {
                 break;
             }
 
-            case Instruction::OpCode::RET:
-                if (*state.call_stack_pointer == VertexShaderState::INVALID_ADDRESS) {
-                    exit_loop = true;
-                } else {
-                    // Jump back to call stack position, invalidate call stack entry, move up call stack pointer
-                    state.program_counter = &shader_memory[*state.call_stack_pointer];
-                    *state.call_stack_pointer-- = VertexShaderState::INVALID_ADDRESS;
+            case Instruction::OpCode::CMP:
+                for (int i = 0; i < 2; ++i) {
+                    // TODO: Can the comparison be restricted to a single component via dest masking?
+
+                    auto compare_op = instr.common.compare_op;
+                    auto op = (i == 0) ? compare_op.x.Value() : compare_op.y.Value();
+
+                    switch (op) {
+                        case compare_op.Equal:
+                            state.conditional_code[i] = (src1[i] == src2[i]);
+                            break;
+
+                        case compare_op.NotEqual:
+                            state.conditional_code[i] = (src1[i] != src2[i]);
+                            break;
+
+                        case compare_op.LessThan:
+                            state.conditional_code[i] = (src1[i] <  src2[i]);
+                            break;
+
+                        case compare_op.LessEqual:
+                            state.conditional_code[i] = (src1[i] <= src2[i]);
+                            break;
+
+                        case compare_op.GreaterThan:
+                            state.conditional_code[i] = (src1[i] >  src2[i]);
+                            break;
+
+                        case compare_op.GreaterEqual:
+                            state.conditional_code[i] = (src1[i] >= src2[i]);
+                            break;
+
+                        default:
+                            LOG_ERROR(HW_GPU, "Unknown compare mode %x", static_cast<int>(op));
+                            break;
+                    }
                 }
+                break;
+
+            default:
+                LOG_ERROR(HW_GPU, "Unhandled arithmetic instruction: 0x%02x (%s): 0x%08x",
+                          (int)instr.opcode.Value(), instr.opcode.GetInfo().name, instr.hex);
+                _dbg_assert_(HW_GPU, 0);
+                break;
+            }
 
+            break;
+        }
+        default:
+            // Handle each instruction on its own
+            switch (instr.opcode) {
+            case Instruction::OpCode::END:
+                exit_loop = true;
                 break;
 
             case Instruction::OpCode::CALL:
-                increment_pc = false;
+                call(state,
+                     instr.flow_control.dest_offset,
+                     instr.flow_control.num_instructions,
+                     binary_offset + 1);
+                break;
 
-                _dbg_assert_(GPU, state.call_stack_pointer - state.call_stack < sizeof(state.call_stack));
+            case Instruction::OpCode::NOP:
+                break;
+
+            case Instruction::OpCode::IFU:
+                if (shader_uniforms.b[instr.flow_control.bool_uniform_id]) {
+                    call(state,
+                         binary_offset + 1,
+                         instr.flow_control.dest_offset - binary_offset - 1,
+                         instr.flow_control.dest_offset + instr.flow_control.num_instructions);
+                } else {
+                    call(state,
+                         instr.flow_control.dest_offset,
+                         instr.flow_control.num_instructions,
+                         instr.flow_control.dest_offset + instr.flow_control.num_instructions);
+                }
 
-                *++state.call_stack_pointer = state.program_counter - shader_memory;
-                // TODO: Does this offset refer to the beginning of shader memory?
-                state.program_counter = &shader_memory[instr.flow_control.offset_words];
                 break;
 
-            case Instruction::OpCode::FLS:
-                // TODO: Do whatever needs to be done here?
+            case Instruction::OpCode::IFC:
+            {
+                // TODO: Do we need to consider swizzlers here?
+
+                auto flow_control = instr.flow_control;
+                bool results[3] = { (bool)flow_control.refx == state.conditional_code[0],
+                                    (bool)flow_control.refy == state.conditional_code[1] };
+
+                switch (flow_control.op) {
+                case flow_control.Or:
+                    results[2] = results[0] || results[1];
+                    break;
+
+                case flow_control.And:
+                    results[2] = results[0] && results[1];
+                    break;
+
+                case flow_control.JustX:
+                    results[2] = results[0];
+                    break;
+
+                case flow_control.JustY:
+                    results[2] = results[1];
+                    break;
+                }
+
+                if (results[2]) {
+                    call(state,
+                         binary_offset + 1,
+                         instr.flow_control.dest_offset - binary_offset - 1,
+                         instr.flow_control.dest_offset + instr.flow_control.num_instructions);
+                } else {
+                    call(state,
+                         instr.flow_control.dest_offset,
+                         instr.flow_control.num_instructions,
+                         instr.flow_control.dest_offset + instr.flow_control.num_instructions);
+                }
+
                 break;
+            }
 
             default:
-                ERROR_LOG(GPU, "Unhandled instruction: 0x%02x (%s): 0x%08x",
-                          (int)instr.opcode.Value(), instr.GetOpCodeName().c_str(), instr.hex);
+                LOG_ERROR(HW_GPU, "Unhandled instruction: 0x%02x (%s): 0x%08x",
+                          (int)instr.opcode.Value(), instr.opcode.GetInfo().name, instr.hex);
                 break;
+            }
+
+            break;
         }
 
-        if (increment_pc)
-            ++state.program_counter;
+        ++state.program_counter;
 
         if (exit_loop)
             break;
@@ -238,7 +449,7 @@ OutputVertex RunShader(const InputVertex& input, int num_attributes)
     // Setup input register table
     const auto& attribute_register_map = registers.vs_input_register_map;
     float24 dummy_register;
-    std::fill(&state.input_register_table[0], &state.input_register_table[16], &dummy_register);
+    boost::fill(state.input_register_table, &dummy_register);
     if(num_attributes > 0) state.input_register_table[attribute_register_map.attribute0_register] = &input.attr[0].x;
     if(num_attributes > 1) state.input_register_table[attribute_register_map.attribute1_register] = &input.attr[1].x;
     if(num_attributes > 2) state.input_register_table[attribute_register_map.attribute2_register] = &input.attr[2].x;
@@ -258,6 +469,10 @@ OutputVertex RunShader(const InputVertex& input, int num_attributes)
 
     // Setup output register table
     OutputVertex ret;
+    // Zero the output vertex so that attributes the shader never writes don't contain denormals,
+    // which would slow down later processing.
+    memset(&ret, 0, sizeof(ret));
+
     for (int i = 0; i < 7; ++i) {
         const auto& output_register_map = registers.vs_output_attributes[i];
 
@@ -270,18 +485,15 @@ OutputVertex RunShader(const InputVertex& input, int num_attributes)
             state.output_register_table[4*i+comp] = ((float24*)&ret) + semantics[comp];
     }
 
-    state.status_registers[0] = false;
-    state.status_registers[1] = false;
-    std::fill(state.call_stack, state.call_stack + sizeof(state.call_stack) / sizeof(state.call_stack[0]),
-              VertexShaderState::INVALID_ADDRESS);
-    state.call_stack_pointer = &state.call_stack[0];
+    state.conditional_code[0] = false;
+    state.conditional_code[1] = false;
 
     ProcessShaderCode(state);
-    DebugUtils::DumpShader(shader_memory, state.debug.max_offset, swizzle_data,
+    DebugUtils::DumpShader(shader_memory.data(), state.debug.max_offset, swizzle_data.data(),
                            state.debug.max_opdesc_id, registers.vs_main_offset,
                            registers.vs_output_attributes);
 
-    DEBUG_LOG(GPU, "Output vertex: pos (%.2f, %.2f, %.2f, %.2f), col(%.2f, %.2f, %.2f, %.2f), tc0(%.2f, %.2f)",
+    LOG_TRACE(Render_Software, "Output vertex: pos (%.2f, %.2f, %.2f, %.2f), col(%.2f, %.2f, %.2f, %.2f), tc0(%.2f, %.2f)",
         ret.pos.x.ToFloat32(), ret.pos.y.ToFloat32(), ret.pos.z.ToFloat32(), ret.pos.w.ToFloat32(),
         ret.color.x.ToFloat32(), ret.color.y.ToFloat32(), ret.color.z.ToFloat32(), ret.color.w.ToFloat32(),
         ret.tc0.u().ToFloat32(), ret.tc0.v().ToFloat32());
diff --git a/src/video_core/vertex_shader.h b/src/video_core/vertex_shader.h
index bfb6fb6e3..af3fb2a2f 100644
--- a/src/video_core/vertex_shader.h
+++ b/src/video_core/vertex_shader.h
@@ -1,5 +1,5 @@
 // Copyright 2014 Citra Emulator Project
-// Licensed under GPLv2
+// Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
 #pragma once
@@ -27,15 +27,18 @@ struct OutputVertex {
     Math::Vec4<float24> dummy; // quaternions (not implemented, yet)
     Math::Vec4<float24> color;
     Math::Vec2<float24> tc0;
+    Math::Vec2<float24> tc1;
+    float24 pad[6];
+    Math::Vec2<float24> tc2;
 
     // Padding for optimal alignment
-    float24 pad[14];
+    float24 pad2[4];
 
     // Attributes used to store intermediate results
 
     // position after perspective divide
     Math::Vec3<float24> screenpos;
-    float24 pad2;
+    float24 pad3;
 
     // Linear interpolation
     // factor: 0=this, 1=vtx
@@ -44,6 +47,8 @@ struct OutputVertex {
 
         // TODO: Should perform perspective correct interpolation here...
         tc0 = tc0 * factor + vtx.tc0 * (float24::FromFloat32(1) - factor);
+        tc1 = tc1 * factor + vtx.tc1 * (float24::FromFloat32(1) - factor);
+        tc2 = tc2 * factor + vtx.tc2 * (float24::FromFloat32(1) - factor);
 
         screenpos = screenpos * factor + vtx.screenpos * (float24::FromFloat32(1) - factor);
 
@@ -61,222 +66,16 @@ struct OutputVertex {
 static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD");
 static_assert(sizeof(OutputVertex) == 32 * sizeof(float), "OutputVertex has invalid size");
 
-union Instruction {
-    enum class OpCode : u32 {
-        ADD = 0x0,
-        DP3 = 0x1,
-        DP4 = 0x2,
-
-        MUL = 0x8,
-
-        MAX = 0xC,
-        MIN = 0xD,
-        RCP = 0xE,
-        RSQ = 0xF,
-
-        MOV = 0x13,
-
-        RET = 0x21,
-        FLS = 0x22, // Flush
-        CALL = 0x24,
-    };
-
-    std::string GetOpCodeName() const {
-        std::map<OpCode, std::string> map = {
-            { OpCode::ADD, "ADD" },
-            { OpCode::DP3, "DP3" },
-            { OpCode::DP4, "DP4" },
-            { OpCode::MUL, "MUL" },
-            { OpCode::MAX, "MAX" },
-            { OpCode::MIN, "MIN" },
-            { OpCode::RCP, "RCP" },
-            { OpCode::RSQ, "RSQ" },
-            { OpCode::MOV, "MOV" },
-            { OpCode::RET, "RET" },
-            { OpCode::FLS, "FLS" },
-        };
-        auto it = map.find(opcode);
-        if (it == map.end())
-            return "UNK";
-        else
-            return it->second;
-    }
-
-    u32 hex;
-
-    BitField<0x1a, 0x6, OpCode> opcode;
-
-    // General notes:
-    //
-    // When two input registers are used, one of them uses a 5-bit index while the other
-    // one uses a 7-bit index. This is because at most one floating point uniform may be used
-    // as an input.
-
-
-    // Format used e.g. by arithmetic instructions and comparisons
-    // "src1" and "src2" specify register indices (i.e. indices referring to groups of 4 floats),
-    // while "dest" addresses individual floats.
-    union {
-        BitField<0x00, 0x5, u32> operand_desc_id;
-
-        template<class BitFieldType>
-        struct SourceRegister : BitFieldType {
-            enum RegisterType {
-                Input,
-                Temporary,
-                FloatUniform
-            };
-
-            RegisterType GetRegisterType() const {
-                if (BitFieldType::Value() < 0x10)
-                    return Input;
-                else if (BitFieldType::Value() < 0x20)
-                    return Temporary;
-                else
-                    return FloatUniform;
-            }
-
-            int GetIndex() const {
-                if (GetRegisterType() == Input)
-                    return BitFieldType::Value();
-                else if (GetRegisterType() == Temporary)
-                    return BitFieldType::Value() - 0x10;
-                else // if (GetRegisterType() == FloatUniform)
-                    return BitFieldType::Value() - 0x20;
-            }
-
-            std::string GetRegisterName() const {
-                std::map<RegisterType, std::string> type = {
-                    { Input, "i" },
-                    { Temporary, "t" },
-                    { FloatUniform, "f" },
-                };
-                return type[GetRegisterType()] + std::to_string(GetIndex());
-            }
-        };
-
-        SourceRegister<BitField<0x07, 0x5, u32>> src2;
-        SourceRegister<BitField<0x0c, 0x7, u32>> src1;
-
-        struct : BitField<0x15, 0x5, u32>
-        {
-            enum RegisterType {
-                Output,
-                Temporary,
-                Unknown
-            };
-            RegisterType GetRegisterType() const {
-                if (Value() < 0x8)
-                    return Output;
-                else if (Value() < 0x10)
-                    return Unknown;
-                else
-                    return Temporary;
-            }
-            int GetIndex() const {
-                if (GetRegisterType() == Output)
-                    return Value();
-                else if (GetRegisterType() == Temporary)
-                    return Value() - 0x10;
-                else
-                    return Value();
-            }
-            std::string GetRegisterName() const {
-                std::map<RegisterType, std::string> type = {
-                    { Output, "o" },
-                    { Temporary, "t" },
-                    { Unknown, "u" }
-                };
-                return type[GetRegisterType()] + std::to_string(GetIndex());
-            }
-        } dest;
-    } common;
-
-    // Format used for flow control instructions ("if")
-    union {
-        BitField<0x00, 0x8, u32> num_instructions;
-        BitField<0x0a, 0xc, u32> offset_words;
-    } flow_control;
-};
-static_assert(std::is_standard_layout<Instruction>::value, "Structure is not using standard layout!");
-
-union SwizzlePattern {
-    u32 hex;
-
-    enum class Selector : u32 {
-        x = 0,
-        y = 1,
-        z = 2,
-        w = 3
-    };
-
-    Selector GetSelectorSrc1(int comp) const {
-        Selector selectors[] = {
-            src1_selector_0, src1_selector_1, src1_selector_2, src1_selector_3
-        };
-        return selectors[comp];
-    }
-
-    Selector GetSelectorSrc2(int comp) const {
-        Selector selectors[] = {
-            src2_selector_0, src2_selector_1, src2_selector_2, src2_selector_3
-        };
-        return selectors[comp];
-    }
-
-    bool DestComponentEnabled(int i) const {
-        return (dest_mask & (0x8 >> i)) != 0;
-    }
-
-    std::string SelectorToString(bool src2) const {
-        std::map<Selector, std::string> map = {
-            { Selector::x, "x" },
-            { Selector::y, "y" },
-            { Selector::z, "z" },
-            { Selector::w, "w" }
-        };
-        std::string ret;
-        for (int i = 0; i < 4; ++i) {
-            ret += map.at(src2 ? GetSelectorSrc2(i) : GetSelectorSrc1(i));
-        }
-        return ret;
-    }
-
-    std::string DestMaskToString() const {
-        std::string ret;
-        for (int i = 0; i < 4; ++i) {
-            if (!DestComponentEnabled(i))
-                ret += "_";
-            else
-                ret += "xyzw"[i];
-        }
-        return ret;
-    }
-
-    // Components of "dest" that should be written to: LSB=dest.w, MSB=dest.x
-    BitField< 0, 4, u32> dest_mask;
-
-    BitField< 4, 1, u32> negate; // negates src1
-
-    BitField< 5, 2, Selector> src1_selector_3;
-    BitField< 7, 2, Selector> src1_selector_2;
-    BitField< 9, 2, Selector> src1_selector_1;
-    BitField<11, 2, Selector> src1_selector_0;
-
-    BitField<14, 2, Selector> src2_selector_3;
-    BitField<16, 2, Selector> src2_selector_2;
-    BitField<18, 2, Selector> src2_selector_1;
-    BitField<20, 2, Selector> src2_selector_0;
-
-    BitField<31, 1, u32> flag; // not sure what this means, maybe it's the sign?
-};
-
 void SubmitShaderMemoryChange(u32 addr, u32 value);
 void SubmitSwizzleDataChange(u32 addr, u32 value);
 
 OutputVertex RunShader(const InputVertex& input, int num_attributes);
 
 Math::Vec4<float24>& GetFloatUniform(u32 index);
+bool& GetBoolUniform(u32 index);
+
+const std::array<u32, 1024>& GetShaderBinary();
+const std::array<u32, 1024>& GetSwizzlePatterns();
 
 } // namespace
 
diff --git a/src/video_core/video_core.cpp b/src/video_core/video_core.cpp
index c779771c5..c9707e5f1 100644
--- a/src/video_core/video_core.cpp
+++ b/src/video_core/video_core.cpp
@@ -1,5 +1,5 @@
 // Copyright 2014 Citra Emulator Project
-// Licensed under GPLv2
+// Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
 #include "common/common.h"
@@ -17,8 +17,8 @@
 
 namespace VideoCore {
 
-EmuWindow*      g_emu_window    = NULL;     ///< Frontend emulator window
-RendererBase*   g_renderer      = NULL;     ///< Renderer plugin
+EmuWindow*      g_emu_window    = nullptr;     ///< Frontend emulator window
+RendererBase*   g_renderer      = nullptr;     ///< Renderer plugin
 int             g_current_frame = 0;
 
 /// Initialize the video core
@@ -30,13 +30,13 @@ void Init(EmuWindow* emu_window) {
 
     g_current_frame = 0;
 
-    NOTICE_LOG(VIDEO, "initialized OK");
+    LOG_DEBUG(Render, "initialized OK");
 }
 
 /// Shutdown the video core
 void Shutdown() {
     delete g_renderer;
-    NOTICE_LOG(VIDEO, "shutdown OK");
+    LOG_DEBUG(Render, "shutdown OK");
 }
 
 } // namespace
diff --git a/src/video_core/video_core.h b/src/video_core/video_core.h
index 609aac513..b782f17bd 100644
--- a/src/video_core/video_core.h
+++ b/src/video_core/video_core.h
@@ -1,5 +1,5 @@
 // Copyright 2014 Citra Emulator Project
-// Licensed under GPLv2
+// Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
 #pragma once