27 files changed, 2695 insertions, 627 deletions
diff --git a/src/citra_qt/debugger/graphics_cmdlists.cpp b/src/citra_qt/debugger/graphics_cmdlists.cpp
index 195197ef5..e98560a19 100644
--- a/src/citra_qt/debugger/graphics_cmdlists.cpp
+++ b/src/citra_qt/debugger/graphics_cmdlists.cpp
@@ -78,12 +78,12 @@ QVariant GPUCommandListModel::data(const QModelIndex& index, int role) const
         // index refers to a specific command
         const GraphicsDebugger::PicaCommandList& cmdlist = command_lists[item->parent->index].second;
         const GraphicsDebugger::PicaCommand& cmd = cmdlist[item->index];
-        const Pica::CommandHeader& header = cmd.GetHeader();
+        const Pica::CommandProcessor::CommandHeader& header = cmd.GetHeader();
 
         if (role == Qt::DisplayRole) {
             QString content;
             if (index.column() == 0) {
-                content = Pica::command_names[header.cmd_id];
+                content = QString::fromLatin1(Pica::Regs::GetCommandName(header.cmd_id).c_str());
                 content.append(" ");
             } else if (index.column() == 1) {
                 for (int j = 0; j < cmd.size(); ++j)
diff --git a/src/common/common.vcxproj b/src/common/common.vcxproj
index 1f5c714c3..341d3a813 100644
--- a/src/common/common.vcxproj
+++ b/src/common/common.vcxproj
@@ -182,7 +182,6 @@
     <ClInclude Include="mem_arena.h" />
     <ClInclude Include="msg_handler.h" />
     <ClInclude Include="platform.h" />
-    <ClInclude Include="register_set.h" />
     <ClInclude Include="scm_rev.h" />
     <ClInclude Include="std_condition_variable.h" />
     <ClInclude Include="std_mutex.h" />
diff --git a/src/common/common.vcxproj.filters b/src/common/common.vcxproj.filters
index e8c4ce360..59268ce5a 100644
--- a/src/common/common.vcxproj.filters
+++ b/src/common/common.vcxproj.filters
@@ -29,7 +29,6 @@
     <ClInclude Include="memory_util.h" />
     <ClInclude Include="msg_handler.h" />
     <ClInclude Include="platform.h" />
-    <ClInclude Include="register_set.h" />
     <ClInclude Include="std_condition_variable.h" />
     <ClInclude Include="std_mutex.h" />
     <ClInclude Include="std_thread.h" />
diff --git a/src/common/register_set.h b/src/common/register_set.h
deleted file mode 100644
index ba19a2614..000000000
--- a/src/common/register_set.h
+++ /dev/null
@@ -1,163 +0,0 @@
-// Copyright 2014 Citra Emulator Project
-// Licensed under GPLv2
-// Refer to the license.txt file included.
-
-#pragma once
-
-// Copyright 2014 Tony Wasserka
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-//     * Redistributions of source code must retain the above copyright
-//       notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above copyright
-//       notice, this list of conditions and the following disclaimer in the
-//       documentation and/or other materials provided with the distribution.
-//     * Neither the name of the owner nor the names of its contributors may
-//       be used to endorse or promote products derived from this software
-//       without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-/*
- * Standardized way to define a group of registers and corresponding data structures. To define
- * a new register set, first define struct containing an enumeration called "Id" containing
- * all register IDs and a template struct called "Struct". Specialize the Struct struct for any
- * register ID which needs to be accessed in a specialized way. You can then declare the object
- * containing all register values using the RegisterSet<BaseType, DefiningStruct> type, where
- * BaseType is the underlying type of each register (e.g. u32).
- * Of course, you'll usually want to implement the Struct template such that they are of the same
- * size as BaseType. However, it's also possible to make it larger, e.g. when you want to describe
- * multiple registers with the same structure.
- *
- * Example:
- *
- *     struct Regs {
- *         enum Id : u32 {
- *             Value1 = 0,
- *             Value2 = 1,
- *             Value3 = 2,
- *             NumIds = 3
- *         };
- *
- *         // declare register definition structures
- *         template<Id id>
- *         struct Struct;
- *     };
- *
- *     // Define register set object
- *     RegisterSet<u32, CommandIds> registers;
- *
- *     // define register definition structures
- *     template<>
- *     struct Regs::Struct<Regs::Value1> {
- *         union {
- *             BitField<0, 4, u32> some_field;
- *             BitField<4, 3, u32> some_other_field;
- *         };
- *     };
- *
- * Usage in external code (within SomeNamespace scope):
- *
- *     For a register which maps to a single index:
- *     registers.Get<Regs::Value1>().some_field = some_value;
- *
- *      For a register which maps to different indices, e.g. a group of similar registers
- *     registers.Get<Regs::Value1>(index).some_field = some_value;
- *
- *
- * @tparam BaseType Base type used for storing individual registers, e.g. u32
- * @tparam RegDefinition Class defining an enumeration called "Id" and a template<Id id> struct, as described above.
- * @note RegDefinition::Id needs to have an enum value called NumIds defining the number of registers to be allocated.
- */
-template<typename BaseType, typename RegDefinition>
-struct RegisterSet {
-    // Register IDs
-    using Id = typename RegDefinition::Id;
-
-    // type used for *this
-    using ThisType = RegisterSet<BaseType, RegDefinition>;
-
-    // Register definition structs, defined in RegDefinition
-    template<Id id>
-    using Struct = typename RegDefinition::template Struct<id>;
-
-
-    /*
-     * Lookup register with the given id and return it as the corresponding structure type.
-     * @note This just forwards the arguments to Get(Id).
-     */
-    template<Id id>
-    const Struct<id>& Get() const {
-        return Get<id>(id);
-    }
-
-    /*
-     * Lookup register with the given id and return it as the corresponding structure type.
-     * @note This just forwards the arguments to Get(Id).
-     */
-    template<Id id>
-    Struct<id>& Get() {
-        return Get<id>(id);
-    }
-
-    /*
-     * Lookup register with the given index and return it as the corresponding structure type.
-     * @todo Is this portable with regards to structures larger than BaseType?
-     * @note if index==id, you don't need to specify the function parameter.
-     */
-    template<Id id>
-    const Struct<id>& Get(const Id& index) const {
-        const int idx = static_cast<size_t>(index);
-        return *reinterpret_cast<const Struct<id>*>(&raw[idx]);
-    }
-
-    /*
-     * Lookup register with the given index and return it as the corresponding structure type.
-     * @note This just forwards the arguments to the const version of Get(Id).
-     * @note if index==id, you don't need to specify the function parameter.
-     */
-    template<Id id>
-    Struct<id>& Get(const Id& index) {
-        return const_cast<Struct<id>&>(GetThis().Get<id>(index));
-    }
-
-    /*
-     * Plain array access.
-     * @note If you want to have this casted to a register defininition struct, use Get() instead.
-     */
-    const BaseType& operator[] (const Id& id) const {
-        return raw[static_cast<size_t>(id)];
-    }
-
-    /*
-     * Plain array access.
-     * @note If you want to have this casted to a register defininition struct, use Get() instead.
-     * @note This operator just forwards its argument to the const version.
-     */
-    BaseType& operator[] (const Id& id) {
-        return const_cast<BaseType&>(GetThis()[id]);
-    }
-
-private:
-    /*
-     * Returns a const reference to "this".
-     */
-    const ThisType& GetThis() const {
-        return static_cast<const ThisType&>(*this);
-    }
-
-    BaseType raw[Id::NumIds];
-};
diff --git a/src/core/hle/service/gsp.cpp b/src/core/hle/service/gsp.cpp
index e241b31c8..635f50a53 100644
--- a/src/core/hle/service/gsp.cpp
+++ b/src/core/hle/service/gsp.cpp
@@ -32,7 +32,7 @@ static inline u8* GetCommandBuffer(u32 thread_id) {
     if (0 == g_shared_memory)
         return nullptr;
 
-    return Kernel::GetSharedMemoryPointer(g_shared_memory, 
+    return Kernel::GetSharedMemoryPointer(g_shared_memory,
         0x800 + (thread_id * sizeof(CommandBuffer)));
 }
 
@@ -173,11 +173,11 @@ void ExecuteCommand(const Command& command) {
     case CommandId::SET_COMMAND_LIST_LAST:
     {
         auto& params = command.set_command_list_last;
-        WriteGPURegister(GPU::Regs::CommandProcessor + 2, params.address >> 3);
-        WriteGPURegister(GPU::Regs::CommandProcessor, params.size >> 3);
+        WriteGPURegister(GPU_REG_INDEX(command_processor_config.address), Memory::VirtualToPhysicalAddress(params.address) >> 3);
+        WriteGPURegister(GPU_REG_INDEX(command_processor_config.size), params.size >> 3);
 
         // TODO: Not sure if we are supposed to always write this .. seems to trigger processing though
-        WriteGPURegister(GPU::Regs::CommandProcessor + 4, 1);
+        WriteGPURegister(GPU_REG_INDEX(command_processor_config.trigger), 1);
 
         // TODO: Move this to GPU
         // TODO: Not sure what units the size is measured in
@@ -193,20 +193,28 @@ void ExecuteCommand(const Command& command) {
     case CommandId::SET_MEMORY_FILL:
     {
         auto& params = command.memory_fill;
-        WriteGPURegister(GPU::Regs::MemoryFill, params.start1 >> 3);
-        WriteGPURegister(GPU::Regs::MemoryFill + 1, params.end1 >> 3);
-        WriteGPURegister(GPU::Regs::MemoryFill + 2, params.end1 - params.start1);
-        WriteGPURegister(GPU::Regs::MemoryFill + 3, params.value1);
-
-        WriteGPURegister(GPU::Regs::MemoryFill + 4, params.start2 >> 3);
-        WriteGPURegister(GPU::Regs::MemoryFill + 5, params.end2 >> 3);
-        WriteGPURegister(GPU::Regs::MemoryFill + 6, params.end2 - params.start2);
-        WriteGPURegister(GPU::Regs::MemoryFill + 7, params.value2);
+        WriteGPURegister(GPU_REG_INDEX(memory_fill_config[0].address_start), Memory::VirtualToPhysicalAddress(params.start1) >> 3);
+        WriteGPURegister(GPU_REG_INDEX(memory_fill_config[0].address_end), Memory::VirtualToPhysicalAddress(params.end1) >> 3);
+        WriteGPURegister(GPU_REG_INDEX(memory_fill_config[0].size), params.end1 - params.start1);
+        WriteGPURegister(GPU_REG_INDEX(memory_fill_config[0].value), params.value1);
+
+        WriteGPURegister(GPU_REG_INDEX(memory_fill_config[1].address_start), Memory::VirtualToPhysicalAddress(params.start2) >> 3);
+        WriteGPURegister(GPU_REG_INDEX(memory_fill_config[1].address_end), Memory::VirtualToPhysicalAddress(params.end2) >> 3);
+        WriteGPURegister(GPU_REG_INDEX(memory_fill_config[1].size), params.end2 - params.start2);
+        WriteGPURegister(GPU_REG_INDEX(memory_fill_config[1].value), params.value2);
         break;
     }
 
-    // TODO: Check if texture copies are implemented correctly..
     case CommandId::SET_DISPLAY_TRANSFER:
+    {
+        auto& params = command.image_copy;
+        WriteGPURegister(GPU_REG_INDEX(display_transfer_config.input_address), Memory::VirtualToPhysicalAddress(params.in_buffer_address) >> 3);
+        WriteGPURegister(GPU_REG_INDEX(display_transfer_config.output_address), Memory::VirtualToPhysicalAddress(params.out_buffer_address) >> 3);
+        WriteGPURegister(GPU_REG_INDEX(display_transfer_config.input_size), params.in_buffer_size);
+        WriteGPURegister(GPU_REG_INDEX(display_transfer_config.output_size), params.out_buffer_size);
+        WriteGPURegister(GPU_REG_INDEX(display_transfer_config.flags), params.flags);
+        WriteGPURegister(GPU_REG_INDEX(display_transfer_config.trigger), 1);
+
         // TODO(bunnei): Signalling all of these interrupts here is totally wrong, but it seems to
         // work well enough for running demos. Need to figure out how these all work and trigger
         // them correctly.
@@ -216,19 +224,20 @@ void ExecuteCommand(const Command& command) {
         SignalInterrupt(InterruptId::P3D);
         SignalInterrupt(InterruptId::DMA);
         break;
+    }
 
+    // TODO: Check if texture copies are implemented correctly..
     case CommandId::SET_TEXTURE_COPY:
     {
         auto& params = command.image_copy;
-        WriteGPURegister(GPU::Regs::DisplayTransfer, params.in_buffer_address >> 3);
-        WriteGPURegister(GPU::Regs::DisplayTransfer + 1, params.out_buffer_address >> 3);
-        WriteGPURegister(GPU::Regs::DisplayTransfer + 3, params.in_buffer_size);
-        WriteGPURegister(GPU::Regs::DisplayTransfer + 2, params.out_buffer_size);
-        WriteGPURegister(GPU::Regs::DisplayTransfer + 4, params.flags);
-
-        // TODO: Should this only be ORed with 1 for texture copies?
-        // trigger transfer
-        WriteGPURegister(GPU::Regs::DisplayTransfer + 6, 1);
+        WriteGPURegister(GPU_REG_INDEX(display_transfer_config.input_address), Memory::VirtualToPhysicalAddress(params.in_buffer_address) >> 3);
+        WriteGPURegister(GPU_REG_INDEX(display_transfer_config.output_address), Memory::VirtualToPhysicalAddress(params.out_buffer_address) >> 3);
+        WriteGPURegister(GPU_REG_INDEX(display_transfer_config.input_size), params.in_buffer_size);
+        WriteGPURegister(GPU_REG_INDEX(display_transfer_config.output_size), params.out_buffer_size);
+        WriteGPURegister(GPU_REG_INDEX(display_transfer_config.flags), params.flags);
+
+        // TODO: Should this register be set to 1, or should its value instead be OR-ed with 1?
+        WriteGPURegister(GPU_REG_INDEX(display_transfer_config.trigger), 1);
         break;
     }
 
diff --git a/src/core/hw/gpu.cpp b/src/core/hw/gpu.cpp
index d94c2329b..87cf93bac 100644
--- a/src/core/hw/gpu.cpp
+++ b/src/core/hw/gpu.cpp
@@ -14,106 +14,29 @@
 
 #include "core/hw/gpu.h"
 
+#include "video_core/command_processor.h"
 #include "video_core/video_core.h"
 
 
 namespace GPU {
 
-RegisterSet<u32, Regs> g_regs;
+Regs g_regs;
 
 u32 g_cur_line = 0;         ///< Current vertical screen line
 u64 g_last_line_ticks = 0;  ///< CPU tick count from last vertical screen line
 
-/**
- * Sets whether the framebuffers are in the GSP heap (FCRAM) or VRAM
- * @param
- */
-void SetFramebufferLocation(const FramebufferLocation mode) {
-    switch (mode) {
-    case FRAMEBUFFER_LOCATION_FCRAM:
-    {
-        auto& framebuffer_top = g_regs.Get<Regs::FramebufferTop>();
-        auto& framebuffer_sub = g_regs.Get<Regs::FramebufferBottom>();
-
-        framebuffer_top.address_left1  = PADDR_TOP_LEFT_FRAME1;
-        framebuffer_top.address_left2  = PADDR_TOP_LEFT_FRAME2;
-        framebuffer_top.address_right1 = PADDR_TOP_RIGHT_FRAME1;
-        framebuffer_top.address_right2 = PADDR_TOP_RIGHT_FRAME2;
-        framebuffer_sub.address_left1  = PADDR_SUB_FRAME1;
-        //framebuffer_sub.address_left2  = unknown;
-        framebuffer_sub.address_right1 = PADDR_SUB_FRAME2;
-        //framebuffer_sub.address_right2 = unknown;
-        break;
-    }
-
-    case FRAMEBUFFER_LOCATION_VRAM:
-    {
-        auto& framebuffer_top = g_regs.Get<Regs::FramebufferTop>();
-        auto& framebuffer_sub = g_regs.Get<Regs::FramebufferBottom>();
-
-        framebuffer_top.address_left1  = PADDR_VRAM_TOP_LEFT_FRAME1;
-        framebuffer_top.address_left2  = PADDR_VRAM_TOP_LEFT_FRAME2;
-        framebuffer_top.address_right1 = PADDR_VRAM_TOP_RIGHT_FRAME1;
-        framebuffer_top.address_right2 = PADDR_VRAM_TOP_RIGHT_FRAME2;
-        framebuffer_sub.address_left1  = PADDR_VRAM_SUB_FRAME1;
-        //framebuffer_sub.address_left2  = unknown;
-        framebuffer_sub.address_right1 = PADDR_VRAM_SUB_FRAME2;
-        //framebuffer_sub.address_right2 = unknown;
-        break;
-    }
-    }
-}
-
-/**
- * Gets the location of the framebuffers
- * @return Location of framebuffers as FramebufferLocation enum
- */
-FramebufferLocation GetFramebufferLocation(u32 address) {
-    if ((address & ~Memory::VRAM_MASK) == Memory::VRAM_PADDR) {
-        return FRAMEBUFFER_LOCATION_VRAM;
-    } else if ((address & ~Memory::FCRAM_MASK) == Memory::FCRAM_PADDR) {
-        return FRAMEBUFFER_LOCATION_FCRAM;
-    } else {
-        ERROR_LOG(GPU, "unknown framebuffer location!");
-    }
-    return FRAMEBUFFER_LOCATION_UNKNOWN;
-}
-
-u32 GetFramebufferAddr(const u32 address) {
-    switch (GetFramebufferLocation(address)) {
-    case FRAMEBUFFER_LOCATION_FCRAM:
-        return Memory::VirtualAddressFromPhysical_FCRAM(address);
-    case FRAMEBUFFER_LOCATION_VRAM:
-        return Memory::VirtualAddressFromPhysical_VRAM(address);
-    default:
-        ERROR_LOG(GPU, "unknown framebuffer location");
-    }
-    return 0;
-}
-
-/**
- * Gets a read-only pointer to a framebuffer in memory
- * @param address Physical address of framebuffer
- * @return Returns const pointer to raw framebuffer
- */
-const u8* GetFramebufferPointer(const u32 address) {
-    u32 addr = GetFramebufferAddr(address);
-    return (addr != 0) ? Memory::GetPointer(addr) : nullptr;
-}
-
 template <typename T>
 inline void Read(T &var, const u32 raw_addr) {
     u32 addr = raw_addr - 0x1EF00000;
     int index = addr / 4;
 
     // Reads other than u32 are untested, so I'd rather have them abort than silently fail
-    if (index >= Regs::NumIds || !std::is_same<T,u32>::value)
-    {
+    if (index >= Regs::NumIds() || !std::is_same<T,u32>::value) {
         ERROR_LOG(GPU, "unknown Read%d @ 0x%08X", sizeof(var) * 8, addr);
         return;
     }
 
-    var = g_regs[static_cast<Regs::Id>(addr / 4)];
+    var = g_regs[addr / 4];
 }
 
 template <typename T>
@@ -122,28 +45,28 @@ inline void Write(u32 addr, const T data) {
     int index = addr / 4;
 
     // Writes other than u32 are untested, so I'd rather have them abort than silently fail
-    if (index >= Regs::NumIds || !std::is_same<T,u32>::value)
-    {
+    if (index >= Regs::NumIds() || !std::is_same<T,u32>::value) {
         ERROR_LOG(GPU, "unknown Write%d 0x%08X @ 0x%08X", sizeof(data) * 8, data, addr);
         return;
     }
 
-    g_regs[static_cast<Regs::Id>(index)] = data;
+    g_regs[index] = data;
 
-    switch (static_cast<Regs::Id>(index)) {
+    switch (index) {
 
     // Memory fills are triggered once the fill value is written.
     // NOTE: This is not verified.
-    case Regs::MemoryFill + 3:
-    case Regs::MemoryFill + 7:
+    case GPU_REG_INDEX_WORKAROUND(memory_fill_config[0].value, 0x00004 + 0x3):
+    case GPU_REG_INDEX_WORKAROUND(memory_fill_config[1].value, 0x00008 + 0x3):
     {
-        const auto& config = g_regs.Get<Regs::MemoryFill>(static_cast<Regs::Id>(index - 3));
+        const bool is_second_filler = (index != GPU_REG_INDEX(memory_fill_config[0].value));
+        const auto& config = g_regs.memory_fill_config[is_second_filler];
 
         // TODO: Not sure if this check should be done at GSP level instead
         if (config.address_start) {
             // TODO: Not sure if this algorithm is correct, particularly because it doesn't use the size member at all
-            u32* start = (u32*)Memory::GetPointer(config.GetStartAddress());
-            u32* end = (u32*)Memory::GetPointer(config.GetEndAddress());
+            u32* start = (u32*)Memory::GetPointer(Memory::PhysicalToVirtualAddress(config.GetStartAddress()));
+            u32* end = (u32*)Memory::GetPointer(Memory::PhysicalToVirtualAddress(config.GetEndAddress()));
             for (u32* ptr = start; ptr < end; ++ptr)
                 *ptr = bswap32(config.value); // TODO: This is just a workaround to missing framebuffer format emulation
 
@@ -152,12 +75,12 @@ inline void Write(u32 addr, const T data) {
         break;
     }
 
-    case Regs::DisplayTransfer + 6:
+    case GPU_REG_INDEX(display_transfer_config.trigger):
     {
-        const auto& config = g_regs.Get<Regs::DisplayTransfer>();
+        const auto& config = g_regs.display_transfer_config;
         if (config.trigger & 1) {
-            u8* source_pointer = Memory::GetPointer(config.GetPhysicalInputAddress());
-            u8* dest_pointer = Memory::GetPointer(config.GetPhysicalOutputAddress());
+            u8* source_pointer = Memory::GetPointer(Memory::PhysicalToVirtualAddress(config.GetPhysicalInputAddress()));
+            u8* dest_pointer = Memory::GetPointer(Memory::PhysicalToVirtualAddress(config.GetPhysicalOutputAddress()));
 
             for (int y = 0; y < config.output_height; ++y) {
                 // TODO: Why does the register seem to hold twice the framebuffer width?
@@ -221,14 +144,15 @@ inline void Write(u32 addr, const T data) {
         break;
     }
 
-    case Regs::CommandProcessor + 4:
+    // Seems like writing to this register triggers processing
+    case GPU_REG_INDEX(command_processor_config.trigger):
     {
-        const auto& config = g_regs.Get<Regs::CommandProcessor>();
+        const auto& config = g_regs.command_processor_config;
         if (config.trigger & 1)
         {
-            // u32* buffer = (u32*)Memory::GetPointer(config.address << 3);
-            ERROR_LOG(GPU, "Beginning 0x%08x bytes of commands from address 0x%08x", config.size, config.address << 3);
-            // TODO: Process command list!
+            u32* buffer = (u32*)Memory::GetPointer(Memory::PhysicalToVirtualAddress(config.GetPhysicalAddress()));
+            u32 size = config.size << 3;
+            Pica::CommandProcessor::ProcessCommandList(buffer, size);
         }
         break;
     }
@@ -252,7 +176,7 @@ template void Write<u8>(u32 addr, const u8 data);
 
 /// Update hardware
 void Update() {
-    auto& framebuffer_top = g_regs.Get<Regs::FramebufferTop>();
+    auto& framebuffer_top = g_regs.framebuffer_config[0];
     u64 current_ticks = Core::g_app_core->GetTicks();
 
     // Synchronize line...
@@ -277,11 +201,22 @@ void Init() {
     g_cur_line = 0;
     g_last_line_ticks = Core::g_app_core->GetTicks();
 
-//    SetFramebufferLocation(FRAMEBUFFER_LOCATION_FCRAM);
-    SetFramebufferLocation(FRAMEBUFFER_LOCATION_VRAM);
+    auto& framebuffer_top = g_regs.framebuffer_config[0];
+    auto& framebuffer_sub = g_regs.framebuffer_config[1];
+
+    // Set up default framebuffer addresses (located in VRAM)
+    // ... or at least these are the ones used by system applets.
+    // There's probably a smarter way to come up with addresses
+    // like these which does not require hardcoding.
+    framebuffer_top.address_left1  = 0x181E6000;
+    framebuffer_top.address_left2  = 0x1822C800;
+    framebuffer_top.address_right1 = 0x18273000;
+    framebuffer_top.address_right2 = 0x182B9800;
+    framebuffer_sub.address_left1  = 0x1848F000;
+    //framebuffer_sub.address_left2  = unknown;
+    framebuffer_sub.address_right1 = 0x184C7800;
+    //framebuffer_sub.address_right2 = unknown;
 
-    auto& framebuffer_top = g_regs.Get<Regs::FramebufferTop>();
-    auto& framebuffer_sub = g_regs.Get<Regs::FramebufferBottom>();
     // TODO: Width should be 240 instead?
     framebuffer_top.width = 480;
     framebuffer_top.height = 400;
diff --git a/src/core/hw/gpu.h b/src/core/hw/gpu.h
index 42f18a0e7..d20311a00 100644
--- a/src/core/hw/gpu.h
+++ b/src/core/hw/gpu.h
@@ -4,32 +4,57 @@
 
 #pragma once
 
+#include <cstddef>
+
 #include "common/common_types.h"
 #include "common/bit_field.h"
-#include "common/register_set.h"
 
 namespace GPU {
 
 static const u32 kFrameCycles   = 268123480 / 60;   ///< 268MHz / 60 frames per second
 static const u32 kFrameTicks    = kFrameCycles / 3; ///< Approximate number of instructions/frame
 
+// Returns the index corresponding to the Regs member labeled by field_name
+// TODO: Due to Visual Studio bug 209229, offsetof does not return constant expressions
+//       when used with array elements (e.g. GPU_REG_INDEX(memory_fill_config[0])).
+//       For details cf. https://connect.microsoft.com/VisualStudio/feedback/details/209229/offsetof-does-not-produce-a-constant-expression-for-array-members
+//       Hopefully, this will be fixed sometime in the future.
+//       For lack of better alternatives, we currently hardcode the offsets when constant
+//       expressions are needed via GPU_REG_INDEX_WORKAROUND (on sane compilers, a compile-time
+//       check will then make sure the offsets indeed match the automatically calculated ones).
+#define GPU_REG_INDEX(field_name) (offsetof(GPU::Regs, field_name) / sizeof(u32))
+#if defined(_MSC_VER)
+#define GPU_REG_INDEX_WORKAROUND(field_name, backup_workaround_index) (backup_workaround_index)
+#else
+// NOTE: Yeah, hacking in a compile-time check here just to work around the lacking MSVC
+//       compiler really is this annoying. This macro just forwards its first argument to
+//       GPU_REG_INDEX and then performs a (no-op) cast to size_t iff the second argument
+//       matches the expected field offset. Otherwise, the code will fail to compile.
+#define GPU_REG_INDEX_WORKAROUND(field_name, backup_workaround_index) \
+    ((typename std::enable_if<backup_workaround_index == GPU_REG_INDEX(field_name), size_t>::type)GPU_REG_INDEX(field_name))
+#endif
+
 // MMIO region 0x1EFxxxxx
 struct Regs {
-    enum Id : u32 {
-        MemoryFill                = 0x00004, // + 5,6,7; second block at 8-11
-
-        FramebufferTop            = 0x00117, // + 11a,11b,11c,11d(?),11e...126
-        FramebufferBottom         = 0x00157, // + 15a,15b,15c,15d(?),15e...166
 
-        DisplayTransfer           = 0x00300, // + 301,302,303,304,305,306
-
-        CommandProcessor          = 0x00638, // + 63a,63c
-
-        NumIds                    = 0x01000
-    };
-
-    template<Id id>
-    struct Struct;
+// helper macro to properly align structure members.
+// Calling INSERT_PADDING_WORDS will add a new member variable with a name like "pad121",
+// depending on the current source line to make sure variable names are unique.
+#define INSERT_PADDING_WORDS_HELPER1(x, y) x ## y
+#define INSERT_PADDING_WORDS_HELPER2(x, y) INSERT_PADDING_WORDS_HELPER1(x, y)
+#define INSERT_PADDING_WORDS(num_words) u32 INSERT_PADDING_WORDS_HELPER2(pad, __LINE__)[(num_words)];
+
+// helper macro to make sure the defined structures are of the expected size.
+#if defined(_MSC_VER)
+// TODO: MSVC does not support using sizeof() on non-static data members even though this
+//       is technically allowed since C++11. This macro should be enabled once MSVC adds
+//       support for that.
+#define ASSERT_MEMBER_SIZE(name, size_in_bytes)
+#else
+#define ASSERT_MEMBER_SIZE(name, size_in_bytes)  \
+    static_assert(sizeof(name) == size_in_bytes, \
+                  "Structure size and register block length don't match");
+#endif
 
     enum class FramebufferFormat : u32 {
         RGBA8  = 0,
@@ -38,201 +63,191 @@ struct Regs {
         RGB5A1 = 3,
         RGBA4  = 4,
     };
-};
 
-template<>
-struct Regs::Struct<Regs::MemoryFill> {
-    u32 address_start;
-    u32 address_end; // ?
-    u32 size;
-    u32 value; // ?
+    INSERT_PADDING_WORDS(0x4);
 
-    inline u32 GetStartAddress() const {
-        return address_start * 8;
-    }
+    struct {
+        u32 address_start;
+        u32 address_end; // ?
+        u32 size;
+        u32 value; // ?
 
-    inline u32 GetEndAddress() const {
-        return address_end * 8;
-    }
-};
-static_assert(sizeof(Regs::Struct<Regs::MemoryFill>) == 0x10, "Structure size and register block length don't match");
+        inline u32 GetStartAddress() const {
+            return DecodeAddressRegister(address_start);
+        }
 
-template<>
-struct Regs::Struct<Regs::FramebufferTop> {
-    using Format = Regs::FramebufferFormat;
+        inline u32 GetEndAddress() const {
+            return DecodeAddressRegister(address_end);
+        }
+    } memory_fill_config[2];
+    ASSERT_MEMBER_SIZE(memory_fill_config[0], 0x10);
 
-    union {
-        u32 size;
+    INSERT_PADDING_WORDS(0x10b);
 
-        BitField< 0, 16, u32> width;
-        BitField<16, 16, u32> height;
-    };
+    struct {
+        using Format = Regs::FramebufferFormat;
 
-    u32 pad0[2];
+        union {
+            u32 size;
 
-    u32 address_left1;
-    u32 address_left2;
+            BitField< 0, 16, u32> width;
+            BitField<16, 16, u32> height;
+        };
 
-    union {
-        u32 format;
+        INSERT_PADDING_WORDS(0x2);
 
-        BitField< 0, 3, Format> color_format;
-    };
+        u32 address_left1;
+        u32 address_left2;
 
-    u32 pad1;
+        union {
+            u32 format;
 
-    union {
-        u32 active_fb;
+            BitField< 0, 3, Format> color_format;
+        };
 
-        // 0: Use parameters ending with "1"
-        // 1: Use parameters ending with "2"
-        BitField<0, 1, u32> second_fb_active;
-    };
+        INSERT_PADDING_WORDS(0x1);
 
-    u32 pad2[5];
+        union {
+            u32 active_fb;
 
-    // Distance between two pixel rows, in bytes
-    u32 stride;
+            // 0: Use parameters ending with "1"
+            // 1: Use parameters ending with "2"
+            BitField<0, 1, u32> second_fb_active;
+        };
 
-    u32 address_right1;
-    u32 address_right2;
-};
+        INSERT_PADDING_WORDS(0x5);
 
-template<>
-struct Regs::Struct<Regs::FramebufferBottom> : public Regs::Struct<Regs::FramebufferTop> {
-};
-static_assert(sizeof(Regs::Struct<Regs::FramebufferTop>) == 0x40, "Structure size and register block length don't match");
+        // Distance between two pixel rows, in bytes
+        u32 stride;
 
-template<>
-struct Regs::Struct<Regs::DisplayTransfer> {
-    using Format = Regs::FramebufferFormat;
+        u32 address_right1;
+        u32 address_right2;
 
-    u32 input_address;
-    u32 output_address;
+        INSERT_PADDING_WORDS(0x30);
+    } framebuffer_config[2];
+    ASSERT_MEMBER_SIZE(framebuffer_config[0], 0x100);
 
-    inline u32 GetPhysicalInputAddress() const {
-        return input_address * 8;
-    }
+    INSERT_PADDING_WORDS(0x169);
 
-    inline u32 GetPhysicalOutputAddress() const {
-        return output_address * 8;
-    }
+    struct {
+        using Format = Regs::FramebufferFormat;
 
-    union {
-        u32 output_size;
+        u32 input_address;
+        u32 output_address;
 
-        BitField< 0, 16, u32> output_width;
-        BitField<16, 16, u32> output_height;
-    };
+        inline u32 GetPhysicalInputAddress() const {
+            return DecodeAddressRegister(input_address);
+        }
 
-    union {
-        u32 input_size;
+        inline u32 GetPhysicalOutputAddress() const {
+            return DecodeAddressRegister(output_address);
+        }
 
-        BitField< 0, 16, u32> input_width;
-        BitField<16, 16, u32> input_height;
-    };
+        union {
+            u32 output_size;
 
-    union {
-        u32 flags;
+            BitField< 0, 16, u32> output_width;
+            BitField<16, 16, u32> output_height;
+        };
 
-        BitField< 0, 1, u32> flip_data;        // flips input data horizontally (TODO) if true
-        BitField< 8, 3, Format> input_format;
-        BitField<12, 3, Format> output_format;
-        BitField<16, 1, u32> output_tiled;     // stores output in a tiled format
-    };
+        union {
+            u32 input_size;
 
-    u32 unknown;
+            BitField< 0, 16, u32> input_width;
+            BitField<16, 16, u32> input_height;
+        };
 
-    // it seems that writing to this field triggers the display transfer
-    u32 trigger;
-};
-static_assert(sizeof(Regs::Struct<Regs::DisplayTransfer>) == 0x1C, "Structure size and register block length don't match");
+        union {
+            u32 flags;
 
-template<>
-struct Regs::Struct<Regs::CommandProcessor> {
-    // command list size
-    u32 size;
+            BitField< 0, 1, u32> flip_data;        // flips input data horizontally (TODO) if true
+            BitField< 8, 3, Format> input_format;
+            BitField<12, 3, Format> output_format;
+            BitField<16, 1, u32> output_tiled;     // stores output in a tiled format
+        };
 
-    u32 pad0;
+        INSERT_PADDING_WORDS(0x1);
 
-    // command list address
-    u32 address;
+        // it seems that writing to this field triggers the display transfer
+        u32 trigger;
+    } display_transfer_config;
+    ASSERT_MEMBER_SIZE(display_transfer_config, 0x1c);
 
-    u32 pad1;
+    INSERT_PADDING_WORDS(0x331);
 
-    // it seems that writing to this field triggers command list processing
-    u32 trigger;
-};
-static_assert(sizeof(Regs::Struct<Regs::CommandProcessor>) == 0x14, "Structure size and register block length don't match");
-
-
-extern RegisterSet<u32, Regs> g_regs;
-
-enum {
-    TOP_ASPECT_X        = 0x5,
-    TOP_ASPECT_Y        = 0x3,
-
-    TOP_HEIGHT          = 240,
-    TOP_WIDTH           = 400,
-    BOTTOM_WIDTH        = 320,
-
-    // Physical addresses in FCRAM (chosen arbitrarily)
-    PADDR_TOP_LEFT_FRAME1       = 0x201D4C00,
-    PADDR_TOP_LEFT_FRAME2       = 0x202D4C00,
-    PADDR_TOP_RIGHT_FRAME1      = 0x203D4C00,
-    PADDR_TOP_RIGHT_FRAME2      = 0x204D4C00,
-    PADDR_SUB_FRAME1            = 0x205D4C00,
-    PADDR_SUB_FRAME2            = 0x206D4C00,
-    // Physical addresses in FCRAM used by ARM9 applications
-/*    PADDR_TOP_LEFT_FRAME1       = 0x20184E60,
-    PADDR_TOP_LEFT_FRAME2       = 0x201CB370,
-    PADDR_TOP_RIGHT_FRAME1      = 0x20282160,
-    PADDR_TOP_RIGHT_FRAME2      = 0x202C8670,
-    PADDR_SUB_FRAME1            = 0x202118E0,
-    PADDR_SUB_FRAME2            = 0x20249CF0,*/
-
-    // Physical addresses in VRAM
-    // TODO: These should just be deduced from the ones above
-    PADDR_VRAM_TOP_LEFT_FRAME1  = 0x181D4C00,
-    PADDR_VRAM_TOP_LEFT_FRAME2  = 0x182D4C00,
-    PADDR_VRAM_TOP_RIGHT_FRAME1 = 0x183D4C00,
-    PADDR_VRAM_TOP_RIGHT_FRAME2 = 0x184D4C00,
-    PADDR_VRAM_SUB_FRAME1       = 0x185D4C00,
-    PADDR_VRAM_SUB_FRAME2       = 0x186D4C00,
-    // Physical addresses in VRAM used by ARM9 applications
-/*    PADDR_VRAM_TOP_LEFT_FRAME2  = 0x181CB370,
-    PADDR_VRAM_TOP_RIGHT_FRAME1 = 0x18282160,
-    PADDR_VRAM_TOP_RIGHT_FRAME2 = 0x182C8670,
-    PADDR_VRAM_SUB_FRAME1       = 0x182118E0,
-    PADDR_VRAM_SUB_FRAME2       = 0x18249CF0,*/
-};
+    struct {
+        // command list size
+        u32 size;
 
-/// Framebuffer location
-enum FramebufferLocation {
-    FRAMEBUFFER_LOCATION_UNKNOWN,   ///< Framebuffer location is unknown
-    FRAMEBUFFER_LOCATION_FCRAM,     ///< Framebuffer is in the GSP heap
-    FRAMEBUFFER_LOCATION_VRAM,      ///< Framebuffer is in VRAM
-};
+        INSERT_PADDING_WORDS(0x1);
+
+        // command list address
+        u32 address;
+
+        INSERT_PADDING_WORDS(0x1);
+
+        // it seems that writing to this field triggers command list processing
+        u32 trigger;
+
+        inline u32 GetPhysicalAddress() const {
+            return DecodeAddressRegister(address);
+        }
+    } command_processor_config;
+    ASSERT_MEMBER_SIZE(command_processor_config, 0x14);
 
-/**
- * Sets whether the framebuffers are in the GSP heap (FCRAM) or VRAM
- * @param
- */
-void SetFramebufferLocation(const FramebufferLocation mode);
-
-/**
- * Gets a read-only pointer to a framebuffer in memory
- * @param address Physical address of framebuffer
- * @return Returns const pointer to raw framebuffer
- */
-const u8* GetFramebufferPointer(const u32 address);
-
-u32 GetFramebufferAddr(const u32 address);
-
-/**
- * Gets the location of the framebuffers
- */
-FramebufferLocation GetFramebufferLocation(u32 address);
+    INSERT_PADDING_WORDS(0x9c3);
+
+#undef INSERT_PADDING_WORDS_HELPER1
+#undef INSERT_PADDING_WORDS_HELPER2
+#undef INSERT_PADDING_WORDS
+
+    static inline int NumIds() {
+        return sizeof(Regs) / sizeof(u32);
+    }
+
+    u32& operator [] (int index) const {
+        u32* content = (u32*)this;
+        return content[index];
+    }
+
+    u32& operator [] (int index) {
+        u32* content = (u32*)this;
+        return content[index];
+    }
+
+private:
+    /*
+     * Most physical addresses which GPU registers refer to are 8-byte aligned.
+     * This function should be used to get the address from a raw register value.
+     */
+    static inline u32 DecodeAddressRegister(u32 register_value) {
+        return register_value * 8;
+    }
+};
+static_assert(std::is_standard_layout<Regs>::value, "Structure does not use standard layout");
+
+// TODO: MSVC does not support using offsetof() on non-static data members even though this
+//       is technically allowed since C++11. This macro should be enabled once MSVC adds
+//       support for that.
+#ifndef _MSC_VER
+#define ASSERT_REG_POSITION(field_name, position)             \
+    static_assert(offsetof(Regs, field_name) == position * 4, \
+                  "Field "#field_name" has invalid position")
+
+ASSERT_REG_POSITION(memory_fill_config[0],    0x00004);
+ASSERT_REG_POSITION(memory_fill_config[1],    0x00008);
+ASSERT_REG_POSITION(framebuffer_config[0],    0x00117);
+ASSERT_REG_POSITION(framebuffer_config[1],    0x00157);
+ASSERT_REG_POSITION(display_transfer_config,  0x00300);
+ASSERT_REG_POSITION(command_processor_config, 0x00638);
+
+#undef ASSERT_REG_POSITION
+#endif // !defined(_MSC_VER)
+
+// The total number of registers is chosen arbitrarily, but let's make sure it's not some odd value anyway.
+static_assert(sizeof(Regs) == 0x1000 * sizeof(u32), "Invalid total size of register set");
+
+extern Regs g_regs;
 
 template <typename T>
 void Read(T &var, const u32 addr);
diff --git a/src/core/mem_map.cpp b/src/core/mem_map.cpp
index c45746be9..14fc01471 100644
--- a/src/core/mem_map.cpp
+++ b/src/core/mem_map.cpp
@@ -72,14 +72,14 @@ void Init() {
 
     g_base = MemoryMap_Setup(g_views, kNumMemViews, flags, &g_arena);
 
-    NOTICE_LOG(MEMMAP, "initialized OK, RAM at %p (mirror at 0 @ %p)", g_heap, 
+    NOTICE_LOG(MEMMAP, "initialized OK, RAM at %p (mirror at 0 @ %p)", g_heap,
         g_physical_fcram);
 }
 
 void Shutdown() {
     u32 flags = 0;
     MemoryMap_Shutdown(g_views, kNumMemViews, flags, &g_arena);
-    
+
     g_arena.ReleaseSpace();
     g_base = NULL;
 
diff --git a/src/core/mem_map.h b/src/core/mem_map.h
index 12941f558..3c7810573 100644
--- a/src/core/mem_map.h
+++ b/src/core/mem_map.h
@@ -14,7 +14,6 @@ namespace Memory {
 enum {
     BOOTROM_SIZE            = 0x00010000,   ///< Bootrom (super secret code/data @ 0x8000) size
     MPCORE_PRIV_SIZE        = 0x00002000,   ///< MPCore private memory region size
-    VRAM_SIZE               = 0x00600000,   ///< VRAM size
     DSP_SIZE                = 0x00080000,   ///< DSP memory size
     AXI_WRAM_SIZE           = 0x00080000,   ///< AXI WRAM size
 
@@ -23,8 +22,6 @@ enum {
     FCRAM_PADDR_END         = (FCRAM_PADDR + FCRAM_SIZE),       ///< FCRAM end of physical space
     FCRAM_VADDR             = 0x08000000,                       ///< FCRAM virtual address
     FCRAM_VADDR_END         = (FCRAM_VADDR + FCRAM_SIZE),       ///< FCRAM end of virtual space
-    FCRAM_VADDR_FW0B        = 0xF0000000,                       ///< FCRAM adress for firmare FW0B
-    FCRAM_VADDR_FW0B_END    = (FCRAM_VADDR_FW0B + FCRAM_SIZE),  ///< FCRAM adress end for FW0B
     FCRAM_MASK              = (FCRAM_SIZE - 1),                 ///< FCRAM mask
 
     SHARED_MEMORY_SIZE      = 0x04000000,   ///< Shared memory size
@@ -73,6 +70,7 @@ enum {
     HARDWARE_IO_PADDR_END   = (HARDWARE_IO_PADDR + HARDWARE_IO_SIZE),
     HARDWARE_IO_VADDR_END   = (HARDWARE_IO_VADDR + HARDWARE_IO_SIZE),
 
+    VRAM_SIZE               = 0x00600000,
     VRAM_PADDR              = 0x18000000,
     VRAM_VADDR              = 0x1F000000,
     VRAM_PADDR_END          = (VRAM_PADDR + VRAM_SIZE),
@@ -112,7 +110,7 @@ struct MemoryBlock {
 
 // In 64-bit, this might point to "high memory" (above the 32-bit limit),
 // so be sure to load it into a 64-bit register.
-extern u8 *g_base; 
+extern u8 *g_base;
 
 // These are guaranteed to point to "low memory" addresses (sub-32-bit).
 // 64-bit: Pointers to low-mem (sub-0x10000000) mirror
@@ -147,7 +145,7 @@ void Write32(const u32 addr, const u32 data);
 
 void WriteBlock(const u32 addr, const u8* data, const int size);
 
-u8* GetPointer(const u32 Address);
+u8* GetPointer(const u32 virtual_address);
 
 /**
  * Maps a block of memory on the heap
@@ -169,16 +167,10 @@ inline const char* GetCharPointer(const u32 address) {
     return (const char *)GetPointer(address);
 }
 
-inline const u32 VirtualAddressFromPhysical_FCRAM(const u32 address) {
-    return ((address & FCRAM_MASK) | FCRAM_VADDR);
-}
-
-inline const u32 VirtualAddressFromPhysical_IO(const u32 address) {
-    return (address + 0x0EB00000);
-}
+/// Converts a physical address to virtual address
+u32 PhysicalToVirtualAddress(const u32 addr);
 
-inline const u32 VirtualAddressFromPhysical_VRAM(const u32 address) {
-    return (address + 0x07000000);
-}
+/// Converts a virtual address to physical address
+u32 VirtualToPhysicalAddress(const u32 addr);
 
 } // namespace
diff --git a/src/core/mem_map_funcs.cpp b/src/core/mem_map_funcs.cpp
index 305be8468..5772cca52 100644
--- a/src/core/mem_map_funcs.cpp
+++ b/src/core/mem_map_funcs.cpp
@@ -17,37 +17,44 @@ std::map<u32, MemoryBlock> g_heap_map;
 std::map<u32, MemoryBlock> g_heap_gsp_map;
 std::map<u32, MemoryBlock> g_shared_map;
 
-/// Convert a physical address (or firmware-specific virtual address) to primary virtual address
-u32 _VirtualAddress(const u32 addr) {
-    // Our memory interface read/write functions assume virtual addresses. Put any physical address 
-    // to virtual address translations here. This is obviously quite hacky... But we're not doing 
-    // any MMU emulation yet or anything
-    if ((addr >= FCRAM_PADDR) && (addr < FCRAM_PADDR_END)) {
-        return VirtualAddressFromPhysical_FCRAM(addr);
-
-    // Virtual address mapping FW0B
-    } else if ((addr >= FCRAM_VADDR_FW0B) && (addr < FCRAM_VADDR_FW0B_END)) {
-        return VirtualAddressFromPhysical_FCRAM(addr);
-
-    // Hardware IO
-    // TODO(bunnei): FixMe
-    // This isn't going to work... The physical address of HARDWARE_IO conflicts with the virtual 
-    // address of shared memory.
-    //} else if ((addr >= HARDWARE_IO_PADDR) && (addr < HARDWARE_IO_PADDR_END)) {
-    //    return (addr + 0x0EB00000);
+/// Translates a physical address inside VRAM or FCRAM to its virtual counterpart
+u32 PhysicalToVirtualAddress(const u32 addr) {
+    // The memory read/write interface only understands virtual addresses, so every known
+    // physical window is remapped here by hand.
+    // TODO: This is a stop-gap until proper MMU emulation exists.
+    const bool in_vram  = (VRAM_PADDR <= addr) && (addr < VRAM_PADDR_END);
+    const bool in_fcram = (FCRAM_PADDR <= addr) && (addr < FCRAM_PADDR_END);
+    if (in_vram)
+        return addr - VRAM_PADDR + VRAM_VADDR;
+    if (in_fcram)
+        return addr - FCRAM_PADDR + FCRAM_VADDR;
+
+    ERROR_LOG(MEMMAP, "Unknown physical address @ 0x%08x", addr);
+    return addr;
+}
 
+/// Convert a virtual address to physical address
+u32 VirtualToPhysicalAddress(const u32 addr) {
+    // Inverse of PhysicalToVirtualAddress. Put any virtual address to physical address
+    // translations here. This is quite hacky, but necessary until we implement
+    // proper MMU emulation. Unknown addresses are logged and returned unchanged.
+    // TODO: Replace these special cases with a proper address translation layer.
+    if ((addr >= VRAM_VADDR) && (addr < VRAM_VADDR_END)) {
+        return addr - VRAM_VADDR + VRAM_PADDR;
+    } else if ((addr >= FCRAM_VADDR) && (addr < FCRAM_VADDR_END)) {
+        return addr - FCRAM_VADDR + FCRAM_PADDR;
     }
+
+    ERROR_LOG(MEMMAP, "Unknown virtual address @ 0x%08x", addr);
     return addr;
 }
 
 template <typename T>
-inline void Read(T &var, const u32 addr) {
+inline void Read(T &var, const u32 vaddr) {
     // TODO: Figure out the fastest order of tests for both read and write (they are probably different).
     // TODO: Make sure this represents the mirrors in a correct way.
     // Could just do a base-relative read, too.... TODO
 
-    const u32 vaddr = _VirtualAddress(addr);
-
     // Kernel memory command buffer
     if (vaddr >= KERNEL_MEMORY_VADDR && vaddr < KERNEL_MEMORY_VADDR_END) {
         var = *((const T*)&g_kernel_mem[vaddr & KERNEL_MEMORY_MASK]);
@@ -91,9 +98,8 @@ inline void Read(T &var, const u32 addr) {
 }
 
 template <typename T>
-inline void Write(u32 addr, const T data) {
-    u32 vaddr = _VirtualAddress(addr);
-    
+inline void Write(u32 vaddr, const T data) {
+
     // Kernel memory command buffer
     if (vaddr >= KERNEL_MEMORY_VADDR && vaddr < KERNEL_MEMORY_VADDR_END) {
         *(T*)&g_kernel_mem[vaddr & KERNEL_MEMORY_MASK] = data;
@@ -133,16 +139,14 @@ inline void Write(u32 addr, const T data) {
     //    _assert_msg_(MEMMAP, false, "umimplemented write to Configuration Memory");
     //} else if ((vaddr & 0xFFFFF000) == 0x1FF81000) {
     //    _assert_msg_(MEMMAP, false, "umimplemented write to shared page");
-    
+
     // Error out...
     } else {
         ERROR_LOG(MEMMAP, "unknown Write%d 0x%08X @ 0x%08X", sizeof(data) * 8, data, vaddr);
     }
 }
 
-u8 *GetPointer(const u32 addr) {
-    const u32 vaddr = _VirtualAddress(addr);
-
+u8 *GetPointer(const u32 vaddr) {
     // Kernel memory command buffer
     if (vaddr >= KERNEL_MEMORY_VADDR && vaddr < KERNEL_MEMORY_VADDR_END) {
         return g_kernel_mem + (vaddr & KERNEL_MEMORY_MASK);
@@ -185,12 +189,12 @@ u8 *GetPointer(const u32 addr) {
  */
 u32 MapBlock_Heap(u32 size, u32 operation, u32 permissions) {
     MemoryBlock block;
-    
+
     block.base_address  = HEAP_VADDR;
     block.size          = size;
     block.operation     = operation;
     block.permissions   = permissions;
-    
+
     if (g_heap_map.size() > 0) {
         const MemoryBlock last_block = g_heap_map.rbegin()->second;
         block.address = last_block.address + last_block.size;
@@ -208,12 +212,12 @@ u32 MapBlock_Heap(u32 size, u32 operation, u32 permissions) {
  */
 u32 MapBlock_HeapGSP(u32 size, u32 operation, u32 permissions) {
     MemoryBlock block;
-    
+
     block.base_address  = HEAP_GSP_VADDR;
     block.size          = size;
     block.operation     = operation;
     block.permissions   = permissions;
-    
+
     if (g_heap_gsp_map.size() > 0) {
         const MemoryBlock last_block = g_heap_gsp_map.rbegin()->second;
         block.address = last_block.address + last_block.size;
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index e43e6e1bb..8e7b93acb 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -1,10 +1,21 @@
-set(SRCS    video_core.cpp
+set(SRCS    clipper.cpp
+            command_processor.cpp
+            primitive_assembly.cpp
+            rasterizer.cpp
             utils.cpp
+            vertex_shader.cpp
+            video_core.cpp
             renderer_opengl/renderer_opengl.cpp)
 
-set(HEADERS video_core.h
+set(HEADERS clipper.h
+            command_processor.h
+            math.h
+            primitive_assembly.h
+            rasterizer.h
             utils.h
+            video_core.h
             renderer_base.h
+            vertex_shader.h
             renderer_opengl/renderer_opengl.h)
 
 add_library(video_core STATIC ${SRCS} ${HEADERS})
diff --git a/src/video_core/clipper.cpp b/src/video_core/clipper.cpp
new file mode 100644
index 000000000..b7180328c
--- /dev/null
+++ b/src/video_core/clipper.cpp
@@ -0,0 +1,179 @@
+// Copyright 2014 Citra Emulator Project
+// Licensed under GPLv2
+// Refer to the license.txt file included.
+
+#include <vector>
+
+#include "clipper.h"
+#include "pica.h"
+#include "rasterizer.h"
+#include "vertex_shader.h"
+
+namespace Pica {
+
+namespace Clipper {
+/// One plane of the clip volume; classifies vertices against it and computes intersections with it
+struct ClippingEdge {
+public:
+    enum Type {
+        POS_X = 0,
+        NEG_X = 1,
+        POS_Y = 2,
+        NEG_Y = 3,
+        POS_Z = 4,
+        NEG_Z = 5,
+    };
+    /// `position` scales the vertex's w component in the IsInside() comparison
+    ClippingEdge(Type type, float24 position) : type(type), pos(position) {}
+    /// Returns true if the vertex coordinate selected by `type` is within pos * w (boundary included)
+    bool IsInside(const OutputVertex& vertex) const {
+        switch (type) {
+        case POS_X: return vertex.pos.x <= pos * vertex.pos.w;
+        case NEG_X: return vertex.pos.x >= pos * vertex.pos.w;
+        case POS_Y: return vertex.pos.y <= pos * vertex.pos.w;
+        case NEG_Y: return vertex.pos.y >= pos * vertex.pos.w;
+
+        // TODO: Check z compares ... should be 0..1 instead?
+        case POS_Z: return vertex.pos.z <= pos * vertex.pos.w;
+
+        default:
+        case NEG_Z: return vertex.pos.z >= pos * vertex.pos.w;
+        }
+    }
+    /// Negation of IsInside()
+    bool IsOutSide(const OutputVertex& vertex) const {
+        return !IsInside(vertex);
+    }
+    /// Interpolates between v0 and v1 via OutputVertex::Lerp; note the weights below do not use `pos`
+    OutputVertex GetIntersection(const OutputVertex& v0, const OutputVertex& v1) const {
+        auto dotpr = [this](const OutputVertex& vtx) {
+            switch (type) {
+            case POS_X: return vtx.pos.x - vtx.pos.w;
+            case NEG_X: return -vtx.pos.x - vtx.pos.w;
+            case POS_Y: return vtx.pos.y - vtx.pos.w;
+            case NEG_Y: return -vtx.pos.y - vtx.pos.w;
+
+            // TODO: Verify z clipping
+            case POS_Z: return vtx.pos.z - vtx.pos.w;
+
+            default:
+            case NEG_Z: return -vtx.pos.w;
+            }
+        };
+
+        float24 dp = dotpr(v0);
+        float24 dp_prev = dotpr(v1);
+        float24 factor = dp_prev / (dp_prev - dp);
+
+        return OutputVertex::Lerp(factor, v0, v1);
+    }
+
+private:
+    Type type;
+    float24 pos;
+};
+
+/// Derives vtx.screenpos from vtx.pos using the viewport parameters stored in the PICA registers
+static void InitScreenCoordinates(OutputVertex& vtx) {
+    // Viewport scale factors and offsets, decoded from the current register state
+    float24 halfsize_x = float24::FromRawFloat24(registers.viewport_size_x);
+    float24 halfsize_y = float24::FromRawFloat24(registers.viewport_size_y);
+    float24 offset_x   = float24::FromFloat32(registers.viewport_corner.x);
+    float24 offset_y   = float24::FromFloat32(registers.viewport_corner.y);
+    float24 zscale     = float24::FromRawFloat24(registers.viewport_depth_range);
+    float24 offset_z   = float24::FromRawFloat24(registers.viewport_depth_far_plane);
+
+    float24 one = float24::FromFloat32(1.0);
+    float24 two = float24::FromFloat32(2.0);
+
+    // Divide by w first; the results are then scaled and offset per axis
+    float24 ndc_x = vtx.pos.x / vtx.pos.w;
+    float24 ndc_y = vtx.pos.y / vtx.pos.w;
+    float24 ndc_z = vtx.pos.z / vtx.pos.w;
+
+    // TODO: Not sure why the viewport width needs to be divided by 2 but the viewport height does not
+    vtx.screenpos[0] = (ndc_x + one) * halfsize_x / two + offset_x;
+    vtx.screenpos[1] = (ndc_y + one) * halfsize_y + offset_y;
+    vtx.screenpos[2] = offset_z - ndc_z * zscale;
+}
+
+/// Clips the triangle against the six clipping planes, then rasterizes the result as a triangle fan
+void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) {
+    // TODO (neobrain):
+    // The list of output vertices has some fixed maximum size,
+    // however I haven't taken the time to figure out what it is exactly.
+    // For now, we hence just assume a maximal size of 1000 vertices.
+    const size_t max_vertices = 1000;
+    std::vector<OutputVertex> buffer_vertices;
+    std::vector<OutputVertex*> output_list{ &v0, &v1, &v2 };
+
+    // Make sure to reserve space for all vertices.
+    // Without this, buffer reallocation would invalidate references.
+    buffer_vertices.reserve(max_vertices);
+
+    // Simple implementation of the Sutherland-Hodgman clipping algorithm.
+    // TODO: Make this less inefficient (currently lots of useless buffering overhead happens here)
+    for (auto edge : { ClippingEdge(ClippingEdge::POS_X, float24::FromFloat32(+1.0)),
+                       ClippingEdge(ClippingEdge::NEG_X, float24::FromFloat32(-1.0)),
+                       ClippingEdge(ClippingEdge::POS_Y, float24::FromFloat32(+1.0)),
+                       ClippingEdge(ClippingEdge::NEG_Y, float24::FromFloat32(-1.0)),
+                       ClippingEdge(ClippingEdge::POS_Z, float24::FromFloat32(+1.0)),
+                       ClippingEdge(ClippingEdge::NEG_Z, float24::FromFloat32(-1.0)) }) {
+
+        const std::vector<OutputVertex*> input_list = output_list;
+        output_list.clear();
+
+        const OutputVertex* reference_vertex = input_list.back();
+
+        for (const auto& vertex : input_list) {
+            // NOTE: This algorithm changes vertex order in some cases!
+            if (edge.IsInside(*vertex)) {
+                if (edge.IsOutSide(*reference_vertex)) {
+                    buffer_vertices.push_back(edge.GetIntersection(*vertex, *reference_vertex));
+                    output_list.push_back(&(buffer_vertices.back()));
+                }
+
+                output_list.push_back(vertex);
+            } else if (edge.IsInside(*reference_vertex)) {
+                buffer_vertices.push_back(edge.GetIntersection(*vertex, *reference_vertex));
+                output_list.push_back(&(buffer_vertices.back()));
+            }
+
+            reference_vertex = vertex;
+        }
+
+        // Need to have at least a full triangle to continue...
+        if (output_list.size() < 3)
+            return;
+    }
+
+    InitScreenCoordinates(*(output_list[0]));
+    InitScreenCoordinates(*(output_list[1]));
+
+    for (int i = 0; i < output_list.size() - 2; i ++) {
+        OutputVertex& vtx0 = *(output_list[0]);
+        OutputVertex& vtx1 = *(output_list[i+1]);
+        OutputVertex& vtx2 = *(output_list[i+2]);
+
+        InitScreenCoordinates(vtx2);
+
+        DEBUG_LOG(GPU,
+                  "Triangle %d/%d (%d buffer vertices) at position (%.3f, %.3f, %.3f, %.3f), "
+                  "(%.3f, %.3f, %.3f, %.3f), (%.3f, %.3f, %.3f, %.3f) and "
+                  "screen position (%.2f, %.2f, %.2f), (%.2f, %.2f, %.2f), (%.2f, %.2f, %.2f)",
+                  i, (int)output_list.size(), (int)buffer_vertices.size(),
+                  vtx0.pos.x.ToFloat32(), vtx0.pos.y.ToFloat32(), vtx0.pos.z.ToFloat32(), vtx0.pos.w.ToFloat32(),
+                  vtx1.pos.x.ToFloat32(), vtx1.pos.y.ToFloat32(), vtx1.pos.z.ToFloat32(), vtx1.pos.w.ToFloat32(),
+                  vtx2.pos.x.ToFloat32(), vtx2.pos.y.ToFloat32(), vtx2.pos.z.ToFloat32(), vtx2.pos.w.ToFloat32(),
+                  vtx0.screenpos.x.ToFloat32(), vtx0.screenpos.y.ToFloat32(), vtx0.screenpos.z.ToFloat32(),
+                  vtx1.screenpos.x.ToFloat32(), vtx1.screenpos.y.ToFloat32(), vtx1.screenpos.z.ToFloat32(),
+                  vtx2.screenpos.x.ToFloat32(), vtx2.screenpos.y.ToFloat32(), vtx2.screenpos.z.ToFloat32());
+
+        Rasterizer::ProcessTriangle(vtx0, vtx1, vtx2);
+    }
+}
+
+
+} // namespace
+
+} // namespace
diff --git a/src/video_core/clipper.h b/src/video_core/clipper.h
new file mode 100644
index 000000000..14d31ca1e
--- /dev/null
+++ b/src/video_core/clipper.h
@@ -0,0 +1,21 @@
+// Copyright 2014 Citra Emulator Project
+// Licensed under GPLv2
+// Refer to the license.txt file included.
+
+#pragma once
+
+namespace Pica {
+
+namespace VertexShader {
+    struct OutputVertex;
+}
+
+namespace Clipper {
+
+using VertexShader::OutputVertex;
+
+void ProcessTriangle(OutputVertex& v0, OutputVertex& v1, OutputVertex& v2);
+
+} // namespace
+
+} // namespace
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
new file mode 100644
index 000000000..020a4da3f
--- /dev/null
+++ b/src/video_core/command_processor.cpp
@@ -0,0 +1,238 @@
+// Copyright 2014 Citra Emulator Project
+// Licensed under GPLv2
+// Refer to the license.txt file included.
+
+#include "command_processor.h"
+#include "math.h"
+#include "pica.h"
+#include "primitive_assembly.h"
+#include "vertex_shader.h"
+
+
+namespace Pica {
+
+Regs registers;
+
+namespace CommandProcessor {
+
+static int float_regs_counter = 0; // number of words currently buffered in uniform_write_buffer
+
+static u32 uniform_write_buffer[4]; // raw words of the uniform vector being assembled (3 or 4 words, see WritePicaReg)
+
+// Used for VSLoadProgramData and VSLoadSwizzleData
+static u32 vs_binary_write_offset = 0; // next shader program word index; reset by a write to vs_program.begin_load
+static u32 vs_swizzle_write_offset = 0; // next swizzle pattern word index; reset by a write to vs_swizzle_patterns.begin_load
+
+static inline void WritePicaReg(u32 id, u32 value) {
+    // Stores value in register id, then runs the side effect (draw, uniform or shader upload) tied to that register
+    registers[id] = value;
+
+    switch(id) {
+        // It seems like these trigger vertex rendering
+        case PICA_REG_INDEX(trigger_draw):
+        case PICA_REG_INDEX(trigger_draw_indexed):
+        {
+            const auto& attribute_config = registers.vertex_attributes;
+            const u8* const base_address = Memory::GetPointer(attribute_config.GetBaseAddress());
+
+            // Information about internal vertex attributes (zeroed: attributes no loader fills have 0 elements)
+            const u8* vertex_attribute_sources[16] = {};
+            u32 vertex_attribute_strides[16] = {};
+            u32 vertex_attribute_formats[16] = {};
+            u32 vertex_attribute_elements[16] = {};
+            u32 vertex_attribute_element_size[16] = {};
+
+            // Setup attribute data from loaders
+            for (int loader = 0; loader < 12; ++loader) {
+                const auto& loader_config = attribute_config.attribute_loaders[loader];
+
+                const u8* load_address = base_address + loader_config.data_offset;
+
+                // TODO: What happens if a loader overwrites a previous one's data?
+                for (int component = 0; component < loader_config.component_count; ++component) {
+                    u32 attribute_index = loader_config.GetComponent(component);
+                    vertex_attribute_sources[attribute_index] = load_address;
+                    vertex_attribute_strides[attribute_index] = loader_config.byte_count;
+                    vertex_attribute_formats[attribute_index] = (u32)attribute_config.GetFormat(attribute_index);
+                    vertex_attribute_elements[attribute_index] = attribute_config.GetNumElements(attribute_index);
+                    vertex_attribute_element_size[attribute_index] = attribute_config.GetElementSizeInBytes(attribute_index);
+                    load_address += attribute_config.GetStride(attribute_index);
+                }
+            }
+
+            // Load vertices
+            bool is_indexed = (id == PICA_REG_INDEX(trigger_draw_indexed));
+
+            const auto& index_info = registers.index_array;
+            const u8* index_address_8 = (u8*)base_address + index_info.offset;
+            const u16* index_address_16 = (u16*)index_address_8;
+            bool index_u16 = (bool)index_info.format;
+
+            for (int index = 0; index < registers.num_vertices; ++index)
+            {
+                int vertex = is_indexed ? (index_u16 ? index_address_16[index] : index_address_8[index]) : index;
+
+                if (is_indexed) {
+                    // TODO: Implement some sort of vertex cache!
+                }
+
+                // Initialize data for the current vertex
+                VertexShader::InputVertex input;
+
+                for (int i = 0; i < attribute_config.GetNumTotalAttributes(); ++i) {
+                    for (int comp = 0; comp < vertex_attribute_elements[i]; ++comp) {
+                        const u8* srcdata = vertex_attribute_sources[i] + vertex_attribute_strides[i] * vertex + comp * vertex_attribute_element_size[i];
+                        const float srcval = (vertex_attribute_formats[i] == 0) ? *(s8*)srcdata :
+                                             (vertex_attribute_formats[i] == 1) ? *(u8*)srcdata :
+                                             (vertex_attribute_formats[i] == 2) ? *(s16*)srcdata :
+                                                                                  *(float*)srcdata;
+                        input.attr[i][comp] = float24::FromFloat32(srcval);
+                        DEBUG_LOG(GPU, "Loaded component %x of attribute %x for vertex %x (index %x) from 0x%08x + 0x%08x + 0x%04x: %f",
+                                  comp, i, vertex, index,
+                                  attribute_config.GetBaseAddress(),
+                                  (u32)(vertex_attribute_sources[i] - base_address), // pointer differences narrowed to match %x
+                                  (u32)(srcdata - vertex_attribute_sources[i]),
+                                  input.attr[i][comp].ToFloat32());
+                    }
+                }
+                VertexShader::OutputVertex output = VertexShader::RunShader(input, attribute_config.GetNumTotalAttributes());
+
+                if (is_indexed) {
+                    // TODO: Add processed vertex to vertex cache!
+                }
+
+                PrimitiveAssembly::SubmitVertex(output);
+            }
+            break;
+        }
+
+        case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[0], 0x2c1):
+        case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[1], 0x2c2):
+        case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[2], 0x2c3):
+        case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[3], 0x2c4):
+        case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[4], 0x2c5):
+        case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[5], 0x2c6):
+        case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[6], 0x2c7):
+        case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[7], 0x2c8):
+        {
+            auto& uniform_setup = registers.vs_uniform_setup;
+
+            // TODO: Does actual hardware indeed keep an intermediate buffer or does
+            //       it directly write the values?
+            uniform_write_buffer[float_regs_counter++] = value;
+
+            // Uniforms are written in a packed format such that 4 float24 values are encoded in
+            // three 32-bit numbers. We write to internal memory once a full such vector is
+            // written.
+            if ((float_regs_counter >= 4 && uniform_setup.IsFloat32()) ||
+                (float_regs_counter >= 3 && !uniform_setup.IsFloat32())) {
+                float_regs_counter = 0;
+
+                if (uniform_setup.index > 95) { // validate before looking the uniform up
+                    ERROR_LOG(GPU, "Invalid VS uniform index %d", (int)uniform_setup.index);
+                    break;
+                }
+
+                auto& uniform = VertexShader::GetFloatUniform(uniform_setup.index);
+
+                // NOTE: The destination component order indeed is "backwards"
+                if (uniform_setup.IsFloat32()) {
+                    for (auto i : {0,1,2,3})
+                        uniform[3 - i] = float24::FromFloat32(*(float*)(&uniform_write_buffer[i]));
+                } else {
+                    // TODO: Untested
+                    uniform.w = float24::FromRawFloat24(uniform_write_buffer[0] >> 8);
+                    uniform.z = float24::FromRawFloat24(((uniform_write_buffer[0] & 0xFF)<<16) | ((uniform_write_buffer[1] >> 16) & 0xFFFF));
+                    uniform.y = float24::FromRawFloat24(((uniform_write_buffer[1] & 0xFFFF)<<8) | ((uniform_write_buffer[2] >> 24) & 0xFF));
+                    uniform.x = float24::FromRawFloat24(uniform_write_buffer[2] & 0xFFFFFF);
+                }
+
+                DEBUG_LOG(GPU, "Set uniform %x to (%f %f %f %f)", (int)uniform_setup.index,
+                          uniform.x.ToFloat32(), uniform.y.ToFloat32(), uniform.z.ToFloat32(),
+                          uniform.w.ToFloat32());
+
+                // TODO: Verify that this actually modifies the register!
+                uniform_setup.index = uniform_setup.index + 1;
+            }
+            break;
+        }
+
+        // Seems to be used to reset the write pointer for VSLoadProgramData
+        case PICA_REG_INDEX(vs_program.begin_load):
+            vs_binary_write_offset = 0;
+            break;
+
+        // Load shader program code
+        case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[0], 0x2cc):
+        case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[1], 0x2cd):
+        case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[2], 0x2ce):
+        case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[3], 0x2cf):
+        case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[4], 0x2d0):
+        case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[5], 0x2d1):
+        case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[6], 0x2d2):
+        case PICA_REG_INDEX_WORKAROUND(vs_program.set_word[7], 0x2d3):
+        {
+            VertexShader::SubmitShaderMemoryChange(vs_binary_write_offset, value);
+            vs_binary_write_offset++;
+            break;
+        }
+
+        // Seems to be used to reset the write pointer for VSLoadSwizzleData
+        case PICA_REG_INDEX(vs_swizzle_patterns.begin_load):
+            vs_swizzle_write_offset = 0;
+            break;
+
+        // Load swizzle pattern data
+        case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[0], 0x2d6):
+        case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[1], 0x2d7):
+        case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[2], 0x2d8):
+        case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[3], 0x2d9):
+        case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[4], 0x2da):
+        case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[5], 0x2db):
+        case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[6], 0x2dc):
+        case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[7], 0x2dd):
+        {
+            VertexShader::SubmitSwizzleDataChange(vs_swizzle_write_offset, value);
+            vs_swizzle_write_offset++;
+            break;
+        }
+
+        default:
+            break;
+    }
+}
+
+static std::ptrdiff_t ExecuteCommandBlock(const u32* first_command_word) {
+    // Word 0 holds the first parameter, word 1 the header describing the whole block
+    const u32* cursor = first_command_word;
+    const CommandHeader& header = *(const CommandHeader*)(cursor + 1);
+
+    // TODO: Take parameter mask into consideration!
+
+    WritePicaReg(header.cmd_id, cursor[0]);
+    cursor += 2;
+
+    // Extra parameters: consecutive registers if group_commands is set, else all to cmd_id
+    for (u32 extra = 1; extra <= header.extra_data_length; ++extra) {
+        const u32 target_id = header.cmd_id + (header.group_commands ? extra : 0u);
+        WritePicaReg(target_id, *cursor++);
+    }
+
+    // The returned word count is always even (8-byte alignment): add the padding word if needed
+    const std::ptrdiff_t words_read = cursor - first_command_word;
+    if (words_read % 2 != 0)
+        return words_read + 1;
+    return words_read;
+}
+
+void ProcessCommandList(const u32* list, u32 size) { // size is applied as a count of u32 words
+    const u32* const list_end = list + size;
+
+    for (const u32* cursor = list; cursor < list_end;) {
+        cursor += ExecuteCommandBlock(cursor); // each block reports how many words it consumed
+    }
+}
+
+} // namespace
+
+} // namespace
diff --git a/src/video_core/command_processor.h b/src/video_core/command_processor.h
new file mode 100644
index 000000000..6b6241a25
--- /dev/null
+++ b/src/video_core/command_processor.h
@@ -0,0 +1,31 @@
+// Copyright 2014 Citra Emulator Project
+// Licensed under GPLv2
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include "common/bit_field.h"
+#include "common/common_types.h"
+
+#include "pica.h"
+
+namespace Pica {
+
+namespace CommandProcessor {
+
+union CommandHeader {
+    u32 hex; // raw header word; the bit fields below overlay it
+
+    BitField< 0, 16, u32> cmd_id; // id of the (first) register the command writes to
+    BitField<16,  4, u32> parameter_mask; // NOTE(review): not interpreted by command_processor.cpp yet (see the TODO there)
+    BitField<20, 11, u32> extra_data_length; // number of parameter words following the first one
+    BitField<31,  1, u32> group_commands; // if set, extra parameters go to consecutive registers instead of all to cmd_id
+};
+static_assert(std::is_standard_layout<CommandHeader>::value == true, "CommandHeader does not use standard layout");
+static_assert(sizeof(CommandHeader) == sizeof(u32), "CommandHeader has incorrect size!");
+
+void ProcessCommandList(const u32* list, u32 size);
+
+} // namespace
+
+} // namespace
diff --git a/src/video_core/gpu_debugger.h b/src/video_core/gpu_debugger.h
index 5d85f90b9..2ba873457 100644
--- a/src/video_core/gpu_debugger.h
+++ b/src/video_core/gpu_debugger.h
@@ -11,6 +11,8 @@
 #include "common/log.h"
 
 #include "core/hle/service/gsp.h"
+
+#include "command_processor.h"
 #include "pica.h"
 
 class GraphicsDebugger
@@ -20,10 +22,10 @@ public:
     // A vector of commands represented by their raw byte sequence
     struct PicaCommand : public std::vector<u32>
     {
-        const Pica::CommandHeader& GetHeader() const
+        const Pica::CommandProcessor::CommandHeader& GetHeader() const
         {
             const u32& val = at(1);
-            return *(Pica::CommandHeader*)&val;
+            return *(Pica::CommandProcessor::CommandHeader*)&val;
         }
     };
 
@@ -99,7 +101,7 @@ public:
         PicaCommandList cmdlist;
         for (u32* parse_pointer = command_list; parse_pointer < command_list + size_in_words;)
         {
-            const Pica::CommandHeader header = static_cast<Pica::CommandHeader>(parse_pointer[1]);
+            const Pica::CommandProcessor::CommandHeader& header = *(Pica::CommandProcessor::CommandHeader*)(&parse_pointer[1]);
 
             cmdlist.push_back(PicaCommand());
             auto& cmd = cmdlist.back();
diff --git a/src/video_core/math.h b/src/video_core/math.h
new file mode 100644
index 000000000..7030f2cfb
--- /dev/null
+++ b/src/video_core/math.h
@@ -0,0 +1,578 @@
+// Licensed under GPLv2
+// Refer to the license.txt file included.
+
+
+// Copyright 2014 Tony Wasserka
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+//     * Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above copyright
+//       notice, this list of conditions and the following disclaimer in the
+//       documentation and/or other materials provided with the distribution.
+//     * Neither the name of the owner nor the names of its contributors may
+//       be used to endorse or promote products derived from this software
+//       without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#pragma once
+
+#include <cmath>
+
+namespace Math {
+
+template<typename T> class Vec2;
+template<typename T> class Vec3;
+template<typename T> class Vec4;
+
+
+template<typename T>
+class Vec2 {
+public:
+    struct { // anonymous struct: the components are accessed directly as x and y
+        T x,y;
+    };
+
+    T* AsArray() { return &x; } // pointer to x; like operator[] below, treats y as directly following x
+
+    Vec2() = default;
+    Vec2(const T a[2]) : x(a[0]), y(a[1]) {}
+    Vec2(const T& _x, const T& _y) : x(_x), y(_y) {}
+
+    template<typename T2>
+    Vec2<T2> Cast() const { // component-wise C-style conversion to T2
+        return Vec2<T2>((T2)x, (T2)y);
+    }
+
+    static Vec2 AssignToAll(const T& f) // vector with both components set to f
+    {
+        return Vec2<T>(f, f);
+    }
+
+    void Write(T a[2]) // stores x, y into a[0], a[1]
+    {
+        a[0] = x; a[1] = y;
+    }
+
+    Vec2 operator +(const Vec2& other) const
+    {
+        return Vec2(x+other.x, y+other.y);
+    }
+    void operator += (const Vec2 &other)
+    {
+        x+=other.x; y+=other.y;
+    }
+    Vec2 operator -(const Vec2& other) const
+    {
+        return Vec2(x-other.x, y-other.y);
+    }
+    void operator -= (const Vec2& other)
+    {
+        x-=other.x; y-=other.y;
+    }
+    Vec2 operator -() const
+    {
+        return Vec2(-x,-y);
+    }
+    Vec2 operator * (const Vec2& other) const // component-wise product, not a dot product
+    {
+        return Vec2(x*other.x, y*other.y);
+    }
+    template<typename V>
+    Vec2 operator * (const V& f) const // scales both components by f
+    {
+        return Vec2(x*f,y*f);
+    }
+    template<typename V>
+    void operator *= (const V& f)
+    {
+        x*=f; y*=f;
+    }
+    template<typename V>
+    Vec2 operator / (const V& f) const // divides both components by f
+    {
+        return Vec2(x/f,y/f);
+    }
+    template<typename V>
+    void operator /= (const V& f)
+    {
+        *this = *this / f;
+    }
+
+    T Length2() const // squared length
+    {
+        return x*x + y*y;
+    }
+
+    // Only implemented for T=float (declared here; the definitions are not part of this header)
+    float Length() const;
+    void SetLength(const float l);
+    Vec2 WithLength(const float l) const;
+    float Distance2To(Vec2 &other);
+    Vec2 Normalized() const;
+    float Normalize(); // returns the previous length, which is often useful
+
+    T& operator [] (int i) //allow vector[1] = 3   (vector.y=3); i is not range-checked
+    {
+        return *((&x) + i);
+    }
+    T operator [] (const int i) const // read-only variant, returns a copy
+    {
+        return *((&x) + i);
+    }
+
+    void SetZero()
+    {
+        x=0; y=0;
+    }
+
+    // Common aliases: UV (texel coordinates), ST (texture coordinates)
+    T& u() { return x; }
+    T& v() { return y; }
+    T& s() { return x; }
+    T& t() { return y; }
+
+    const T& u() const { return x; }
+    const T& v() const { return y; }
+    const T& s() const { return x; }
+    const T& t() const { return y; }
+
+    // swizzlers - create a subvector of specific components (here: the two components swapped)
+    Vec2 yx() const { return Vec2(y, x); }
+    Vec2 vu() const { return Vec2(y, x); }
+    Vec2 ts() const { return Vec2(y, x); }
+
+    // Inserters to add new elements to effectively create larger vectors containing this Vec2
+    Vec3<T> InsertBeforeX(const T& value) {
+        return Vec3<T>(value, x, y);
+    }
+    Vec3<T> InsertBeforeY(const T& value) {
+        return Vec3<T>(x, value, y);
+    }
+    Vec3<T> Append(const T& value) {
+        return Vec3<T>(x, y, value);
+    }
+};
+
+template<typename T, typename V>
+Vec2<T> operator * (const V& f, const Vec2<T>& vec) // scalar on the left: multiplies as f*component
+{
+    return Vec2<T>(f*vec.x,f*vec.y);
+}
+
+typedef Vec2<float> Vec2f; // shorthand for the float instantiation
+
+template<typename T>
+class Vec3
+{
+public:
+    struct // anonymous struct: the components are accessed directly as x, y and z
+    {
+        T x,y,z;
+    };
+
+    T* AsArray() { return &x; } // pointer to x; like operator[] below, treats y and z as directly following x
+
+    Vec3() = default;
+    Vec3(const T a[3]) : x(a[0]), y(a[1]), z(a[2]) {}
+    Vec3(const T& _x, const T& _y, const T& _z) : x(_x), y(_y), z(_z) {}
+
+    template<typename T2>
+    Vec3<T2> Cast() const { // component-wise C-style conversion to T2
+        return Vec3<T2>((T2)x, (T2)y, (T2)z);
+    }
+
+    // Only implemented for T=int and T=float (declared here; the definitions are not part of this header)
+    static Vec3 FromRGB(unsigned int rgb);
+    unsigned int ToRGB() const; // alpha bits set to zero
+
+    static Vec3 AssignToAll(const T& f) // vector with all three components set to f
+    {
+        return Vec3<T>(f, f, f);
+    }
+
+    void Write(T a[3]) // stores x, y, z into a[0..2]
+    {
+        a[0] = x; a[1] = y; a[2] = z;
+    }
+
+    Vec3 operator +(const Vec3 &other) const
+    {
+        return Vec3(x+other.x, y+other.y, z+other.z);
+    }
+    void operator += (const Vec3 &other)
+    {
+        x+=other.x; y+=other.y; z+=other.z;
+    }
+    Vec3 operator -(const Vec3 &other) const
+    {
+        return Vec3(x-other.x, y-other.y, z-other.z);
+    }
+    void operator -= (const Vec3 &other)
+    {
+        x-=other.x; y-=other.y; z-=other.z;
+    }
+    Vec3 operator -() const
+    {
+        return Vec3(-x,-y,-z);
+    }
+    Vec3 operator * (const Vec3 &other) const // component-wise product, not a dot or cross product
+    {
+        return Vec3(x*other.x, y*other.y, z*other.z);
+    }
+    template<typename V>
+    Vec3 operator * (const V& f) const // scales all components by f
+    {
+        return Vec3(x*f,y*f,z*f);
+    }
+    template<typename V>
+    void operator *= (const V& f)
+    {
+        x*=f; y*=f; z*=f;
+    }
+    template<typename V>
+    Vec3 operator / (const V& f) const // divides all components by f
+    {
+        return Vec3(x/f,y/f,z/f);
+    }
+    template<typename V>
+    void operator /= (const V& f)
+    {
+        *this = *this / f;
+    }
+
+    T Length2() const // squared length
+    {
+        return x*x + y*y + z*z;
+    }
+
+    // Only implemented for T=float (declared here; the definitions are not part of this header)
+    float Length() const;
+    void SetLength(const float l);
+    Vec3 WithLength(const float l) const;
+    float Distance2To(Vec3 &other);
+    Vec3 Normalized() const;
+    float Normalize(); // returns the previous length, which is often useful
+
+    T& operator [] (int i) //allow vector[2] = 3   (vector.z=3); i is not range-checked
+    {
+        return *((&x) + i);
+    }
+    T operator [] (const int i) const // read-only variant, returns a copy
+    {
+        return *((&x) + i);
+    }
+
+    void SetZero()
+    {
+        x=0; y=0; z=0;
+    }
+
+    // Common aliases: UVW (texel coordinates), RGB (colors), STQ (texture coordinates)
+    T& u() { return x; }
+    T& v() { return y; }
+    T& w() { return z; }
+
+    T& r() { return x; }
+    T& g() { return y; }
+    T& b() { return z; }
+
+    T& s() { return x; }
+    T& t() { return y; }
+    T& q() { return z; }
+
+    const T& u() const { return x; }
+    const T& v() const { return y; }
+    const T& w() const { return z; }
+
+    const T& r() const { return x; }
+    const T& g() const { return y; }
+    const T& b() const { return z; }
+
+    const T& s() const { return x; }
+    const T& t() const { return y; }
+    const T& q() const { return z; }
+
+    // swizzlers - create a subvector of specific components
+    // e.g. Vec2 uv() { return Vec2(x,y); }
+    // _DEFINE_SWIZZLER2 defines a single such function, DEFINE_SWIZZLER2 defines all of them for all component names (x<->r) and permutations (xy<->yx)
+#define _DEFINE_SWIZZLER2(a, b, name) Vec2<T> name() const { return Vec2<T>(a, b); }
+#define DEFINE_SWIZZLER2(a, b, a2, b2, a3, b3, a4, b4) \
+    _DEFINE_SWIZZLER2(a, b, a##b); \
+    _DEFINE_SWIZZLER2(a, b, a2##b2); \
+    _DEFINE_SWIZZLER2(a, b, a3##b3); \
+    _DEFINE_SWIZZLER2(a, b, a4##b4); \
+    _DEFINE_SWIZZLER2(b, a, b##a); \
+    _DEFINE_SWIZZLER2(b, a, b2##a2); \
+    _DEFINE_SWIZZLER2(b, a, b3##a3); \
+    _DEFINE_SWIZZLER2(b, a, b4##a4);
+
+    DEFINE_SWIZZLER2(x, y, r, g, u, v, s, t); // xy, rg, uv, st and yx, gr, vu, ts
+    DEFINE_SWIZZLER2(x, z, r, b, u, w, s, q); // xz, rb, uw, sq and zx, br, wu, qs
+    DEFINE_SWIZZLER2(y, z, g, b, v, w, t, q); // yz, gb, vw, tq and zy, bg, wv, qt
+#undef DEFINE_SWIZZLER2
+#undef _DEFINE_SWIZZLER2
+
+    // Inserters to add new elements to effectively create larger vectors containing this Vec3
+    Vec4<T> InsertBeforeX(const T& value) {
+        return Vec4<T>(value, x, y, z);
+    }
+    Vec4<T> InsertBeforeY(const T& value) {
+        return Vec4<T>(x, value, y, z);
+    }
+    Vec4<T> InsertBeforeZ(const T& value) {
+        return Vec4<T>(x, y, value, z);
+    }
+    Vec4<T> Append(const T& value) {
+        return Vec4<T>(x, y, z, value);
+    }
+};
+
+template<typename T, typename V>
+Vec3<T> operator * (const V& f, const Vec3<T>& vec) // scalar on the left: multiplies as f*component
+{
+    return Vec3<T>(f*vec.x,f*vec.y,f*vec.z);
+}
+
+typedef Vec3<float> Vec3f; // shorthand for the float instantiation
+
+template<typename T>
+class Vec4
+{
+public:
+    struct // anonymous struct: the components are accessed directly as x, y, z and w
+    {
+        T x,y,z,w;
+    };
+
+    T* AsArray() { return &x; } // pointer to x; like operator[] below, treats y, z, w as directly following x
+
+    Vec4() = default;
+    Vec4(const T a[4]) : x(a[0]), y(a[1]), z(a[2]), w(a[3]) {}
+    Vec4(const T& _x, const T& _y, const T& _z, const T& _w) : x(_x), y(_y), z(_z), w(_w) {}
+
+    template<typename T2>
+    Vec4<T2> Cast() const { // component-wise C-style conversion to T2
+        return Vec4<T2>((T2)x, (T2)y, (T2)z, (T2)w);
+    }
+
+    // Only implemented for T=int and T=float (declared here; the definitions are not part of this header)
+    static Vec4 FromRGBA(unsigned int rgba);
+    unsigned int ToRGBA() const;
+
+    static Vec4 AssignToAll(const T& f) { // vector with all four components set to f
+        return Vec4<T>(f, f, f, f);
+    }
+
+    void Write(T a[4]) // stores x, y, z, w into a[0..3]
+    {
+        a[0] = x; a[1] = y; a[2] = z; a[3] = w;
+    }
+
+    Vec4 operator +(const Vec4& other) const
+    {
+        return Vec4(x+other.x, y+other.y, z+other.z, w+other.w);
+    }
+    void operator += (const Vec4& other)
+    {
+        x+=other.x; y+=other.y; z+=other.z; w+=other.w;
+    }
+    Vec4 operator -(const Vec4 &other) const
+    {
+        return Vec4(x-other.x, y-other.y, z-other.z, w-other.w);
+    }
+    void operator -= (const Vec4 &other)
+    {
+        x-=other.x; y-=other.y; z-=other.z; w-=other.w;
+    }
+    Vec4 operator -() const
+    {
+        return Vec4(-x,-y,-z,-w);
+    }
+    Vec4 operator * (const Vec4 &other) const // component-wise product, not a dot product
+    {
+        return Vec4(x*other.x, y*other.y, z*other.z, w*other.w);
+    }
+    template<typename V>
+    Vec4 operator * (const V& f) const // scales all components by f
+    {
+        return Vec4(x*f,y*f,z*f,w*f);
+    }
+    template<typename V>
+    void operator *= (const V& f)
+    {
+        x*=f; y*=f; z*=f; w*=f;
+    }
+    template<typename V>
+    Vec4 operator / (const V& f) const // divides all components by f
+    {
+        return Vec4(x/f,y/f,z/f,w/f);
+    }
+    template<typename V>
+    void operator /= (const V& f)
+    {
+        *this = *this / f;
+    }
+
+    T Length2() const // squared length
+    {
+        return x*x + y*y + z*z + w*w;
+    }
+
+    // Only implemented for T=float (declared here; the definitions are not part of this header)
+    float Length() const;
+    void SetLength(const float l);
+    Vec4 WithLength(const float l) const;
+    float Distance2To(Vec4 &other);
+    Vec4 Normalized() const;
+    float Normalize(); // returns the previous length, which is often useful
+
+    T& operator [] (int i) //allow vector[2] = 3   (vector.z=3); i is not range-checked
+    {
+        return *((&x) + i);
+    }
+    T operator [] (const int i) const // read-only variant, returns a copy
+    {
+        return *((&x) + i);
+    }
+
+    void SetZero() // clears all four components, including w
+    {
+        x=0; y=0; z=0; w=0;
+    }
+
+    // Common alias: RGBA (colors)
+    T& r() { return x; }
+    T& g() { return y; }
+    T& b() { return z; }
+    T& a() { return w; }
+
+    const T& r() const { return x; }
+    const T& g() const { return y; }
+    const T& b() const { return z; }
+    const T& a() const { return w; }
+
+    // swizzlers - create a subvector of specific components
+    // e.g. Vec2 uv() { return Vec2(x,y); }
+    // _DEFINE_SWIZZLER2 defines a single such function, DEFINE_SWIZZLER2 defines all of them for all component names (x<->r) and permutations (xy<->yx)
+#define _DEFINE_SWIZZLER2(a, b, name) Vec2<T> name() const { return Vec2<T>(a, b); }
+#define DEFINE_SWIZZLER2(a, b, a2, b2) \
+    _DEFINE_SWIZZLER2(a, b, a##b); \
+    _DEFINE_SWIZZLER2(a, b, a2##b2); \
+    _DEFINE_SWIZZLER2(b, a, b##a); \
+    _DEFINE_SWIZZLER2(b, a, b2##a2);
+
+    DEFINE_SWIZZLER2(x, y, r, g); // xy, rg, yx, gr
+    DEFINE_SWIZZLER2(x, z, r, b); // xz, rb, zx, br
+    DEFINE_SWIZZLER2(x, w, r, a); // xw, ra, wx, ar
+    DEFINE_SWIZZLER2(y, z, g, b); // yz, gb, zy, bg
+    DEFINE_SWIZZLER2(y, w, g, a); // yw, ga, wy, ag
+    DEFINE_SWIZZLER2(z, w, b, a); // zw, ba, wz, ab
+#undef DEFINE_SWIZZLER2
+#undef _DEFINE_SWIZZLER2
+
+#define _DEFINE_SWIZZLER3(a, b, c, name) Vec3<T> name() const { return Vec3<T>(a, b, c); }
+#define DEFINE_SWIZZLER3(a, b, c, a2, b2, c2) \
+    _DEFINE_SWIZZLER3(a, b, c, a##b##c); \
+    _DEFINE_SWIZZLER3(a, c, b, a##c##b); \
+    _DEFINE_SWIZZLER3(b, a, c, b##a##c); \
+    _DEFINE_SWIZZLER3(b, c, a, b##c##a); \
+    _DEFINE_SWIZZLER3(c, a, b, c##a##b); \
+    _DEFINE_SWIZZLER3(c, b, a, c##b##a); \
+    _DEFINE_SWIZZLER3(a, b, c, a2##b2##c2); \
+    _DEFINE_SWIZZLER3(a, c, b, a2##c2##b2); \
+    _DEFINE_SWIZZLER3(b, a, c, b2##a2##c2); \
+    _DEFINE_SWIZZLER3(b, c, a, b2##c2##a2); \
+    _DEFINE_SWIZZLER3(c, a, b, c2##a2##b2); \
+    _DEFINE_SWIZZLER3(c, b, a, c2##b2##a2);
+
+    DEFINE_SWIZZLER3(x, y, z, r, g, b); // all six orderings of xyz and of rgb
+    DEFINE_SWIZZLER3(x, y, w, r, g, a); // all six orderings of xyw and of rga
+    DEFINE_SWIZZLER3(x, z, w, r, b, a); // all six orderings of xzw and of rba
+    DEFINE_SWIZZLER3(y, z, w, g, b, a); // all six orderings of yzw and of gba
+#undef DEFINE_SWIZZLER3
+#undef _DEFINE_SWIZZLER3
+};
+
+
+template<typename T, typename V>
+Vec4<T> operator * (const V& f, const Vec4<T>& vec) // scalar on the left: multiplies as f*component
+{
+    return Vec4<T>(f*vec.x,f*vec.y,f*vec.z,f*vec.w);
+}
+
+typedef Vec4<float> Vec4f; // shorthand for the float instantiation
+
+
+template<typename T>
+static inline T Dot(const Vec2<T>& a, const Vec2<T>& b) // dot product: sum of the component-wise products
+{
+    return a.x*b.x + a.y*b.y;
+}
+
+template<typename T>
+static inline T Dot(const Vec3<T>& a, const Vec3<T>& b) // dot product of two 3-component vectors
+{
+    return a.x*b.x + a.y*b.y + a.z*b.z;
+}
+
+template<typename T>
+static inline T Dot(const Vec4<T>& a, const Vec4<T>& b) // dot product of two 4-component vectors (w included)
+{
+    return a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w;
+}
+
+template<typename T>
+static inline Vec3<T> Cross(const Vec3<T>& a, const Vec3<T>& b) // cross product a x b
+{
+    return Vec3<T>(a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x);
+}
+
+// Linear interpolation via float weight t: yields begin at t=0.0 and end at t=1.0; t is not clamped
+template<typename X>
+static inline X Lerp(const X& begin, const X& end, const float t)
+{
+    return begin*(1.f-t) + end*t;
+}
+
+// Linear interpolation via integer weight t: yields begin at t=0 and end at t=base; t is not clamped and the division uses X's own operator /
+template<typename X, int base>
+static inline X LerpInt(const X& begin, const X& end, const int t)
+{
+    return (begin*(base-t) + end*t) / base;
+}
+
+// Utility vector factories: T is deduced from the arguments, so callers need not spell out Vec2<T>/Vec3<T>/Vec4<T>
+template<typename T>
+static inline Vec2<T> MakeVec2(const T& x, const T& y)
+{
+    return Vec2<T>{x, y};
+}
+
+template<typename T>
+static inline Vec3<T> MakeVec3(const T& x, const T& y, const T& z)
+{
+    return Vec3<T>{x, y, z};
+}
+
+template<typename T>
+static inline Vec4<T> MakeVec4(const T& x, const T& y, const T& z, const T& w)
+{
+    return Vec4<T>{x, y, z, w};
+}
+
+} // namespace
diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index f0fa3aba9..81af57336 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -4,126 +4,567 @@
 
 #pragma once
 
+#include <cstddef>
 #include <initializer_list>
 #include <map>
 
 #include "common/bit_field.h"
 #include "common/common_types.h"
-#include "common/register_set.h"
+
+#include "core/mem_map.h"
 
 namespace Pica {
 
+// Returns the index (in units of 32-bit words) of the Regs member labeled by field_name
+// TODO: Due to Visual Studio bug 209229, offsetof does not return constant expressions
+//       when used with array elements (e.g. PICA_REG_INDEX(vs_uniform_setup.set_value[1])).
+//       For details cf. https://connect.microsoft.com/VisualStudio/feedback/details/209229/offsetof-does-not-produce-a-constant-expression-for-array-members
+//       Hopefully, this will be fixed sometime in the future.
+//       For lack of better alternatives, we currently hardcode the offsets when constant
+//       expressions are needed via PICA_REG_INDEX_WORKAROUND (on other compilers, the macro
+//       itself makes sure the offsets indeed match the automatically calculated ones).
+#define PICA_REG_INDEX(field_name) (offsetof(Pica::Regs, field_name) / sizeof(u32))
+#if defined(_MSC_VER)
+#define PICA_REG_INDEX_WORKAROUND(field_name, backup_workaround_index) (backup_workaround_index)
+#else
+// NOTE: This emulates a static_assert inside an expression to work around the MSVC limitation
+//       described above. The macro forwards its first argument to PICA_REG_INDEX and then
+//       performs a (no-op) cast to size_t iff the second argument matches the expected field
+//       index (std::enable_if has no ::type otherwise, so the code fails to compile).
+#define PICA_REG_INDEX_WORKAROUND(field_name, backup_workaround_index) \
+    ((typename std::enable_if<backup_workaround_index == PICA_REG_INDEX(field_name), size_t>::type)PICA_REG_INDEX(field_name))
+#endif // _MSC_VER
+
 struct Regs {
-    enum Id : u32 {
-        ViewportSizeX              =  0x41,
-        ViewportInvSizeX           =  0x42,
-        ViewportSizeY              =  0x43,
-        ViewportInvSizeY           =  0x44,
-        ViewportCorner             =  0x68,
-        DepthBufferFormat          = 0x116,
-        ColorBufferFormat          = 0x117,
-        DepthBufferAddress         = 0x11C,
-        ColorBufferAddress         = 0x11D,
-        ColorBufferSize            = 0x11E,
-
-        VertexArrayBaseAddr        = 0x200,
-        VertexDescriptor           = 0x201, // 0x202
-        VertexAttributeOffset      = 0x203, // 0x206,0x209,0x20C,0x20F,0x212,0x215,0x218,0x21B,0x21E,0x221,0x224
-        VertexAttributeInfo0       = 0x204, // 0x207,0x20A,0x20D,0x210,0x213,0x216,0x219,0x21C,0x21F,0x222,0x225
-        VertexAttributeInfo1       = 0x205, // 0x208,0x20B,0x20E,0x211,0x214,0x217,0x21A,0x21D,0x220,0x223,0x226
-
-        NumIds                     = 0x300,
+
+// helper macro to properly align structure members.
+// Calling INSERT_PADDING_WORDS will add a new member variable with a name like "pad121",
+// depending on the current source line to make sure variable names are unique.
+#define INSERT_PADDING_WORDS_HELPER1(x, y) x ## y
+#define INSERT_PADDING_WORDS_HELPER2(x, y) INSERT_PADDING_WORDS_HELPER1(x, y)
+#define INSERT_PADDING_WORDS(num_words) u32 INSERT_PADDING_WORDS_HELPER2(pad, __LINE__)[(num_words)];
+
+    INSERT_PADDING_WORDS(0x41);
+
+    BitField<0, 24, u32> viewport_size_x;
+    INSERT_PADDING_WORDS(0x1);
+    BitField<0, 24, u32> viewport_size_y;
+
+    INSERT_PADDING_WORDS(0x9);
+
+    BitField<0, 24, u32> viewport_depth_range; // float24
+    BitField<0, 24, u32> viewport_depth_far_plane; // float24
+
+    INSERT_PADDING_WORDS(0x1);
+
+    union {
+        // Maps components of output vertex attributes to semantics
+        enum Semantic : u32
+        {
+            POSITION_X   =  0,
+            POSITION_Y   =  1,
+            POSITION_Z   =  2,
+            POSITION_W   =  3,
+
+            COLOR_R      =  8,
+            COLOR_G      =  9,
+            COLOR_B      = 10,
+            COLOR_A      = 11,
+
+            TEXCOORD0_U  = 12,
+            TEXCOORD0_V  = 13,
+            TEXCOORD1_U  = 14,
+            TEXCOORD1_V  = 15,
+            TEXCOORD2_U  = 22,
+            TEXCOORD2_V  = 23,
+
+            INVALID      = 31,
+        };
+
+        BitField< 0, 5, Semantic> map_x;
+        BitField< 8, 5, Semantic> map_y;
+        BitField<16, 5, Semantic> map_z;
+        BitField<24, 5, Semantic> map_w;
+    } vs_output_attributes[7];
+
+    INSERT_PADDING_WORDS(0x11);
+
+    union {
+        BitField< 0, 16, u32> x;
+        BitField<16, 16, u32> y;
+    } viewport_corner;
+
+    INSERT_PADDING_WORDS(0xa7);
+
+    struct {
+        enum ColorFormat : u32 {
+            RGBA8    = 0,
+            RGB8     = 1,
+            RGBA5551 = 2,
+            RGB565   = 3,
+            RGBA4    = 4,
+        };
+
+        INSERT_PADDING_WORDS(0x6);
+
+        u32 depth_format;
+        u32 color_format;
+
+        INSERT_PADDING_WORDS(0x4);
+
+        u32 depth_buffer_address;
+        u32 color_buffer_address;
+
+        union {
+            // Apparently, the framebuffer width is stored as expected,
+            // while the height is stored as the actual height minus one.
+            // Hence, don't access these fields directly but use the accessors
+            // GetWidth() and GetHeight() instead.
+            BitField< 0, 11, u32> width;
+            BitField<12, 10, u32> height;
+        };
+
+        INSERT_PADDING_WORDS(0x1);
+
+        inline u32 GetColorBufferAddress() const {
+            return Memory::PhysicalToVirtualAddress(DecodeAddressRegister(color_buffer_address));
+        }
+        inline u32 GetDepthBufferAddress() const {
+            return Memory::PhysicalToVirtualAddress(DecodeAddressRegister(depth_buffer_address));
+        }
+
+        inline u32 GetWidth() const {
+            return width;
+        }
+
+        inline u32 GetHeight() const {
+            return height + 1;
+        }
+    } framebuffer;
+
+    INSERT_PADDING_WORDS(0xe0);
+
+    struct {
+        enum class Format : u64 {
+            BYTE = 0,
+            UBYTE = 1,
+            SHORT = 2,
+            FLOAT = 3,
+        };
+
+        BitField<0, 29, u32> base_address;
+
+        inline u32 GetBaseAddress() const {
+            // TODO: Ugly, should fix PhysicalToVirtualAddress instead
+            return DecodeAddressRegister(base_address) - Memory::FCRAM_PADDR + Memory::HEAP_GSP_VADDR;
+        }
+
+        // Descriptor for internal vertex attributes
+        union {
+            BitField< 0,  2, Format> format0; // size of one element
+            BitField< 2,  2, u64> size0;      // number of elements minus 1
+            BitField< 4,  2, Format> format1;
+            BitField< 6,  2, u64> size1;
+            BitField< 8,  2, Format> format2;
+            BitField<10,  2, u64> size2;
+            BitField<12,  2, Format> format3;
+            BitField<14,  2, u64> size3;
+            BitField<16,  2, Format> format4;
+            BitField<18,  2, u64> size4;
+            BitField<20,  2, Format> format5;
+            BitField<22,  2, u64> size5;
+            BitField<24,  2, Format> format6;
+            BitField<26,  2, u64> size6;
+            BitField<28,  2, Format> format7;
+            BitField<30,  2, u64> size7;
+            BitField<32,  2, Format> format8;
+            BitField<34,  2, u64> size8;
+            BitField<36,  2, Format> format9;
+            BitField<38,  2, u64> size9;
+            BitField<40,  2, Format> format10;
+            BitField<42,  2, u64> size10;
+            BitField<44,  2, Format> format11;
+            BitField<46,  2, u64> size11;
+
+            BitField<48, 12, u64> attribute_mask;
+
+            // number of total attributes minus 1
+            BitField<60,  4, u64> num_extra_attributes;
+        };
+
+        inline Format GetFormat(int n) const {
+            Format formats[] = {
+                format0, format1, format2, format3,
+                format4, format5, format6, format7,
+                format8, format9, format10, format11
+            };
+            return formats[n];
+        }
+
+        inline int GetNumElements(int n) const {
+            u64 sizes[] = {
+                size0, size1, size2, size3,
+                size4, size5, size6, size7,
+                size8, size9, size10, size11
+            };
+            return (int)sizes[n]+1;
+        }
+
+        inline int GetElementSizeInBytes(int n) const {
+            const Format fmt = GetFormat(n); // decoded once and reused for both comparisons
+            return (fmt == Format::FLOAT) ? 4 : (fmt == Format::SHORT) ? 2 : 1; // everything else is 1 byte
+        }
+
+        inline int GetStride(int n) const {
+            return GetNumElements(n) * GetElementSizeInBytes(n);
+        }
+
+        inline int GetNumTotalAttributes() const {
+            return (int)num_extra_attributes+1;
+        }
+
+        // Attribute loaders map the source vertex data to input attributes.
+        // This allows, for example, loading different attributes from different memory locations
+        struct {
+            // Source attribute data offset from the base address
+            u32 data_offset;
+
+            union {
+                BitField< 0, 4, u64> comp0;
+                BitField< 4, 4, u64> comp1;
+                BitField< 8, 4, u64> comp2;
+                BitField<12, 4, u64> comp3;
+                BitField<16, 4, u64> comp4;
+                BitField<20, 4, u64> comp5;
+                BitField<24, 4, u64> comp6;
+                BitField<28, 4, u64> comp7;
+                BitField<32, 4, u64> comp8;
+                BitField<36, 4, u64> comp9;
+                BitField<40, 4, u64> comp10;
+                BitField<44, 4, u64> comp11;
+
+                // bytes for a single vertex in this loader
+                BitField<48, 8, u64> byte_count;
+
+                BitField<60, 4, u64> component_count;
+            };
+
+            inline int GetComponent(int n) const {
+                u64 components[] = {
+                    comp0, comp1, comp2, comp3,
+                    comp4, comp5, comp6, comp7,
+                    comp8, comp9, comp10, comp11
+                };
+                return (int)components[n];
+            }
+        } attribute_loaders[12];
+    } vertex_attributes;
+
+    struct {
+        enum IndexFormat : u32 {
+            BYTE = 0,
+            SHORT = 1,
+        };
+
+        union {
+            BitField<0, 31, u32> offset; // relative to base attribute address
+            BitField<31, 1, IndexFormat> format;
+        };
+    } index_array;
+
+    // Number of vertices to render
+    u32 num_vertices;
+
+    INSERT_PADDING_WORDS(0x5);
+
+    // These two trigger rendering of triangles
+    u32 trigger_draw;
+    u32 trigger_draw_indexed;
+
+    INSERT_PADDING_WORDS(0x2e);
+
+    enum class TriangleTopology : u32 {
+        List        = 0,
+        Strip       = 1,
+        Fan         = 2,
+        ListIndexed = 3, // TODO: No idea if this is correct
     };
 
-    template<Id id>
-    union Struct;
-};
+    BitField<8, 2, TriangleTopology> triangle_topology;
 
-static inline Regs::Id VertexAttributeOffset(int n)
-{
-    return static_cast<Regs::Id>(0x203 + 3*n);
-}
+    INSERT_PADDING_WORDS(0x5b);
 
-static inline Regs::Id VertexAttributeInfo0(int n)
-{
-    return static_cast<Regs::Id>(0x204 + 3*n);
-}
+    // Offset to shader program entry point (in words)
+    BitField<0, 16, u32> vs_main_offset;
 
-static inline Regs::Id VertexAttributeInfo1(int n)
-{
-    return static_cast<Regs::Id>(0x205 + 3*n);
-}
+    union {
+        BitField< 0, 4, u64> attribute0_register;
+        BitField< 4, 4, u64> attribute1_register;
+        BitField< 8, 4, u64> attribute2_register;
+        BitField<12, 4, u64> attribute3_register;
+        BitField<16, 4, u64> attribute4_register;
+        BitField<20, 4, u64> attribute5_register;
+        BitField<24, 4, u64> attribute6_register;
+        BitField<28, 4, u64> attribute7_register;
+        BitField<32, 4, u64> attribute8_register;
+        BitField<36, 4, u64> attribute9_register;
+        BitField<40, 4, u64> attribute10_register;
+        BitField<44, 4, u64> attribute11_register;
+        BitField<48, 4, u64> attribute12_register;
+        BitField<52, 4, u64> attribute13_register;
+        BitField<56, 4, u64> attribute14_register;
+        BitField<60, 4, u64> attribute15_register;
 
-union CommandHeader {
-    CommandHeader(u32 h) : hex(h) {}
+        int GetRegisterForAttribute(int attribute_index) const { // const: only reads the bit fields
+            u64 fields[] = { // copies of all 16 fields; attribute_index must be in [0, 15] (not checked)
+                attribute0_register,  attribute1_register,  attribute2_register,  attribute3_register,
+                attribute4_register,  attribute5_register,  attribute6_register,  attribute7_register,
+                attribute8_register,  attribute9_register,  attribute10_register, attribute11_register,
+                attribute12_register, attribute13_register, attribute14_register, attribute15_register,
+            };
+            return (int)fields[attribute_index]; // value of attribute<attribute_index>_register
+        }
+    } vs_input_register_map;
 
-    u32 hex;
+    INSERT_PADDING_WORDS(0x3);
 
-    BitField< 0, 16, Regs::Id> cmd_id;
-    BitField<16,  4, u32> parameter_mask;
-    BitField<20, 11, u32> extra_data_length;
-    BitField<31,  1, u32> group_commands;
-};
+    struct {
+        enum Format : u32
+        {
+            FLOAT24 = 0,
+            FLOAT32 = 1
+        };
 
-static std::map<Regs::Id, const char*> command_names = {
-    {Regs::ViewportSizeX, "ViewportSizeX" },
-    {Regs::ViewportInvSizeX, "ViewportInvSizeX" },
-    {Regs::ViewportSizeY, "ViewportSizeY" },
-    {Regs::ViewportInvSizeY, "ViewportInvSizeY" },
-    {Regs::ViewportCorner, "ViewportCorner" },
-    {Regs::DepthBufferFormat, "DepthBufferFormat" },
-    {Regs::ColorBufferFormat, "ColorBufferFormat" },
-    {Regs::DepthBufferAddress, "DepthBufferAddress" },
-    {Regs::ColorBufferAddress, "ColorBufferAddress" },
-    {Regs::ColorBufferSize, "ColorBufferSize" },
-};
+        bool IsFloat32() const {
+            return format == FLOAT32;
+        }
+
+        union {
+            // Index of the next uniform to write to
+            // TODO: ctrulib uses 8 bits for this, however that seems to yield lots of invalid indices
+            BitField<0, 7, u32> index;
+
+            BitField<31, 1, Format> format;
+        };
+
+        // Writing to these registers sets the "current" uniform.
+        // TODO: It's not clear how the hardware stores what the "current" uniform is.
+        u32 set_value[8];
+
+    } vs_uniform_setup;
+
+    INSERT_PADDING_WORDS(0x2);
+
+    struct {
+        u32 begin_load;
+
+        // Writing to these registers sets the "current" word in the shader program.
+        // TODO: It's not clear how the hardware stores what the "current" word is.
+        u32 set_word[8];
+    } vs_program;
+
+    INSERT_PADDING_WORDS(0x1);
 
-template<>
-union Regs::Struct<Regs::ViewportSizeX> {
-    BitField<0, 24, u32> value;
+    // This register group is used to load an internal table of swizzling patterns,
+    // which are indexed by each shader instruction to specify vector component swizzling.
+    struct {
+        u32 begin_load;
+
+        // Writing to these registers sets the "current" swizzle pattern in the table.
+        // TODO: It's not clear how the hardware stores what the "current" swizzle pattern is.
+        u32 set_word[8];
+    } vs_swizzle_patterns;
+
+    INSERT_PADDING_WORDS(0x22);
+
+#undef INSERT_PADDING_WORDS_HELPER1
+#undef INSERT_PADDING_WORDS_HELPER2
+#undef INSERT_PADDING_WORDS
+
+    // Maps a register index to a human-readable name: "field" for the first word of a field and
+    // "field+N" for its N-th following word. The table is rebuilt on every call (debugging only).
+    static std::string GetCommandName(int index) {
+        std::map<u32, std::string> map;
+        Regs regs; // only used inside sizeof() by ADD_FIELD, never read
+
+        // TODO: MSVC does not support using offsetof() on non-static data members even though this
+        //       is technically allowed since C++11. Hence, this functionality is disabled until
+        //       MSVC properly supports it.
+        #ifndef _MSC_VER
+        #define ADD_FIELD(name)                                                                               \
+            do {                                                                                              \
+                map.insert({PICA_REG_INDEX(name), #name});                                                    \
+                for (u32 i = PICA_REG_INDEX(name) + 1; i < PICA_REG_INDEX(name) + sizeof(regs.name) / 4; ++i) \
+                    map.insert({i, #name + std::string("+") + std::to_string(i-PICA_REG_INDEX(name))});       \
+            } while(false)
+
+        ADD_FIELD(viewport_size_x);
+        ADD_FIELD(viewport_size_y);
+        ADD_FIELD(viewport_depth_range);
+        ADD_FIELD(viewport_depth_far_plane);
+        ADD_FIELD(viewport_corner);
+        ADD_FIELD(framebuffer);
+        ADD_FIELD(vertex_attributes);
+        ADD_FIELD(index_array);
+        ADD_FIELD(num_vertices);
+        ADD_FIELD(trigger_draw);
+        ADD_FIELD(trigger_draw_indexed);
+        ADD_FIELD(triangle_topology);
+        ADD_FIELD(vs_main_offset);
+        ADD_FIELD(vs_input_register_map);
+        ADD_FIELD(vs_uniform_setup);
+        ADD_FIELD(vs_program);
+        ADD_FIELD(vs_swizzle_patterns);
+
+        #undef ADD_FIELD
+        #endif // _MSC_VER
+
+        // Return empty string if no match is found (operator[] default-inserts an empty entry)
+        return map[index];
+    }
+
+    static inline int NumIds() {
+        return sizeof(Regs) / sizeof(u32);
+    }
+
+    u32& operator [] (int index) const { // NOTE(review): hands out a mutable reference from a const object
+        u32* content = (u32*)this; // view the register file as a flat array of 32-bit words (cast drops const)
+        return content[index]; // no bounds check
+    }
+
+    u32& operator [] (int index) {
+        u32* content = (u32*)this; // view the register file as a flat array of 32-bit words
+        return content[index]; // no bounds check
+    }
+
+private:
+    /*
+     * Most physical addresses which Pica registers refer to are 8-byte aligned.
+     * This helper turns a raw register value into the address by multiplying it by 8.
+     */
+    static inline u32 DecodeAddressRegister(u32 register_value) {
+        return register_value * 8;
+    }
 };
 
-template<>
-union Regs::Struct<Regs::ViewportSizeY> {
-    BitField<0, 24, u32> value;
+// TODO: MSVC does not support using offsetof() on non-static data members even though this
+//       is technically allowed since C++11. This macro should be enabled once MSVC adds
+//       support for that.
+#ifndef _MSC_VER
+#define ASSERT_REG_POSITION(field_name, position) static_assert(offsetof(Regs, field_name) == position * 4, "Field "#field_name" has invalid position")
+
+ASSERT_REG_POSITION(viewport_size_x, 0x41);
+ASSERT_REG_POSITION(viewport_size_y, 0x43);
+ASSERT_REG_POSITION(viewport_depth_range, 0x4d);
+ASSERT_REG_POSITION(viewport_depth_far_plane, 0x4e);
+ASSERT_REG_POSITION(vs_output_attributes[0], 0x50);
+ASSERT_REG_POSITION(vs_output_attributes[1], 0x51);
+ASSERT_REG_POSITION(viewport_corner, 0x68);
+ASSERT_REG_POSITION(framebuffer, 0x110);
+ASSERT_REG_POSITION(vertex_attributes, 0x200);
+ASSERT_REG_POSITION(index_array, 0x227);
+ASSERT_REG_POSITION(num_vertices, 0x228);
+ASSERT_REG_POSITION(trigger_draw, 0x22e);
+ASSERT_REG_POSITION(trigger_draw_indexed, 0x22f);
+ASSERT_REG_POSITION(triangle_topology, 0x25e);
+ASSERT_REG_POSITION(vs_main_offset, 0x2ba);
+ASSERT_REG_POSITION(vs_input_register_map, 0x2bb);
+ASSERT_REG_POSITION(vs_uniform_setup, 0x2c0);
+ASSERT_REG_POSITION(vs_program, 0x2cb);
+ASSERT_REG_POSITION(vs_swizzle_patterns, 0x2d5);
+
+#undef ASSERT_REG_POSITION
+#endif // !defined(_MSC_VER)
+
+// The total number of registers is chosen arbitrarily, but let's make sure it's not some odd value anyway.
+static_assert(sizeof(Regs) <= 0x300 * sizeof(u32), "Register set structure larger than it should be");
+static_assert(sizeof(Regs) >= 0x300 * sizeof(u32), "Register set structure smaller than it should be");
+
+extern Regs registers; // TODO: Not sure if we want to have one global instance for this
+
+
+struct float24 { // 24-bit float register value, backed by a native float (see the notes next to `value`)
+    static float24 FromFloat32(float val) {
+        float24 ret;
+        ret.value = val;
+        return ret;
+    }
+
+    // Raw layout: bits 0-15 mantissa, bits 16-22 exponent (bias 63), bit 23 sign; higher bits are ignored
+    // TODO: No idea if this works as intended
+    static float24 FromRawFloat24(u32 hex) {
+        float24 ret;
+        if ((hex & 0xFFFFFF) == 0) {
+            ret.value = 0;
+        } else {
+            u32 mantissa = hex & 0xFFFF;
+            u32 exponent = (hex >> 16) & 0x7F;
+            u32 sign = (hex >> 23) & 1; // masked so that bits above the 24-bit value cannot flip the sign
+            ret.value = powf(2.0f, (float)exponent-63.0f) * (1.0f + mantissa * powf(2.0f, -16.f));
+            if (sign)
+                ret.value = -ret.value;
+        }
+        return ret;
+    }
+
+    // Not recommended for anything but logging
+    float ToFloat32() const {
+        return value;
+    }
+
+    float24 operator * (const float24& flt) const {
+        return float24::FromFloat32(ToFloat32() * flt.ToFloat32());
+    }
+
+    float24 operator / (const float24& flt) const {
+        return float24::FromFloat32(ToFloat32() / flt.ToFloat32());
+    }
+
+    float24 operator + (const float24& flt) const {
+        return float24::FromFloat32(ToFloat32() + flt.ToFloat32());
+    }
+
+    float24 operator - (const float24& flt) const {
+        return float24::FromFloat32(ToFloat32() - flt.ToFloat32());
+    }
+
+    float24 operator - () const {
+        return float24::FromFloat32(-ToFloat32());
+    }
+
+    bool operator < (const float24& flt) const {
+        return ToFloat32() < flt.ToFloat32();
+    }
+
+    bool operator > (const float24& flt) const {
+        return ToFloat32() > flt.ToFloat32();
+    }
+
+    bool operator >= (const float24& flt) const {
+        return ToFloat32() >= flt.ToFloat32();
+    }
+
+    bool operator <= (const float24& flt) const {
+        return ToFloat32() <= flt.ToFloat32();
+    }
+
+private:
+    float24() = default; // instances are only created through the From* factories above
+
+    // Stored as a regular float, merely for convenience
+    // TODO: Perform proper arithmetic on this!
+    float value;
 };
 
-template<>
-union Regs::Struct<Regs::VertexDescriptor> {
-    enum class Format : u64 {
-        BYTE = 0,
-        UBYTE = 1,
-        SHORT = 2,
-        FLOAT = 3,
-    };
+union CommandHeader { // all members alias the same storage
+    CommandHeader(u32 h) : hex(h) {} // implicit conversion from the raw header word
+
+    u32 hex; // raw header value; the bit fields below are views into it
 
-    BitField< 0,  2, Format> format0;
-    BitField< 2,  2, u64> size0;      // number of elements minus 1
-    BitField< 4,  2, Format> format1;
-    BitField< 6,  2, u64> size1;
-    BitField< 8,  2, Format> format2;
-    BitField<10,  2, u64> size2;
-    BitField<12,  2, Format> format3;
-    BitField<14,  2, u64> size3;
-    BitField<16,  2, Format> format4;
-    BitField<18,  2, u64> size4;
-    BitField<20,  2, Format> format5;
-    BitField<22,  2, u64> size5;
-    BitField<24,  2, Format> format6;
-    BitField<26,  2, u64> size6;
-    BitField<28,  2, Format> format7;
-    BitField<30,  2, u64> size7;
-    BitField<32,  2, Format> format8;
-    BitField<34,  2, u64> size8;
-    BitField<36,  2, Format> format9;
-    BitField<38,  2, u64> size9;
-    BitField<40,  2, Format> format10;
-    BitField<42,  2, u64> size10;
-    BitField<44,  2, Format> format11;
-    BitField<46,  2, u64> size11;
-
-    BitField<48, 12, u64> attribute_mask;
-    BitField<60,  4, u64> num_attributes; // number of total attributes minus 1
+    BitField< 0, 16, u32> cmd_id; // bits 0-15
+    BitField<16,  4, u32> parameter_mask; // bits 16-19
+    BitField<20, 11, u32> extra_data_length; // bits 20-30
+    BitField<31,  1, u32> group_commands; // bit 31
 };
 
 
diff --git a/src/video_core/primitive_assembly.cpp b/src/video_core/primitive_assembly.cpp
new file mode 100644
index 000000000..2354ffb99
--- /dev/null
+++ b/src/video_core/primitive_assembly.cpp
@@ -0,0 +1,51 @@
+// Copyright 2014 Citra Emulator Project
+// Licensed under GPLv2
+// Refer to the license.txt file included.
+
+#include "clipper.h"
+#include "pica.h"
+#include "primitive_assembly.h"
+#include "vertex_shader.h"
+
+namespace Pica {
+
+namespace PrimitiveAssembly {
+
+static OutputVertex buffer[2];
+static int buffer_index = 0; // TODO: reset this on emulation restart
+
+void SubmitVertex(OutputVertex& vtx)
+{ // Collects vertices in buffer[] and hands a triangle to the clipper once three are available
+    switch (registers.triangle_topology) {
+        case Regs::TriangleTopology::List:
+        case Regs::TriangleTopology::ListIndexed:
+            if (buffer_index < 2) {
+                buffer[buffer_index++] = vtx; // the first two vertices of a triangle are only stored
+            } else {
+                buffer_index = 0; // third vertex: start over for the next triangle
+
+                Clipper::ProcessTriangle(buffer[0], buffer[1], vtx);
+            }
+            break;
+
+        case Regs::TriangleTopology::Fan:
+            if (buffer_index == 2) {
+                buffer_index = 0; // NOTE(review): restarts collection, so the next two vertices overwrite buffer[0]/buffer[1]
+
+                Clipper::ProcessTriangle(buffer[0], buffer[1], vtx);
+
+                buffer[1] = vtx; // NOTE(review): overwritten before it is read because of the reset above - confirm the fan logic
+            } else {
+                buffer[buffer_index++] = vtx;
+            }
+            break;
+
+        default: // any other topology value (including Strip) ends up here
+            ERROR_LOG(GPU, "Unknown triangle mode %x:", (int)registers.triangle_topology.Value());
+            break;
+    }
+}
+
+} // namespace
+
+} // namespace
diff --git a/src/video_core/primitive_assembly.h b/src/video_core/primitive_assembly.h
new file mode 100644
index 000000000..2a2b0c170
--- /dev/null
+++ b/src/video_core/primitive_assembly.h
@@ -0,0 +1,21 @@
+// Copyright 2014 Citra Emulator Project
+// Licensed under GPLv2
+// Refer to the license.txt file included.
+
+#pragma once
+
+namespace Pica {
+
+namespace VertexShader {
+    struct OutputVertex;
+}
+
+namespace PrimitiveAssembly {
+
+using VertexShader::OutputVertex;
+
+void SubmitVertex(OutputVertex& vtx);
+
+} // namespace
+
+} // namespace
diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
new file mode 100644
index 000000000..a7c1bab3e
--- /dev/null
+++ b/src/video_core/rasterizer.cpp
@@ -0,0 +1,180 @@
+// Copyright 2014 Citra Emulator Project
+// Licensed under GPLv2
+// Refer to the license.txt file included.
+
+#include <algorithm>
+
+#include "common/common_types.h"
+
+#include "math.h"
+#include "pica.h"
+#include "rasterizer.h"
+#include "vertex_shader.h"
+
+namespace Pica {
+
+namespace Rasterizer {
+
+static void DrawPixel(int x, int y, const Math::Vec4<u8>& color) { // stores color at pixel (x, y) of the color buffer
+    u32* color_buffer = (u32*)Memory::GetPointer(registers.framebuffer.GetColorBufferAddress());
+    u32 value = (color.a() << 24) | (color.r() << 16) | (color.g() << 8) | color.b(); // packed as 0xAARRGGBB
+    // Assuming RGBA8 format until actual framebuffer format handling is implemented
+    // NOTE(review): the row offset is y * width / 2 - confirm that halving the row pitch is intended
+    *(color_buffer + x + y * registers.framebuffer.GetWidth() / 2) = value;
+}
+
+static u32 GetDepth(int x, int y) {
+    u16* depth_buffer = (u16*)Memory::GetPointer(registers.framebuffer.GetDepthBufferAddress());
+    // Reads the depth value stored for pixel (x, y); the 16-bit value is widened to u32 on return
+    // Assuming 16-bit depth buffer format until actual format handling is implemented
+    return *(depth_buffer + x + y * registers.framebuffer.GetWidth() / 2);
+}
+
+static void SetDepth(int x, int y, u16 value) {
+    u16* depth_buffer = (u16*)Memory::GetPointer(registers.framebuffer.GetDepthBufferAddress());
+    // Stores value as the depth of pixel (x, y), using the same addressing as GetDepth
+    // Assuming 16-bit depth buffer format until actual format handling is implemented
+    *(depth_buffer + x + y * registers.framebuffer.GetWidth() / 2) = value;
+}
+
+void ProcessTriangle(const VertexShader::OutputVertex& v0,
+                     const VertexShader::OutputVertex& v1,
+                     const VertexShader::OutputVertex& v2)
+{ // Rasterizes one triangle: writes depth and interpolated vertex color for every covered pixel
+    // NOTE: Assuming that rasterizer coordinates are 12.4 fixed-point values
+    struct Fix12P4 {
+        Fix12P4() {}
+        Fix12P4(u16 val) : val(val) {}
+
+        static u16 FracMask() { return 0xF; }
+        static u16 IntMask() { return (u16)~0xF; }
+
+        operator u16() const {
+            return val;
+        }
+
+        bool operator < (const Fix12P4& oth) const {
+            return (u16)*this < (u16)oth;
+        }
+
+    private:
+        u16 val;
+    };
+
+    // vertex positions in rasterizer coordinates
+    auto FloatToFix = [](float24 flt) {
+                          return Fix12P4(flt.ToFloat32() * 16.0f);
+                      };
+    auto ScreenToRasterizerCoordinates = [FloatToFix](const Math::Vec3<float24> vec) {
+                                             return Math::Vec3<Fix12P4>{FloatToFix(vec.x), FloatToFix(vec.y), FloatToFix(vec.z)};
+                                         };
+    Math::Vec3<Fix12P4> vtxpos[3]{ ScreenToRasterizerCoordinates(v0.screenpos),
+                                   ScreenToRasterizerCoordinates(v1.screenpos),
+                                   ScreenToRasterizerCoordinates(v2.screenpos) };
+
+    // TODO: Proper scissor rect test!
+    u16 min_x = std::min({vtxpos[0].x, vtxpos[1].x, vtxpos[2].x});
+    u16 min_y = std::min({vtxpos[0].y, vtxpos[1].y, vtxpos[2].y});
+    u16 max_x = std::max({vtxpos[0].x, vtxpos[1].x, vtxpos[2].x});
+    u16 max_y = std::max({vtxpos[0].y, vtxpos[1].y, vtxpos[2].y});
+
+    min_x = min_x & Fix12P4::IntMask();
+    min_y = min_y & Fix12P4::IntMask();
+    max_x = (max_x + Fix12P4::FracMask()) & Fix12P4::IntMask();
+    max_y = (max_y + Fix12P4::FracMask()) & Fix12P4::IntMask();
+
+    // Triangle filling rules: Pixels on the right-sided edge or on flat bottom edges are not
+    // drawn. Pixels on any other triangle border are drawn. This is implemented with three bias
+    // values which are added to the barycentric coordinates w0, w1 and w2, respectively.
+    // NOTE: These are the PSP filling rules. Not sure if the 3DS uses the same ones...
+    auto IsRightSideOrFlatBottomEdge = [](const Math::Vec2<Fix12P4>& vtx,
+                                          const Math::Vec2<Fix12P4>& line1,
+                                          const Math::Vec2<Fix12P4>& line2)
+    {
+        if (line1.y == line2.y) {
+            // just check if vertex is above us => bottom line parallel to x-axis
+            return vtx.y < line1.y;
+        } else {
+            // check if vertex is on our left => right side
+            // TODO: Not sure how likely this is to overflow
+            return (int)vtx.x < (int)line1.x + ((int)line2.x - (int)line1.x) * ((int)vtx.y - (int)line1.y) / ((int)line2.y - (int)line1.y);
+        }
+    };
+    int bias0 = IsRightSideOrFlatBottomEdge(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) ? -1 : 0;
+    int bias1 = IsRightSideOrFlatBottomEdge(vtxpos[1].xy(), vtxpos[2].xy(), vtxpos[0].xy()) ? -1 : 0;
+    int bias2 = IsRightSideOrFlatBottomEdge(vtxpos[2].xy(), vtxpos[0].xy(), vtxpos[1].xy()) ? -1 : 0;
+
+    // TODO: Not sure if looping through x first might be faster
+    for (u16 y = min_y; y < max_y; y += 0x10) {
+        for (u16 x = min_x; x < max_x; x += 0x10) {
+
+            // Calculate the barycentric coordinates w0, w1 and w2
+            auto orient2d = [](const Math::Vec2<Fix12P4>& vtx1,
+                               const Math::Vec2<Fix12P4>& vtx2,
+                               const Math::Vec2<Fix12P4>& vtx3) {
+                const auto vec1 = (vtx2.Cast<int>() - vtx1.Cast<int>()).Append(0);
+                const auto vec2 = (vtx3.Cast<int>() - vtx1.Cast<int>()).Append(0);
+                // TODO: There is a very small chance this will overflow for sizeof(int) == 4
+                return Cross(vec1, vec2).z;
+            };
+
+            int w0 = bias0 + orient2d(vtxpos[1].xy(), vtxpos[2].xy(), {x, y});
+            int w1 = bias1 + orient2d(vtxpos[2].xy(), vtxpos[0].xy(), {x, y});
+            int w2 = bias2 + orient2d(vtxpos[0].xy(), vtxpos[1].xy(), {x, y});
+            int wsum = w0 + w1 + w2;
+
+            // Skip uncovered pixels, and pixels where wsum == 0 (that would divide by zero below)
+            if (w0 < 0 || w1 < 0 || w2 < 0 || wsum == 0)
+                continue;
+
+            // Perspective correct attribute interpolation:
+            // Attribute values cannot be calculated by simple linear interpolation since
+            // they are not linear in screen space. For example, when interpolating a
+            // texture coordinate across two vertices, something simple like
+            //     u = (u0*w0 + u1*w1)/(w0+w1)
+            // will not work. However, the attribute value divided by the
+            // clipspace w-coordinate (u/w) and the inverse w-coordinate (1/w) are linear
+            // in screenspace. Hence, we can linearly interpolate these two independently and
+            // calculate the interpolated attribute by dividing the results.
+            // I.e.
+            //     u_over_w   = ((u0/v0.pos.w)*w0 + (u1/v1.pos.w)*w1)/(w0+w1)
+            //     one_over_w = (( 1/v0.pos.w)*w0 + ( 1/v1.pos.w)*w1)/(w0+w1)
+            //     u = u_over_w / one_over_w
+            //
+            // The generalization to three vertices is straightforward in barycentric coordinates.
+            auto GetInterpolatedAttribute = [&](float24 attr0, float24 attr1, float24 attr2) {
+                auto attr_over_w = Math::MakeVec3(attr0 / v0.pos.w,
+                                                  attr1 / v1.pos.w,
+                                                  attr2 / v2.pos.w);
+                auto w_inverse   = Math::MakeVec3(float24::FromFloat32(1.f) / v0.pos.w,
+                                                  float24::FromFloat32(1.f) / v1.pos.w,
+                                                  float24::FromFloat32(1.f) / v2.pos.w);
+                auto baricentric_coordinates = Math::MakeVec3(float24::FromFloat32(w0),
+                                                              float24::FromFloat32(w1),
+                                                              float24::FromFloat32(w2));
+
+                float24 interpolated_attr_over_w = Math::Dot(attr_over_w, baricentric_coordinates);
+                float24 interpolated_w_inverse   = Math::Dot(w_inverse,   baricentric_coordinates);
+                return interpolated_attr_over_w / interpolated_w_inverse;
+            };
+
+            Math::Vec4<u8> primary_color{
+                (u8)(GetInterpolatedAttribute(v0.color.r(), v1.color.r(), v2.color.r()).ToFloat32() * 255),
+                (u8)(GetInterpolatedAttribute(v0.color.g(), v1.color.g(), v2.color.g()).ToFloat32() * 255),
+                (u8)(GetInterpolatedAttribute(v0.color.b(), v1.color.b(), v2.color.b()).ToFloat32() * 255),
+                (u8)(GetInterpolatedAttribute(v0.color.a(), v1.color.a(), v2.color.a()).ToFloat32() * 255)
+            };
+
+            u16 z = (u16)(((float)v0.screenpos[2].ToFloat32() * w0 +
+                           (float)v1.screenpos[2].ToFloat32() * w1 +
+                           (float)v2.screenpos[2].ToFloat32() * w2) * 65535.f / wsum); // TODO: Shouldn't need to multiply by 65535?
+            SetDepth(x >> 4, y >> 4, z); // x and y are 12.4 fixed point; >> 4 yields the pixel index
+
+            DrawPixel(x >> 4, y >> 4, primary_color);
+        }
+    }
+}
+
+} // namespace Rasterizer
+
+} // namespace Pica
diff --git a/src/video_core/rasterizer.h b/src/video_core/rasterizer.h
new file mode 100644
index 000000000..500be9462
--- /dev/null
+++ b/src/video_core/rasterizer.h
@@ -0,0 +1,21 @@
+// Copyright 2014 Citra Emulator Project
+// Licensed under GPLv2
+// Refer to the license.txt file included.
+
+#pragma once
+
+namespace Pica {
+
+namespace VertexShader {
+    struct OutputVertex; // forward declaration; this header only uses references to it
+}
+
+namespace Rasterizer {
+// Processes one triangle described by three vertex shader output vertices.
+void ProcessTriangle(const VertexShader::OutputVertex& v0,
+                     const VertexShader::OutputVertex& v1,
+                     const VertexShader::OutputVertex& v2);
+
+} // namespace Rasterizer
+
+} // namespace Pica
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index d0a8ec1da..f11a64fad 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -78,23 +78,23 @@ void RendererOpenGL::FlipFramebuffer(const u8* in, u8* out) {
  */
 void RendererOpenGL::RenderXFB(const common::Rect& src_rect, const common::Rect& dst_rect) {
 
-    const auto& framebuffer_top = GPU::g_regs.Get<GPU::Regs::FramebufferTop>();
-    const auto& framebuffer_sub = GPU::g_regs.Get<GPU::Regs::FramebufferBottom>();
+    const auto& framebuffer_top = GPU::g_regs.framebuffer_config[0];
+    const auto& framebuffer_sub = GPU::g_regs.framebuffer_config[1];
     const u32 active_fb_top = (framebuffer_top.active_fb == 1)
-                                ? framebuffer_top.address_left2
-                                : framebuffer_top.address_left1;
+                                ? Memory::PhysicalToVirtualAddress(framebuffer_top.address_left2)
+                                : Memory::PhysicalToVirtualAddress(framebuffer_top.address_left1);
     const u32 active_fb_sub = (framebuffer_sub.active_fb == 1)
-                                ? framebuffer_sub.address_left2
-                                : framebuffer_sub.address_left1;
+                                ? Memory::PhysicalToVirtualAddress(framebuffer_sub.address_left2)
+                                : Memory::PhysicalToVirtualAddress(framebuffer_sub.address_left1);
 
     DEBUG_LOG(GPU, "RenderXFB: 0x%08x bytes from 0x%08x(%dx%d), fmt %x",
               framebuffer_top.stride * framebuffer_top.height,
-              GPU::GetFramebufferAddr(active_fb_top), (int)framebuffer_top.width,
+              active_fb_top, (int)framebuffer_top.width,
               (int)framebuffer_top.height, (int)framebuffer_top.format);
 
     // TODO: This should consider the GPU registers for framebuffer width, height and stride.
-    FlipFramebuffer(GPU::GetFramebufferPointer(active_fb_top), m_xfb_top_flipped);
-    FlipFramebuffer(GPU::GetFramebufferPointer(active_fb_sub), m_xfb_bottom_flipped);
+    FlipFramebuffer(Memory::GetPointer(active_fb_top), m_xfb_top_flipped);
+    FlipFramebuffer(Memory::GetPointer(active_fb_sub), m_xfb_bottom_flipped);
 
     // Blit the top framebuffer
     // ------------------------
diff --git a/src/video_core/vertex_shader.cpp b/src/video_core/vertex_shader.cpp
new file mode 100644
index 000000000..93830a96a
--- /dev/null
+++ b/src/video_core/vertex_shader.cpp
@@ -0,0 +1,270 @@
+// Copyright 2014 Citra Emulator Project
+// Licensed under GPLv2
+// Refer to the license.txt file included.
+
+#include "pica.h"
+#include "vertex_shader.h"
+#include <core/mem_map.h>
+#include <common/file_util.h>
+
+namespace Pica {
+
+namespace VertexShader {
+
+static struct {
+    Math::Vec4<float24> f[96]; // float uniforms; ProcessShaderCode maps src1 0x20..0x7F onto f[0..95]
+} shader_uniforms;
+
+
+// TODO: Not sure where the shader binary and swizzle patterns are supposed to be loaded to!
+// For now, we just keep these local arrays around.
+static u32 shader_memory[1024]; // shader program words, written by SubmitShaderMemoryChange()
+static u32 swizzle_data[1024]; // SwizzlePattern words, written by SubmitSwizzleDataChange()
+
+// Stores one shader program word; writes that would land outside shader_memory are dropped.
+void SubmitShaderMemoryChange(u32 addr, u32 value) {
+    if (addr < sizeof(shader_memory) / sizeof(shader_memory[0])) shader_memory[addr] = value;
+}
+
+// Stores one swizzle pattern word; writes that would land outside swizzle_data are dropped.
+void SubmitSwizzleDataChange(u32 addr, u32 value) {
+    if (addr < sizeof(swizzle_data) / sizeof(swizzle_data[0])) swizzle_data[addr] = value;
+}
+
+// Returns a mutable reference to float uniform `index`; the index is not range-checked.
+Math::Vec4<float24>& GetFloatUniform(u32 index) {
+    return *(shader_uniforms.f + index);
+}
+
+struct VertexShaderState {
+    u32* program_counter; // instruction word currently being executed (points into shader_memory)
+
+    const float24* input_register_table[16]; // source indices 0x00-0x0F; each entry points at 4 components
+    float24* output_register_table[7*4]; // one pointer per output component (7 registers x 4), set up by RunShader
+
+    Math::Vec4<float24> temporary_registers[16]; // source indices 0x10-0x1F
+    bool status_registers[2]; // cleared by RunShader; not read by the instructions implemented so far
+
+    enum {
+        INVALID_ADDRESS = 0xFFFFFFFF // marks a call stack slot that holds no return address
+    };
+    u32 call_stack[8]; // TODO: What is the maximal call stack depth?
+    u32* call_stack_pointer; // current top of call_stack; entries are word offsets into shader_memory
+};
+
+// Interprets shader instructions starting at state.program_counter. Execution ends when a
+// RET is reached while the call stack holds no return address. Results are written through
+// state.output_register_table and state.temporary_registers.
+static void ProcessShaderCode(VertexShaderState& state) {
+    while (true) {
+        bool increment_pc = true;
+        bool exit_loop = false;
+        const Instruction& instr = *(const Instruction*)state.program_counter;
+
+        const float24* src1_ = (instr.common.src1 < 0x10) ? state.input_register_table[instr.common.src1]
+                             : (instr.common.src1 < 0x20) ? &state.temporary_registers[instr.common.src1-0x10].x
+                             : (instr.common.src1 < 0x80) ? &shader_uniforms.f[instr.common.src1-0x20].x
+                             : nullptr;
+        const float24* src2_ = (instr.common.src2 < 0x10) ? state.input_register_table[instr.common.src2]
+                             : &state.temporary_registers[instr.common.src2-0x10].x;
+        // TODO: Unsure about the limit values
+        float24* dest = (instr.common.dest < 0x1C) ? state.output_register_table[instr.common.dest] // table has 7*4 = 0x1C entries
+                             : (instr.common.dest <= 0x3C) ? nullptr
+                             : (instr.common.dest <= 0x7C) ? &state.temporary_registers[(instr.common.dest-0x40)/4][instr.common.dest%4]
+                             : nullptr;
+
+        const SwizzlePattern& swizzle = *(SwizzlePattern*)&swizzle_data[instr.common.operand_desc_id];
+
+        const float24 src1[4] = {
+            src1_[(int)swizzle.GetSelectorSrc1(0)],
+            src1_[(int)swizzle.GetSelectorSrc1(1)],
+            src1_[(int)swizzle.GetSelectorSrc1(2)],
+            src1_[(int)swizzle.GetSelectorSrc1(3)],
+        };
+        const float24 src2[4] = {
+            src2_[(int)swizzle.GetSelectorSrc2(0)],
+            src2_[(int)swizzle.GetSelectorSrc2(1)],
+            src2_[(int)swizzle.GetSelectorSrc2(2)],
+            src2_[(int)swizzle.GetSelectorSrc2(3)],
+        };
+
+        switch (instr.opcode) {
+            case Instruction::OpCode::ADD:
+            {
+                for (int i = 0; i < 4; ++i) {
+                    if (!swizzle.DestComponentEnabled(i))
+                        continue;
+
+                    dest[i] = src1[i] + src2[i];
+                }
+                break;
+            }
+
+            case Instruction::OpCode::MUL:
+            {
+                for (int i = 0; i < 4; ++i) {
+                    if (!swizzle.DestComponentEnabled(i))
+                        continue;
+
+                    dest[i] = src1[i] * src2[i];
+                }
+                break;
+            }
+
+            case Instruction::OpCode::DP3:
+            case Instruction::OpCode::DP4:
+            {
+                float24 dot = float24::FromFloat32(0.f);
+                int num_components = (instr.opcode == Instruction::OpCode::DP3) ? 3 : 4;
+                for (int i = 0; i < num_components; ++i)
+                    dot = dot + src1[i] * src2[i];
+
+                // The scalar result goes to every enabled destination component, for DP3 as well.
+                for (int i = 0; i < 4; ++i) {
+                    if (!swizzle.DestComponentEnabled(i))
+                        continue;
+
+                    dest[i] = dot;
+                }
+                break;
+            }
+
+            // Reciprocal
+            case Instruction::OpCode::RCP:
+            {
+                for (int i = 0; i < 4; ++i) {
+                    if (!swizzle.DestComponentEnabled(i))
+                        continue;
+
+                    // TODO: Be stable against division by zero!
+                    // TODO: I think this might be wrong... we should only use one component here
+                    dest[i] = float24::FromFloat32(1.0 / src1[i].ToFloat32());
+                }
+                break;
+            }
+
+            // Reciprocal Square Root
+            case Instruction::OpCode::RSQ:
+            {
+                for (int i = 0; i < 4; ++i) {
+                    if (!swizzle.DestComponentEnabled(i))
+                        continue;
+
+                    // TODO: Be stable against division by zero!
+                    // TODO: I think this might be wrong... we should only use one component here
+                    dest[i] = float24::FromFloat32(1.0 / sqrt(src1[i].ToFloat32()));
+                }
+                break;
+            }
+
+            case Instruction::OpCode::MOV:
+            {
+                for (int i = 0; i < 4; ++i) {
+                    if (!swizzle.DestComponentEnabled(i))
+                        continue;
+
+                    dest[i] = src1[i];
+                }
+                break;
+            }
+
+            case Instruction::OpCode::RET:
+                if (*state.call_stack_pointer == VertexShaderState::INVALID_ADDRESS) {
+                    exit_loop = true;
+                } else {
+                    // Pop: resume at the CALL and clear only the released slot, not the caller's entry.
+                    state.program_counter = &shader_memory[*state.call_stack_pointer];
+                    *state.call_stack_pointer-- = VertexShaderState::INVALID_ADDRESS;
+                }
+                break;
+
+            case Instruction::OpCode::CALL:
+                increment_pc = false;
+                // The push below pre-increments the stack pointer, so a free slot must remain above it.
+                _dbg_assert_(GPU, state.call_stack_pointer + 1 <
+                                  state.call_stack + sizeof(state.call_stack) / sizeof(state.call_stack[0]));
+                *++state.call_stack_pointer = state.program_counter - shader_memory;
+                // TODO: Does this offset refer to the beginning of shader memory?
+                state.program_counter = &shader_memory[instr.flow_control.offset_words];
+                break;
+
+            case Instruction::OpCode::FLS:
+                // TODO: Do whatever needs to be done here?
+                break;
+
+            default:
+                ERROR_LOG(GPU, "Unhandled instruction: 0x%02x (%s): 0x%08x",
+                          (int)instr.opcode.Value(), instr.GetOpCodeName().c_str(), instr.hex);
+                break;
+        }
+
+        if (increment_pc)
+            ++state.program_counter;
+
+        if (exit_loop)
+            break;
+    }
+}
+
+// Runs the shader program on one input vertex and returns the vertex it produced.
+// Only the first num_attributes entries of input.attr are bound to input registers.
+OutputVertex RunShader(const InputVertex& input, int num_attributes) {
+    VertexShaderState state;
+
+    state.program_counter = &shader_memory[registers.vs_main_offset];
+
+    // Setup input register table
+    const auto& attribute_register_map = registers.vs_input_register_map;
+    // Unbound registers are still read through a 4-component swizzle, so the fallback spans four (zeroed) floats.
+    const float24 zero = float24::FromFloat32(0.f);
+    const float24 dummy_register[4] = { zero, zero, zero, zero };
+    std::fill(&state.input_register_table[0], &state.input_register_table[16], &dummy_register[0]);
+    if(num_attributes > 0) state.input_register_table[attribute_register_map.attribute0_register] = &input.attr[0].x;
+    if(num_attributes > 1) state.input_register_table[attribute_register_map.attribute1_register] = &input.attr[1].x;
+    if(num_attributes > 2) state.input_register_table[attribute_register_map.attribute2_register] = &input.attr[2].x;
+    if(num_attributes > 3) state.input_register_table[attribute_register_map.attribute3_register] = &input.attr[3].x;
+    if(num_attributes > 4) state.input_register_table[attribute_register_map.attribute4_register] = &input.attr[4].x;
+    if(num_attributes > 5) state.input_register_table[attribute_register_map.attribute5_register] = &input.attr[5].x;
+    if(num_attributes > 6) state.input_register_table[attribute_register_map.attribute6_register] = &input.attr[6].x;
+    if(num_attributes > 7) state.input_register_table[attribute_register_map.attribute7_register] = &input.attr[7].x;
+    if(num_attributes > 8) state.input_register_table[attribute_register_map.attribute8_register] = &input.attr[8].x;
+    if(num_attributes > 9) state.input_register_table[attribute_register_map.attribute9_register] = &input.attr[9].x;
+    if(num_attributes > 10) state.input_register_table[attribute_register_map.attribute10_register] = &input.attr[10].x;
+    if(num_attributes > 11) state.input_register_table[attribute_register_map.attribute11_register] = &input.attr[11].x;
+    if(num_attributes > 12) state.input_register_table[attribute_register_map.attribute12_register] = &input.attr[12].x;
+    if(num_attributes > 13) state.input_register_table[attribute_register_map.attribute13_register] = &input.attr[13].x;
+    if(num_attributes > 14) state.input_register_table[attribute_register_map.attribute14_register] = &input.attr[14].x;
+    if(num_attributes > 15) state.input_register_table[attribute_register_map.attribute15_register] = &input.attr[15].x;
+
+    // Setup output register table
+    OutputVertex ret;
+    for (int i = 0; i < 7; ++i) {
+        const auto& output_register_map = registers.vs_output_attributes[i];
+        u32 semantics[4] = {
+            output_register_map.map_x, output_register_map.map_y,
+            output_register_map.map_z, output_register_map.map_w
+        };
+        for (int comp = 0; comp < 4; ++comp)
+            state.output_register_table[4*i+comp] = ((float24*)&ret) + semantics[comp];
+    }
+
+    state.status_registers[0] = false;
+    state.status_registers[1] = false;
+    std::fill(state.call_stack, state.call_stack + sizeof(state.call_stack) / sizeof(state.call_stack[0]),
+              VertexShaderState::INVALID_ADDRESS);
+    state.call_stack_pointer = &state.call_stack[0];
+
+    ProcessShaderCode(state);
+
+    DEBUG_LOG(GPU, "Output vertex: pos (%.2f, %.2f, %.2f, %.2f), col(%.2f, %.2f, %.2f, %.2f), tc0(%.2f, %.2f)",
+        ret.pos.x.ToFloat32(), ret.pos.y.ToFloat32(), ret.pos.z.ToFloat32(), ret.pos.w.ToFloat32(),
+        ret.color.x.ToFloat32(), ret.color.y.ToFloat32(), ret.color.z.ToFloat32(), ret.color.w.ToFloat32(),
+        ret.tc0.u().ToFloat32(), ret.tc0.v().ToFloat32());
+
+    return ret;
+}
+
+
+} // namespace
+
+} // namespace
diff --git a/src/video_core/vertex_shader.h b/src/video_core/vertex_shader.h
new file mode 100644
index 000000000..1b71e367b
--- /dev/null
+++ b/src/video_core/vertex_shader.h
@@ -0,0 +1,211 @@
+// Copyright 2014 Citra Emulator Project
+// Licensed under GPLv2
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <initializer_list>
+
+#include <common/common_types.h>
+
+#include "math.h"
+#include "pica.h"
+
+namespace Pica {
+
+namespace VertexShader {
+
+struct InputVertex {
+    Math::Vec4<float24> attr[16]; // vertex attributes; RunShader binds the first num_attributes entries
+};
+
+struct OutputVertex {
+    OutputVertex() = default;
+
+    // VS output attributes (RunShader writes them through a flat float24 view, so keep the order)
+    Math::Vec4<float24> pos;
+    Math::Vec4<float24> dummy; // quaternions (not implemented, yet)
+    Math::Vec4<float24> color;
+    Math::Vec2<float24> tc0;
+    float24 tc0_v;
+
+    // Padding for optimal alignment
+    float24 pad[14];
+
+    // Attributes used to store intermediate results
+
+    // position after perspective divide
+    Math::Vec3<float24> screenpos;
+
+    // Linear interpolation, in place, of pos, tc0, screenpos and color (other members are left as-is)
+    // factor: 1=this, 0=vtx, i.e. result = this * factor + vtx * (1 - factor)
+    void Lerp(float24 factor, const OutputVertex& vtx) {
+        pos = pos * factor + vtx.pos * (float24::FromFloat32(1) - factor);
+
+        // TODO: Should perform perspective correct interpolation here...
+        tc0 = tc0 * factor + vtx.tc0 * (float24::FromFloat32(1) - factor);
+
+        screenpos = screenpos * factor + vtx.screenpos * (float24::FromFloat32(1) - factor);
+
+        color = color * factor + vtx.color * (float24::FromFloat32(1) - factor);
+    }
+
+    // Linear interpolation returning a new vertex (starts from a copy of v0)
+    // factor: 1=v0, 0=v1, i.e. result = v0 * factor + v1 * (1 - factor)
+    static OutputVertex Lerp(float24 factor, const OutputVertex& v0, const OutputVertex& v1) {
+        OutputVertex ret = v0;
+        ret.Lerp(factor, v1);
+        return ret;
+    }
+};
+static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD");
+
+union Instruction {
+    enum class OpCode : u32 {
+        ADD = 0x0,
+        DP3 = 0x1,
+        DP4 = 0x2,
+
+        MUL = 0x8,
+
+        MAX = 0xC,
+        MIN = 0xD,
+        RCP = 0xE,
+        RSQ = 0xF,
+
+        MOV = 0x13,
+
+        RET = 0x21,
+        FLS = 0x22, // Flush
+        CALL = 0x24,
+    };
+
+    // Returns the mnemonic of this instruction's opcode, or "UNK" for opcodes without an entry.
+    std::string GetOpCodeName() const {
+        // Built once and shared between calls instead of being reconstructed every time.
+        static const std::map<OpCode, std::string> map = {
+            { OpCode::ADD, "ADD" },
+            { OpCode::DP3, "DP3" },
+            { OpCode::DP4, "DP4" },
+            { OpCode::MUL, "MUL" },
+            { OpCode::MAX, "MAX" },
+            { OpCode::MIN, "MIN" },
+            { OpCode::RCP, "RCP" },
+            { OpCode::RSQ, "RSQ" },
+            { OpCode::MOV, "MOV" },
+            { OpCode::RET, "RET" },
+            { OpCode::FLS, "FLS" },
+            { OpCode::CALL, "CALL" },
+        };
+        auto it = map.find(opcode);
+        return (it == map.end()) ? "UNK" : it->second;
+    }
+
+    u32 hex;
+
+    BitField<0x1a, 0x6, OpCode> opcode;
+
+    // General notes:
+    //
+    // When two input registers are used, one of them uses a 5-bit index while the other
+    // one uses a 7-bit index. This is because at most one floating point uniform may be used
+    // as an input.
+
+
+    // Format used e.g. by arithmetic instructions and comparisons
+    // "src1" and "src2" specify register indices (i.e. indices referring to groups of 4 floats),
+    // while "dest" addresses individual floats.
+    union {
+        BitField<0x00, 0x5, u32> operand_desc_id;
+        BitField<0x07, 0x5, u32> src2;
+        BitField<0x0c, 0x7, u32> src1;
+        BitField<0x13, 0x7, u32> dest;
+    } common;
+
+    // Format used for flow control instructions ("if")
+    union {
+        BitField<0x00, 0x8, u32> num_instructions;
+        BitField<0x0a, 0xc, u32> offset_words;
+    } flow_control;
+};
+
+union SwizzlePattern {
+    u32 hex;
+
+    enum class Selector : u32 {
+        x = 0,
+        y = 1,
+        z = 2,
+        w = 3
+    };
+
+    Selector GetSelectorSrc1(int comp) const {
+        Selector selectors[] = {
+            src1_selector_0, src1_selector_1, src1_selector_2, src1_selector_3
+        };
+        return selectors[comp];
+    }
+
+    Selector GetSelectorSrc2(int comp) const {
+        Selector selectors[] = {
+            src2_selector_0, src2_selector_1, src2_selector_2, src2_selector_3
+        };
+        return selectors[comp];
+    }
+
+    // True when component i (0=x ... 3=w) of the destination is written.
+    bool DestComponentEnabled(int i) const {
+        return (dest_mask & (0x8 >> i)) != 0;
+    }
+
+    // Formats the four source selectors as a string such as "xyzw".
+    // src2 == true describes the second source operand, otherwise the first.
+    std::string SelectorToString(bool src2) const {
+        // Selector values are 0..3 (two-bit fields), so they index the name table directly.
+        const char* const names = "xyzw";
+        std::string ret;
+        for (int i = 0; i < 4; ++i) {
+            const Selector selector = src2 ? GetSelectorSrc2(i) : GetSelectorSrc1(i);
+            ret += names[static_cast<u32>(selector)];
+        }
+        return ret;
+    }
+
+    std::string DestMaskToString() const {
+        std::string ret;
+        for (int i = 0; i < 4; ++i) {
+            if (!DestComponentEnabled(i))
+                ret += "_";
+            else
+                ret += "xyzw"[i];
+        }
+        return ret;
+    }
+
+    // Components of "dest" that should be written to: LSB=dest.w, MSB=dest.x
+    BitField< 0, 4, u32> dest_mask;
+
+    BitField< 5, 2, Selector> src1_selector_3;
+    BitField< 7, 2, Selector> src1_selector_2;
+    BitField< 9, 2, Selector> src1_selector_1;
+    BitField<11, 2, Selector> src1_selector_0;
+
+    BitField<14, 2, Selector> src2_selector_3;
+    BitField<16, 2, Selector> src2_selector_2;
+    BitField<18, 2, Selector> src2_selector_1;
+    BitField<20, 2, Selector> src2_selector_0;
+
+    BitField<31, 1, u32> flag; // not sure what this means, maybe it's the sign?
+};
+
+void SubmitShaderMemoryChange(u32 addr, u32 value); // stores `value` at word index `addr` of the shader program
+void SubmitSwizzleDataChange(u32 addr, u32 value); // stores `value` at word index `addr` of the swizzle data
+// Runs the loaded shader program on `input`, binding its first num_attributes attributes.
+OutputVertex RunShader(const InputVertex& input, int num_attributes);
+// Returns a mutable reference to float uniform `index`.
+Math::Vec4<float24>& GetFloatUniform(u32 index);
+
+} // namespace
+
+} // namespace
+
diff --git a/src/video_core/video_core.vcxproj b/src/video_core/video_core.vcxproj
index d77be2bef..48d77cdc4 100644
--- a/src/video_core/video_core.vcxproj
+++ b/src/video_core/video_core.vcxproj
@@ -20,14 +20,25 @@
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="renderer_opengl\renderer_opengl.cpp" />
+    <ClCompile Include="clipper.cpp" />
+    <ClCompile Include="command_processor.cpp" />
+    <ClCompile Include="primitive_assembly.cpp" />
+    <ClCompile Include="rasterizer.cpp" />
     <ClCompile Include="utils.cpp" />
+    <ClCompile Include="vertex_shader.cpp" />
     <ClCompile Include="video_core.cpp" />
   </ItemGroup>
   <ItemGroup>
+    <ClInclude Include="clipper.h" />
+    <ClInclude Include="command_processor.h" />
     <ClInclude Include="gpu_debugger.h" />
+    <ClInclude Include="math.h" />
     <ClInclude Include="pica.h" />
+    <ClInclude Include="primitive_assembly.h" />
+    <ClInclude Include="rasterizer.h" />
     <ClInclude Include="renderer_base.h" />
     <ClInclude Include="utils.h" />
+    <ClInclude Include="vertex_shader.h" />
     <ClInclude Include="video_core.h" />
     <ClInclude Include="renderer_opengl\renderer_opengl.h" />
   </ItemGroup>
diff --git a/src/video_core/video_core.vcxproj.filters b/src/video_core/video_core.vcxproj.filters
index b89ac1ac4..31af4f1df 100644
--- a/src/video_core/video_core.vcxproj.filters
+++ b/src/video_core/video_core.vcxproj.filters
@@ -9,17 +9,28 @@
     <ClCompile Include="renderer_opengl\renderer_opengl.cpp">
       <Filter>renderer_opengl</Filter>
     </ClCompile>
+    <ClCompile Include="clipper.cpp" />
+    <ClCompile Include="command_processor.cpp" />
+    <ClCompile Include="primitive_assembly.cpp" />
+    <ClCompile Include="rasterizer.cpp" />
     <ClCompile Include="utils.cpp" />
+    <ClCompile Include="vertex_shader.cpp" />
     <ClCompile Include="video_core.cpp" />
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="renderer_opengl\renderer_opengl.h">
       <Filter>renderer_opengl</Filter>
     </ClInclude>
+    <ClInclude Include="clipper.h" />
+    <ClInclude Include="command_processor.h" />
     <ClInclude Include="gpu_debugger.h" />
+    <ClInclude Include="math.h" />
     <ClInclude Include="pica.h" />
+    <ClInclude Include="primitive_assembly.h" />
+    <ClInclude Include="rasterizer.h" />
     <ClInclude Include="renderer_base.h" />
     <ClInclude Include="utils.h" />
+    <ClInclude Include="vertex_shader.h" />
     <ClInclude Include="video_core.h" />
   </ItemGroup>
   <ItemGroup>