25 files changed, 914 insertions, 663 deletions
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index 31ea3adad..dc485e811 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -29,10 +29,10 @@ enum class BufferMethods {
 };
 
 void GPU::WriteReg(u32 method, u32 subchannel, u32 value, u32 remaining_params) {
-    LOG_WARNING(HW_GPU,
-                "Processing method {:08X} on subchannel {} value "
-                "{:08X} remaining params {}",
-                method, subchannel, value, remaining_params);
+    LOG_TRACE(HW_GPU,
+              "Processing method {:08X} on subchannel {} value "
+              "{:08X} remaining params {}",
+              method, subchannel, value, remaining_params);
 
     if (method == static_cast<u32>(BufferMethods::BindObject)) {
         // Bind the current subchannel to the desired engine id.
diff --git a/src/video_core/command_processor.h b/src/video_core/command_processor.h
index f7214ffec..a01153e0b 100644
--- a/src/video_core/command_processor.h
+++ b/src/video_core/command_processor.h
@@ -30,8 +30,7 @@ union CommandHeader {
 
     BitField<29, 3, SubmissionMode> mode;
 };
-static_assert(std::is_standard_layout<CommandHeader>::value == true,
-              "CommandHeader does not use standard layout");
+static_assert(std::is_standard_layout_v<CommandHeader>, "CommandHeader is not standard layout");
 static_assert(sizeof(CommandHeader) == sizeof(u32), "CommandHeader has incorrect size!");
 
 } // namespace Tegra
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index a235b543e..a46ed4bd7 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -23,12 +23,17 @@ Maxwell3D::Maxwell3D(VideoCore::RasterizerInterface& rasterizer, MemoryManager&
     : memory_manager(memory_manager), rasterizer{rasterizer}, macro_interpreter(*this) {}
 
 void Maxwell3D::CallMacroMethod(u32 method, std::vector<u32> parameters) {
-    auto macro_code = uploaded_macros.find(method);
+    // Reset the current macro first, so this also happens when the lookup below fails.
+    executing_macro = 0;
+
     // The requested macro must have been uploaded already.
-    ASSERT_MSG(macro_code != uploaded_macros.end(), "Macro %08X was not uploaded", method);
+    auto macro_code = uploaded_macros.find(method);
+    if (macro_code == uploaded_macros.end()) {
+        LOG_ERROR(HW_GPU, "Macro {:04X} was not uploaded", method);
+        return;
+    }
 
-    // Reset the current macro and execute it.
-    executing_macro = 0;
+    // Execute the uploaded code, moving the parameter list into the interpreter.
     macro_interpreter.Execute(macro_code->second, std::move(parameters));
 }
 
@@ -238,6 +243,8 @@ void Maxwell3D::ProcessCBBind(Regs::ShaderStage stage) {
 
     auto& buffer = shader.const_buffers[bind_data.index];
 
+    ASSERT(bind_data.index < Regs::MaxConstBuffers);
+
     buffer.enabled = bind_data.valid.Value() != 0;
     buffer.index = bind_data.index;
     buffer.address = regs.const_buffer.BufferAddress();
@@ -285,8 +292,6 @@ Texture::TICEntry Maxwell3D::GetTICEntry(u32 tic_index) const {
 
     // TODO(Subv): Different data types for separate components are not supported
     ASSERT(r_type == g_type && r_type == b_type && r_type == a_type);
-    // TODO(Subv): Only UNORM formats are supported for now.
-    ASSERT(r_type == Texture::ComponentType::UNORM);
 
     return tic_entry;
 }
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index 4d0ff96a5..1b30ce018 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -44,7 +44,7 @@ public:
         static constexpr size_t MaxShaderProgram = 6;
         static constexpr size_t MaxShaderStage = 5;
         // Maximum number of const buffers per shader stage.
-        static constexpr size_t MaxConstBuffers = 16;
+        static constexpr size_t MaxConstBuffers = 18;
 
         enum class QueryMode : u32 {
             Write = 0,
@@ -93,6 +93,7 @@ public:
 
         struct VertexAttribute {
             enum class Size : u32 {
+                Invalid = 0x0,
                 Size_32_32_32_32 = 0x01,
                 Size_32_32_32 = 0x02,
                 Size_16_16_16_16 = 0x03,
@@ -257,6 +258,10 @@ public:
             bool IsNormalized() const {
                 return (type == Type::SignedNorm) || (type == Type::UnsignedNorm);
             }
+
+            bool IsValid() const { // False only when the size field is Size::Invalid.
+                return size != Size::Invalid;
+            }
         };
 
         enum class PrimitiveTopology : u32 {
@@ -352,6 +357,27 @@ public:
                 OneMinusConstantColor = 0x62,
                 ConstantAlpha = 0x63,
                 OneMinusConstantAlpha = 0x64,
+
+                // These values are used by Nouveau and some games.
+                ZeroGL = 0x4000,
+                OneGL = 0x4001,
+                SourceColorGL = 0x4300,
+                OneMinusSourceColorGL = 0x4301,
+                SourceAlphaGL = 0x4302,
+                OneMinusSourceAlphaGL = 0x4303,
+                DestAlphaGL = 0x4304,
+                OneMinusDestAlphaGL = 0x4305,
+                DestColorGL = 0x4306,
+                OneMinusDestColorGL = 0x4307,
+                SourceAlphaSaturateGL = 0x4308,
+                ConstantColorGL = 0xc001,
+                OneMinusConstantColorGL = 0xc002,
+                ConstantAlphaGL = 0xc003,
+                OneMinusConstantAlphaGL = 0xc004,
+                Source1ColorGL = 0xc900,
+                OneMinusSource1ColorGL = 0xc901,
+                Source1AlphaGL = 0xc902,
+                OneMinusSource1AlphaGL = 0xc903,
             };
 
             u32 separate_alpha;
diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h
index c7e3fb4b1..2526ebf28 100644
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -74,6 +74,7 @@ union Attribute {
     enum class Index : u64 {
         Position = 7,
         Attribute_0 = 8,
+        Attribute_31 = 39,
         // This attribute contains a tuple of (~, ~, InstanceId, VertexId) when inside a vertex
         // shader, and a tuple of (TessCoord.x, TessCoord.y, TessCoord.z, ~) when inside a Tess Eval
         // shader.
@@ -199,6 +200,14 @@ enum class IMinMaxExchange : u64 {
     XHi = 3,
 };
 
+enum class XmadMode : u64 { // Value type of the 3-bit Instruction::xmad.mode field.
+    None = 0,
+    CLo = 1,
+    CHi = 2,
+    CSfu = 3,
+    CBcc = 4,
+};
+
 enum class FlowCondition : u64 {
     Always = 0xF,
     Fcsm_Tr = 0x1C, // TODO(bunnei): What is this used for?
@@ -254,20 +263,15 @@ union Instruction {
             BitField<56, 1, u64> invert_b;
         } lop32i;
 
-        float GetImm20_19() const {
-            float result{};
+        u32 GetImm20_19() const { // Raw bits: imm20_19 << 12, bit 31 set when negate_imm is set.
             u32 imm{static_cast<u32>(imm20_19)};
             imm <<= 12;
             imm |= negate_imm ? 0x80000000 : 0;
-            std::memcpy(&result, &imm, sizeof(imm));
-            return result;
+            return imm;
         }
 
-        float GetImm20_32() const {
-            float result{};
-            s32 imm{static_cast<s32>(imm20_32)};
-            std::memcpy(&result, &imm, sizeof(imm));
-            return result;
+        u32 GetImm20_32() const { // The imm20_32 field, converted to u32.
+            return static_cast<u32>(imm20_32);
         }
 
         s32 GetSignedImm20_20() const {
@@ -461,6 +465,18 @@ union Instruction {
     } bra;
 
     union {
+        BitField<20, 16, u64> imm20_16;
+        BitField<36, 1, u64> product_shift_left;
+        BitField<37, 1, u64> merge_37;
+        BitField<48, 1, u64> sign_a;
+        BitField<49, 1, u64> sign_b;
+        BitField<50, 3, XmadMode> mode; // NOTE(review): overlaps high_b (bit 52); confirm.
+        BitField<52, 1, u64> high_b;
+        BitField<53, 1, u64> high_a;
+        BitField<56, 1, u64> merge_56;
+    } xmad; // Bit layout presumably used by the OpCode::Type::Xmad instructions; verify.
+
+    union {
         BitField<20, 14, u64> offset;
         BitField<34, 5, u64> index;
     } cbuf34;
@@ -480,8 +496,7 @@ union Instruction {
     u64 value;
 };
 static_assert(sizeof(Instruction) == 0x8, "Incorrect structure size");
-static_assert(std::is_standard_layout<Instruction>::value,
-              "Structure does not have standard layout");
+static_assert(std::is_standard_layout_v<Instruction>, "Instruction is not standard layout");
 
 class OpCode {
 public:
@@ -598,9 +613,17 @@ public:
         IntegerSetPredicate,
         PredicateSetPredicate,
         Conversion,
+        Xmad,
         Unknown,
     };
 
+    /// Returns whether an opcode has an execution predicate field or not (i.e., whether it can
+    /// be conditionally executed). Currently only Id::SSY is reported as unpredicated.
+    static bool IsPredicatedInstruction(Id opcode) {
+        // TODO(Subv): Add the rest of unpredicated instructions.
+        return opcode != Id::SSY;
+    }
+
     class Matcher {
     public:
         Matcher(const char* const name, u16 mask, u16 expected, OpCode::Id id, OpCode::Type type)
@@ -780,10 +803,10 @@ private:
             INST("010010110101----", Id::ISET_C, Type::IntegerSet, "ISET_C"),
             INST("0011011-0101----", Id::ISET_IMM, Type::IntegerSet, "ISET_IMM"),
             INST("0101000010010---", Id::PSETP, Type::PredicateSetPredicate, "PSETP"),
-            INST("0011011-00------", Id::XMAD_IMM, Type::Arithmetic, "XMAD_IMM"),
-            INST("0100111---------", Id::XMAD_CR, Type::Arithmetic, "XMAD_CR"),
-            INST("010100010-------", Id::XMAD_RC, Type::Arithmetic, "XMAD_RC"),
-            INST("0101101100------", Id::XMAD_RR, Type::Arithmetic, "XMAD_RR"),
+            INST("0011011-00------", Id::XMAD_IMM, Type::Xmad, "XMAD_IMM"),
+            INST("0100111---------", Id::XMAD_CR, Type::Xmad, "XMAD_CR"),
+            INST("010100010-------", Id::XMAD_RC, Type::Xmad, "XMAD_RC"),
+            INST("0101101100------", Id::XMAD_RR, Type::Xmad, "XMAD_RR"),
         };
 #undef INST
         std::stable_sort(table.begin(), table.end(), [](const auto& a, const auto& b) {
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index b2a83ce0b..5a593c1f7 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -2,6 +2,7 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include "common/assert.h"
 #include "video_core/engines/fermi_2d.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/engines/maxwell_compute.h"
@@ -11,6 +12,15 @@
 
 namespace Tegra {
 
+u32 FramebufferConfig::BytesPerPixel(PixelFormat format) { // Size of one pixel, in bytes.
+    switch (format) {
+    case PixelFormat::ABGR8:
+        return 4;
+    }
+
+    UNREACHABLE(); // Reached only for a PixelFormat value the switch above does not list.
+}
+
 GPU::GPU(VideoCore::RasterizerInterface& rasterizer) {
     memory_manager = std::make_unique<MemoryManager>();
     maxwell_3d = std::make_unique<Engines::Maxwell3D>(rasterizer, *memory_manager);
@@ -34,18 +44,60 @@ u32 RenderTargetBytesPerPixel(RenderTargetFormat format) {
 
     switch (format) {
     case RenderTargetFormat::RGBA32_FLOAT:
+    case RenderTargetFormat::RGBA32_UINT:
         return 16;
+    case RenderTargetFormat::RGBA16_UINT:
+    case RenderTargetFormat::RGBA16_UNORM:
     case RenderTargetFormat::RGBA16_FLOAT:
     case RenderTargetFormat::RG32_FLOAT:
+    case RenderTargetFormat::RG32_UINT:
         return 8;
     case RenderTargetFormat::RGBA8_UNORM:
+    case RenderTargetFormat::RGBA8_SNORM:
+    case RenderTargetFormat::RGBA8_SRGB:
     case RenderTargetFormat::RGB10_A2_UNORM:
     case RenderTargetFormat::BGRA8_UNORM:
+    case RenderTargetFormat::RG16_UNORM:
+    case RenderTargetFormat::RG16_SNORM:
+    case RenderTargetFormat::RG16_UINT:
+    case RenderTargetFormat::RG16_SINT:
+    case RenderTargetFormat::RG16_FLOAT:
     case RenderTargetFormat::R32_FLOAT:
+    case RenderTargetFormat::R11G11B10_FLOAT:
+    case RenderTargetFormat::R32_UINT:
         return 4;
+    case RenderTargetFormat::R16_UNORM:
+    case RenderTargetFormat::R16_SNORM:
+    case RenderTargetFormat::R16_UINT:
+    case RenderTargetFormat::R16_SINT:
+    case RenderTargetFormat::R16_FLOAT:
+    case RenderTargetFormat::RG8_UNORM:
+    case RenderTargetFormat::RG8_SNORM:
+    case RenderTargetFormat::B5G6R5_UNORM: // 5 + 6 + 5 bits; was missing from this switch.
+        return 2;
+    case RenderTargetFormat::R8_UNORM:
+    case RenderTargetFormat::R8_UINT:
+        return 1;
     default:
         UNIMPLEMENTED_MSG("Unimplemented render target format {}", static_cast<u32>(format));
     }
 }
 
+u32 DepthFormatBytesPerPixel(DepthFormat format) { // Bytes per pixel of each depth format.
+    switch (format) {
+    case DepthFormat::Z32_S8_X24_FLOAT:
+        return 8;
+    case DepthFormat::Z32_FLOAT:
+    case DepthFormat::S8_Z24_UNORM:
+    case DepthFormat::Z24_X8_UNORM:
+    case DepthFormat::Z24_S8_UNORM:
+    case DepthFormat::Z24_C8_UNORM:
+        return 4;
+    case DepthFormat::Z16_UNORM:
+        return 2;
+    default: // NOTE(review): no return on this path; assumes UNIMPLEMENTED_MSG never returns.
+        UNIMPLEMENTED_MSG("Unimplemented Depth format {}", static_cast<u32>(format));
+    }
+}
+
 } // namespace Tegra
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index 440505c9d..97dcccb92 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -6,7 +6,6 @@
 
 #include <memory>
 #include <unordered_map>
-#include <vector>
 #include "common/common_types.h"
 #include "core/hle/service/nvflinger/buffer_queue.h"
 #include "video_core/memory_manager.h"
@@ -21,21 +20,34 @@ enum class RenderTargetFormat : u32 {
     NONE = 0x0,
     RGBA32_FLOAT = 0xC0,
     RGBA32_UINT = 0xC2,
+    RGBA16_UNORM = 0xC6,
+    RGBA16_UINT = 0xC9,
     RGBA16_FLOAT = 0xCA,
     RG32_FLOAT = 0xCB,
+    RG32_UINT = 0xCD,
     BGRA8_UNORM = 0xCF,
     RGB10_A2_UNORM = 0xD1,
     RGBA8_UNORM = 0xD5,
     RGBA8_SRGB = 0xD6,
+    RGBA8_SNORM = 0xD7,
     RG16_UNORM = 0xDA,
     RG16_SNORM = 0xDB,
     RG16_SINT = 0xDC,
     RG16_UINT = 0xDD,
     RG16_FLOAT = 0xDE,
     R11G11B10_FLOAT = 0xE0,
+    R32_UINT = 0xE4,
     R32_FLOAT = 0xE5,
+    B5G6R5_UNORM = 0xE8,
+    RG8_UNORM = 0xEA,
+    RG8_SNORM = 0xEB,
+    R16_UNORM = 0xEE,
+    R16_SNORM = 0xEF,
+    R16_SINT = 0xF0,
+    R16_UINT = 0xF1,
     R16_FLOAT = 0xF2,
     R8_UNORM = 0xF3,
+    R8_UINT = 0xF6,
 };
 
 enum class DepthFormat : u32 {
@@ -51,6 +63,9 @@ enum class DepthFormat : u32 {
 /// Returns the number of bytes per pixel of each rendertarget format.
 u32 RenderTargetBytesPerPixel(RenderTargetFormat format);
 
+/// Returns the number of bytes per pixel of each depth format.
+u32 DepthFormatBytesPerPixel(DepthFormat format);
+
 class DebugContext;
 
 /**
@@ -64,14 +79,7 @@ struct FramebufferConfig {
     /**
      * Returns the number of bytes per pixel.
      */
-    static u32 BytesPerPixel(PixelFormat format) {
-        switch (format) {
-        case PixelFormat::ABGR8:
-            return 4;
-        }
-
-        UNREACHABLE();
-    }
+    static u32 BytesPerPixel(PixelFormat format);
 
     VAddr address;
     u32 offset;
diff --git a/src/video_core/renderer_base.cpp b/src/video_core/renderer_base.cpp
index 3ca350243..afd86a83a 100644
--- a/src/video_core/renderer_base.cpp
+++ b/src/video_core/renderer_base.cpp
@@ -4,18 +4,23 @@
 
 #include <memory>
 #include "core/frontend/emu_window.h"
+#include "core/settings.h"
 #include "video_core/renderer_base.h"
 #include "video_core/renderer_opengl/gl_rasterizer.h"
 
 namespace VideoCore {
 
-RendererBase::RendererBase(EmuWindow& window) : render_window{window} {}
+RendererBase::RendererBase(Core::Frontend::EmuWindow& window) : render_window{window} {
+    RefreshBaseSettings(); // Apply the common settings once, at construction time.
+}
+
 RendererBase::~RendererBase() = default;
 
-void RendererBase::UpdateCurrentFramebufferLayout() {
-    const Layout::FramebufferLayout& layout = render_window.GetFramebufferLayout();
+void RendererBase::RefreshBaseSettings() { // Refreshes rasterizer, layout and frame limiter.
+    RefreshRasterizerSetting();
+    UpdateCurrentFramebufferLayout();
 
-    render_window.UpdateCurrentFramebufferLayout(layout.width, layout.height);
+    renderer_settings.use_framelimiter = Settings::values.toggle_framelimit;
 }
 
 void RendererBase::RefreshRasterizerSetting() {
@@ -24,4 +29,10 @@ void RendererBase::RefreshRasterizerSetting() {
     }
 }
 
+void RendererBase::UpdateCurrentFramebufferLayout() {
+    // Feed the window's current layout dimensions back into the window.
+    const auto& fb_layout = render_window.GetFramebufferLayout();
+    render_window.UpdateCurrentFramebufferLayout(fb_layout.width, fb_layout.height);
+}
+
 } // namespace VideoCore
diff --git a/src/video_core/renderer_base.h b/src/video_core/renderer_base.h
index 235de23a1..d9f16b8e6 100644
--- a/src/video_core/renderer_base.h
+++ b/src/video_core/renderer_base.h
@@ -4,23 +4,26 @@
 
 #pragma once
 
+#include <atomic>
 #include <memory>
 #include <boost/optional.hpp>
-#include "common/assert.h"
 #include "common/common_types.h"
 #include "video_core/gpu.h"
 #include "video_core/rasterizer_interface.h"
 
+namespace Core::Frontend {
 class EmuWindow;
+}
 
 namespace VideoCore {
 
+struct RendererSettings { // Settings held by RendererBase as renderer_settings.
+    std::atomic_bool use_framelimiter{false}; // Set from Settings::values.toggle_framelimit.
+};
+
 class RendererBase : NonCopyable {
 public:
-    /// Used to reference a framebuffer
-    enum kFramebuffer { kFramebuffer_VirtualXFB = 0, kFramebuffer_EFB, kFramebuffer_Texture };
-
-    explicit RendererBase(EmuWindow& window);
+    explicit RendererBase(Core::Frontend::EmuWindow& window);
     virtual ~RendererBase();
 
     /// Swap buffers (render frame)
@@ -32,9 +35,6 @@ public:
     /// Shutdown the renderer
     virtual void ShutDown() = 0;
 
-    /// Updates the framebuffer layout of the contained render window handle.
-    void UpdateCurrentFramebufferLayout();
-
     // Getter/setter functions:
     // ------------------------
 
@@ -54,13 +54,23 @@ public:
         return *rasterizer;
     }
 
-    void RefreshRasterizerSetting();
+    /// Refreshes the settings common to all renderers
+    void RefreshBaseSettings();
 
 protected:
-    EmuWindow& render_window; ///< Reference to the render window handle.
+    /// Refreshes settings specific to the rasterizer.
+    void RefreshRasterizerSetting();
+
+    Core::Frontend::EmuWindow& render_window; ///< Reference to the render window handle.
     std::unique_ptr<RasterizerInterface> rasterizer;
     f32 m_current_fps = 0.0f; ///< Current framerate, should be set by the renderer
     int m_current_frame = 0;  ///< Current frame, should be set by the renderer
+
+    RendererSettings renderer_settings;
+
+private:
+    /// Updates the framebuffer layout of the contained render window handle.
+    void UpdateCurrentFramebufferLayout();
 };
 
 } // namespace VideoCore
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index c2a931469..52a649e2f 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -36,30 +36,21 @@ MICROPROFILE_DEFINE(OpenGL_Drawing, "OpenGL", "Drawing", MP_RGB(128, 128, 192));
 MICROPROFILE_DEFINE(OpenGL_Blits, "OpenGL", "Blits", MP_RGB(100, 100, 255));
 MICROPROFILE_DEFINE(OpenGL_CacheManagement, "OpenGL", "Cache Mgmt", MP_RGB(100, 255, 100));
 
-RasterizerOpenGL::RasterizerOpenGL(EmuWindow& window) : emu_window{window} {
+RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& window)
+    : emu_window{window}, stream_buffer(GL_ARRAY_BUFFER, STREAM_BUFFER_SIZE) {
     // Create sampler objects
     for (size_t i = 0; i < texture_samplers.size(); ++i) {
         texture_samplers[i].Create();
         state.texture_units[i].sampler = texture_samplers[i].sampler.handle;
     }
 
-    // Create SSBOs
-    for (size_t stage = 0; stage < ssbos.size(); ++stage) {
-        for (size_t buffer = 0; buffer < ssbos[stage].size(); ++buffer) {
-            ssbos[stage][buffer].Create();
-            state.draw.const_buffers[stage][buffer].ssbo = ssbos[stage][buffer].handle;
-        }
-    }
-
     GLint ext_num;
     glGetIntegerv(GL_NUM_EXTENSIONS, &ext_num);
     for (GLint i = 0; i < ext_num; i++) {
         const std::string_view extension{
             reinterpret_cast<const char*>(glGetStringi(GL_EXTENSIONS, i))};
 
-        if (extension == "GL_ARB_buffer_storage") {
-            has_ARB_buffer_storage = true;
-        } else if (extension == "GL_ARB_direct_state_access") {
+        if (extension == "GL_ARB_direct_state_access") {
             has_ARB_direct_state_access = true;
         } else if (extension == "GL_ARB_separate_shader_objects") {
             has_ARB_separate_shader_objects = true;
@@ -86,47 +77,31 @@ RasterizerOpenGL::RasterizerOpenGL(EmuWindow& window) : emu_window{window} {
 
     hw_vao.Create();
 
-    stream_buffer = OGLStreamBuffer::MakeBuffer(has_ARB_buffer_storage, GL_ARRAY_BUFFER);
-    stream_buffer->Create(STREAM_BUFFER_SIZE, STREAM_BUFFER_SIZE / 2);
-    state.draw.vertex_buffer = stream_buffer->GetHandle();
+    state.draw.vertex_buffer = stream_buffer.GetHandle();
 
     shader_program_manager = std::make_unique<GLShader::ProgramManager>();
     state.draw.shader_program = 0;
     state.draw.vertex_array = hw_vao.handle;
     state.Apply();
 
-    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, stream_buffer->GetHandle());
-
-    for (unsigned index = 0; index < uniform_buffers.size(); ++index) {
-        auto& buffer = uniform_buffers[index];
-        buffer.Create();
-        glBindBuffer(GL_UNIFORM_BUFFER, buffer.handle);
-        glBufferData(GL_UNIFORM_BUFFER, sizeof(GLShader::MaxwellUniformData), nullptr,
-                     GL_STREAM_COPY);
-        glBindBufferBase(GL_UNIFORM_BUFFER, index, buffer.handle);
-    }
+    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, stream_buffer.GetHandle());
 
     glEnable(GL_BLEND);
 
+    glGetIntegerv(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT, &uniform_buffer_alignment);
+
     LOG_CRITICAL(Render_OpenGL, "Sync fixed function OpenGL state here!");
 }
 
-RasterizerOpenGL::~RasterizerOpenGL() {
-    if (stream_buffer != nullptr) {
-        state.draw.vertex_buffer = stream_buffer->GetHandle();
-        state.Apply();
-        stream_buffer->Release();
-    }
-}
+RasterizerOpenGL::~RasterizerOpenGL() = default;
 
 std::pair<u8*, GLintptr> RasterizerOpenGL::SetupVertexArrays(u8* array_ptr,
                                                              GLintptr buffer_offset) {
     MICROPROFILE_SCOPE(OpenGL_VAO);
     const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs;
-    const auto& memory_manager = Core::System::GetInstance().GPU().memory_manager;
 
     state.draw.vertex_array = hw_vao.handle;
-    state.draw.vertex_buffer = stream_buffer->GetHandle();
+    state.draw.vertex_buffer = stream_buffer.GetHandle();
     state.Apply();
 
     // Upload all guest vertex arrays sequentially to our buffer
@@ -141,16 +116,15 @@ std::pair<u8*, GLintptr> RasterizerOpenGL::SetupVertexArrays(u8* array_ptr,
         ASSERT(end > start);
         u64 size = end - start + 1;
 
-        // Copy vertex array data
-        Memory::ReadBlock(*memory_manager->GpuToCpuAddress(start), array_ptr, size);
+        GLintptr vertex_buffer_offset;
+        std::tie(array_ptr, buffer_offset, vertex_buffer_offset) =
+            UploadMemory(array_ptr, buffer_offset, start, size);
 
         // Bind the vertex array to the buffer at the current offset.
-        glBindVertexBuffer(index, stream_buffer->GetHandle(), buffer_offset, vertex_array.stride);
+        glBindVertexBuffer(index, stream_buffer.GetHandle(), vertex_buffer_offset,
+                           vertex_array.stride);
 
         ASSERT_MSG(vertex_array.divisor == 0, "Vertex buffer divisor unimplemented");
-
-        array_ptr += size;
-        buffer_offset += size;
     }
 
     // Use the vertex array as-is, assumes that the data is formatted correctly for OpenGL.
@@ -161,11 +135,16 @@ std::pair<u8*, GLintptr> RasterizerOpenGL::SetupVertexArrays(u8* array_ptr,
     // assume every shader uses them all.
     for (unsigned index = 0; index < 16; ++index) {
         auto& attrib = regs.vertex_attrib_format[index];
-        LOG_DEBUG(HW_GPU, "vertex attrib {}, count={}, size={}, type={}, offset={}, normalize={}",
+
+        // Ignore invalid attributes.
+        if (!attrib.IsValid())
+            continue;
+
+        auto& buffer = regs.vertex_array[attrib.buffer];
+        LOG_TRACE(HW_GPU, "vertex attrib {}, count={}, size={}, type={}, offset={}, normalize={}",
                   index, attrib.ComponentCount(), attrib.SizeString(), attrib.TypeString(),
                   attrib.offset.Value(), attrib.IsNormalized());
 
-        auto& buffer = regs.vertex_array[attrib.buffer];
         ASSERT(buffer.IsEnabled());
 
         glEnableVertexAttribArray(index);
@@ -196,22 +175,12 @@ static GLShader::ProgramCode GetShaderProgramCode(Maxwell::ShaderProgram program
     return program_code;
 }
 
-void RasterizerOpenGL::SetupShaders(u8* buffer_ptr, GLintptr buffer_offset) {
-    // Helper function for uploading uniform data
-    const auto copy_buffer = [&](GLuint handle, GLintptr offset, GLsizeiptr size) {
-        if (has_ARB_direct_state_access) {
-            glCopyNamedBufferSubData(stream_buffer->GetHandle(), handle, offset, 0, size);
-        } else {
-            glBindBuffer(GL_COPY_WRITE_BUFFER, handle);
-            glCopyBufferSubData(GL_ARRAY_BUFFER, GL_COPY_WRITE_BUFFER, offset, 0, size);
-        }
-    };
-
+std::pair<u8*, GLintptr> RasterizerOpenGL::SetupShaders(u8* buffer_ptr, GLintptr buffer_offset) {
     auto& gpu = Core::System::GetInstance().GPU().Maxwell3D();
 
     // Next available bindpoints to use when uploading the const buffers and textures to the GLSL
     // shaders. The constbuffer bindpoint starts after the shader stage configuration bind points.
-    u32 current_constbuffer_bindpoint = uniform_buffers.size();
+    u32 current_constbuffer_bindpoint = Tegra::Engines::Maxwell3D::Regs::MaxShaderStage;
     u32 current_texture_bindpoint = 0;
 
     for (size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) {
@@ -223,22 +192,21 @@ void RasterizerOpenGL::SetupShaders(u8* buffer_ptr, GLintptr buffer_offset) {
             continue;
         }
 
+        std::tie(buffer_ptr, buffer_offset) =
+            AlignBuffer(buffer_ptr, buffer_offset, static_cast<size_t>(uniform_buffer_alignment));
+
         const size_t stage{index == 0 ? 0 : index - 1}; // Stage indices are 0 - 5
 
         GLShader::MaxwellUniformData ubo{};
         ubo.SetFromRegs(gpu.state.shader_stages[stage]);
         std::memcpy(buffer_ptr, &ubo, sizeof(ubo));
 
-        // Flush the buffer so that the GPU can see the data we just wrote.
-        glFlushMappedBufferRange(GL_ARRAY_BUFFER, buffer_offset, sizeof(ubo));
+        // Bind the buffer
+        glBindBufferRange(GL_UNIFORM_BUFFER, stage, stream_buffer.GetHandle(), buffer_offset,
+                          sizeof(ubo));
 
-        // Upload uniform data as one UBO per stage
-        const GLintptr ubo_offset = buffer_offset;
-        copy_buffer(uniform_buffers[stage].handle, ubo_offset,
-                    sizeof(GLShader::MaxwellUniformData));
-
-        buffer_ptr += sizeof(GLShader::MaxwellUniformData);
-        buffer_offset += sizeof(GLShader::MaxwellUniformData);
+        buffer_ptr += sizeof(ubo);
+        buffer_offset += sizeof(ubo);
 
         GLShader::ShaderSetup setup{GetShaderProgramCode(program)};
         GLShader::ShaderEntries shader_resources;
@@ -277,9 +245,9 @@ void RasterizerOpenGL::SetupShaders(u8* buffer_ptr, GLintptr buffer_offset) {
             static_cast<Maxwell::ShaderStage>(stage));
 
         // Configure the const buffers for this shader stage.
-        current_constbuffer_bindpoint =
-            SetupConstBuffers(static_cast<Maxwell::ShaderStage>(stage), gl_stage_program,
-                              current_constbuffer_bindpoint, shader_resources.const_buffer_entries);
+        std::tie(buffer_ptr, buffer_offset, current_constbuffer_bindpoint) = SetupConstBuffers(
+            buffer_ptr, buffer_offset, static_cast<Maxwell::ShaderStage>(stage), gl_stage_program,
+            current_constbuffer_bindpoint, shader_resources.const_buffer_entries);
 
         // Configure the textures for this shader stage.
         current_texture_bindpoint =
@@ -294,6 +262,8 @@ void RasterizerOpenGL::SetupShaders(u8* buffer_ptr, GLintptr buffer_offset) {
     }
 
     shader_program_manager->UseTrivialGeometryShader();
+
+    return {buffer_ptr, buffer_offset};
 }
 
 size_t RasterizerOpenGL::CalculateVertexArraysSize() const {
@@ -324,11 +294,14 @@ std::pair<Surface, Surface> RasterizerOpenGL::ConfigureFramebuffers(bool using_c
                                                                     bool using_depth_fb) {
     const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs;
 
+    if (regs.rt[0].format == Tegra::RenderTargetFormat::NONE) {
+        LOG_ERROR(HW_GPU, "RenderTargetFormat is not configured");
+        using_color_fb = false;
+    }
+
     // TODO(bunnei): Implement this
     const bool has_stencil = false;
 
-    const MathUtil::Rectangle<s32> viewport_rect{regs.viewport_transform[0].GetRect()};
-
     const bool write_color_fb =
         state.color_mask.red_enabled == GL_TRUE || state.color_mask.green_enabled == GL_TRUE ||
         state.color_mask.blue_enabled == GL_TRUE || state.color_mask.alpha_enabled == GL_TRUE;
@@ -341,9 +314,10 @@ std::pair<Surface, Surface> RasterizerOpenGL::ConfigureFramebuffers(bool using_c
     Surface depth_surface;
     MathUtil::Rectangle<u32> surfaces_rect;
     std::tie(color_surface, depth_surface, surfaces_rect) =
-        res_cache.GetFramebufferSurfaces(using_color_fb, using_depth_fb, viewport_rect);
+        res_cache.GetFramebufferSurfaces(using_color_fb, using_depth_fb);
 
-    MathUtil::Rectangle<u32> draw_rect{
+    const MathUtil::Rectangle<s32> viewport_rect{regs.viewport_transform[0].GetRect()};
+    const MathUtil::Rectangle<u32> draw_rect{
         static_cast<u32>(std::clamp<s32>(static_cast<s32>(surfaces_rect.left) + viewport_rect.left,
                                          surfaces_rect.left, surfaces_rect.right)), // Left
         static_cast<u32>(std::clamp<s32>(static_cast<s32>(surfaces_rect.bottom) + viewport_rect.top,
@@ -423,6 +397,31 @@ void RasterizerOpenGL::Clear() {
     }
 }
 
+std::pair<u8*, GLintptr> RasterizerOpenGL::AlignBuffer(u8* buffer_ptr, GLintptr buffer_offset,
+                                                       size_t alignment) {
+    // Round the offset (not the mapped pointer) up; advance the pointer by the same padding.
+    const auto aligned = Common::AlignUp(static_cast<size_t>(buffer_offset), alignment);
+    const GLintptr padding = static_cast<GLintptr>(aligned) - buffer_offset;
+    return {buffer_ptr + padding, static_cast<GLintptr>(aligned)};
+}
+
+std::tuple<u8*, GLintptr, GLintptr> RasterizerOpenGL::UploadMemory(u8* buffer_ptr,
+                                                                   GLintptr buffer_offset,
+                                                                   Tegra::GPUVAddr gpu_addr,
+                                                                   size_t size, size_t alignment) {
+    std::tie(buffer_ptr, buffer_offset) = AlignBuffer(buffer_ptr, buffer_offset, alignment);
+    const GLintptr uploaded_offset = buffer_offset; // Aligned offset at which the data starts.
+
+    const auto& memory_manager = Core::System::GetInstance().GPU().memory_manager;
+    const boost::optional<VAddr> cpu_addr{memory_manager->GpuToCpuAddress(gpu_addr)};
+    // GpuToCpuAddress returns an optional; make sure it holds a value before dereferencing.
+    ASSERT_MSG(cpu_addr, "Invalid GPU address {:016X}", gpu_addr);
+    Memory::ReadBlock(*cpu_addr, buffer_ptr, size);
+    buffer_ptr += size;
+    buffer_offset += size;
+    return {buffer_ptr, buffer_offset, uploaded_offset};
+}
+
 void RasterizerOpenGL::DrawArrays() {
     if (accelerate_draw == AccelDraw::Disabled)
         return;
@@ -447,7 +446,7 @@ void RasterizerOpenGL::DrawArrays() {
     const u64 index_buffer_size{regs.index_array.count * regs.index_array.FormatSizeInBytes()};
     const unsigned vertex_num{is_indexed ? regs.index_array.count : regs.vertex_buffer.count};
 
-    state.draw.vertex_buffer = stream_buffer->GetHandle();
+    state.draw.vertex_buffer = stream_buffer.GetHandle();
     state.Apply();
 
     size_t buffer_size = CalculateVertexArraysSize();
@@ -457,41 +456,31 @@ void RasterizerOpenGL::DrawArrays() {
     }
 
     // Uniform space for the 5 shader stages
-    buffer_size = Common::AlignUp<size_t>(buffer_size, 4) +
-                  sizeof(GLShader::MaxwellUniformData) * Maxwell::MaxShaderStage;
+    buffer_size =
+        Common::AlignUp<size_t>(buffer_size, 4) +
+        (sizeof(GLShader::MaxwellUniformData) + uniform_buffer_alignment) * Maxwell::MaxShaderStage;
+
+    // Add space for at least 18 constant buffers
+    buffer_size += Maxwell::MaxConstBuffers * (MaxConstbufferSize + uniform_buffer_alignment);
 
     u8* buffer_ptr;
     GLintptr buffer_offset;
-    std::tie(buffer_ptr, buffer_offset) =
-        stream_buffer->Map(static_cast<GLsizeiptr>(buffer_size), 4);
+    std::tie(buffer_ptr, buffer_offset, std::ignore) =
+        stream_buffer.Map(static_cast<GLsizeiptr>(buffer_size), 4);
+    u8* buffer_ptr_base = buffer_ptr;
 
-    u8* offseted_buffer;
-    std::tie(offseted_buffer, buffer_offset) = SetupVertexArrays(buffer_ptr, buffer_offset);
-
-    offseted_buffer =
-        reinterpret_cast<u8*>(Common::AlignUp(reinterpret_cast<size_t>(offseted_buffer), 4));
-    buffer_offset = Common::AlignUp<size_t>(buffer_offset, 4);
+    std::tie(buffer_ptr, buffer_offset) = SetupVertexArrays(buffer_ptr, buffer_offset);
 
     // If indexed mode, copy the index buffer
     GLintptr index_buffer_offset = 0;
     if (is_indexed) {
-        const auto& memory_manager = Core::System::GetInstance().GPU().memory_manager;
-        const boost::optional<VAddr> index_data_addr{
-            memory_manager->GpuToCpuAddress(regs.index_array.StartAddress())};
-        Memory::ReadBlock(*index_data_addr, offseted_buffer, index_buffer_size);
-
-        index_buffer_offset = buffer_offset;
-        offseted_buffer += index_buffer_size;
-        buffer_offset += index_buffer_size;
+        std::tie(buffer_ptr, buffer_offset, index_buffer_offset) = UploadMemory(
+            buffer_ptr, buffer_offset, regs.index_array.StartAddress(), index_buffer_size);
     }
 
-    offseted_buffer =
-        reinterpret_cast<u8*>(Common::AlignUp(reinterpret_cast<size_t>(offseted_buffer), 4));
-    buffer_offset = Common::AlignUp<size_t>(buffer_offset, 4);
-
-    SetupShaders(offseted_buffer, buffer_offset);
+    std::tie(buffer_ptr, buffer_offset) = SetupShaders(buffer_ptr, buffer_offset);
 
-    stream_buffer->Unmap();
+    stream_buffer.Unmap(buffer_ptr - buffer_ptr_base);
 
     shader_program_manager->ApplyTo(state);
     state.Apply();
@@ -638,32 +627,22 @@ void RasterizerOpenGL::SamplerInfo::SyncWithConfig(const Tegra::Texture::TSCEntr
     }
 }
 
-u32 RasterizerOpenGL::SetupConstBuffers(Maxwell::ShaderStage stage, GLuint program,
-                                        u32 current_bindpoint,
-                                        const std::vector<GLShader::ConstBufferEntry>& entries) {
+std::tuple<u8*, GLintptr, u32> RasterizerOpenGL::SetupConstBuffers(
+    u8* buffer_ptr, GLintptr buffer_offset, Maxwell::ShaderStage stage, GLuint program,
+    u32 current_bindpoint, const std::vector<GLShader::ConstBufferEntry>& entries) {
     const auto& gpu = Core::System::GetInstance().GPU();
     const auto& maxwell3d = gpu.Maxwell3D();
 
-    // Reset all buffer draw state for this stage.
-    for (auto& buffer : state.draw.const_buffers[static_cast<size_t>(stage)]) {
-        buffer.bindpoint = 0;
-        buffer.enabled = false;
-    }
-
     // Upload only the enabled buffers from the 16 constbuffers of each shader stage
     const auto& shader_stage = maxwell3d.state.shader_stages[static_cast<size_t>(stage)];
 
     for (u32 bindpoint = 0; bindpoint < entries.size(); ++bindpoint) {
         const auto& used_buffer = entries[bindpoint];
         const auto& buffer = shader_stage.const_buffers[used_buffer.GetIndex()];
-        auto& buffer_draw_state =
-            state.draw.const_buffers[static_cast<size_t>(stage)][used_buffer.GetIndex()];
-
-        ASSERT_MSG(buffer.enabled, "Attempted to upload disabled constbuffer");
-        buffer_draw_state.enabled = true;
-        buffer_draw_state.bindpoint = current_bindpoint + bindpoint;
 
-        boost::optional<VAddr> addr = gpu.memory_manager->GpuToCpuAddress(buffer.address);
+        if (!buffer.enabled) {
+            continue;
+        }
 
         size_t size = 0;
 
@@ -686,25 +665,26 @@ u32 RasterizerOpenGL::SetupConstBuffers(Maxwell::ShaderStage stage, GLuint progr
         size = Common::AlignUp(size, sizeof(GLvec4));
         ASSERT_MSG(size <= MaxConstbufferSize, "Constbuffer too big");
 
-        std::vector<u8> data(size);
-        Memory::ReadBlock(*addr, data.data(), data.size());
+        GLintptr const_buffer_offset;
+        std::tie(buffer_ptr, buffer_offset, const_buffer_offset) =
+            UploadMemory(buffer_ptr, buffer_offset, buffer.address, size,
+                         static_cast<size_t>(uniform_buffer_alignment));
 
-        glBindBuffer(GL_UNIFORM_BUFFER, buffer_draw_state.ssbo);
-        glBufferData(GL_UNIFORM_BUFFER, data.size(), data.data(), GL_DYNAMIC_DRAW);
-        glBindBuffer(GL_UNIFORM_BUFFER, 0);
+        glBindBufferRange(GL_UNIFORM_BUFFER, current_bindpoint + bindpoint,
+                          stream_buffer.GetHandle(), const_buffer_offset, size);
 
         // Now configure the bindpoint of the buffer inside the shader
         const std::string buffer_name = used_buffer.GetName();
         const GLuint index =
             glGetProgramResourceIndex(program, GL_UNIFORM_BLOCK, buffer_name.c_str());
         if (index != GL_INVALID_INDEX) {
-            glUniformBlockBinding(program, index, buffer_draw_state.bindpoint);
+            glUniformBlockBinding(program, index, current_bindpoint + bindpoint);
         }
     }
 
     state.Apply();
 
-    return current_bindpoint + static_cast<u32>(entries.size());
+    return {buffer_ptr, buffer_offset, current_bindpoint + static_cast<u32>(entries.size())};
 }
 
 u32 RasterizerOpenGL::SetupTextures(Maxwell::ShaderStage stage, GLuint program, u32 current_unit,
@@ -804,9 +784,7 @@ void RasterizerOpenGL::SyncClipCoef() {
 void RasterizerOpenGL::SyncCullMode() {
     const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs;
 
-    // TODO(bunnei): Enable the below once more things work - until then, this may hide regressions
-    // state.cull.enabled = regs.cull.enabled != 0;
-    state.cull.enabled = false;
+    state.cull.enabled = regs.cull.enabled != 0;
 
     if (state.cull.enabled) {
         state.cull.front_face = MaxwellToGL::FrontFace(regs.cull.front_face);
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index 6d6d85cc1..74307f626 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -7,6 +7,7 @@
 #include <array>
 #include <cstddef>
 #include <memory>
+#include <tuple>
 #include <utility>
 #include <vector>
 #include <glad/glad.h>
@@ -21,12 +22,15 @@
 #include "video_core/renderer_opengl/gl_state.h"
 #include "video_core/renderer_opengl/gl_stream_buffer.h"
 
-class EmuWindow;
 struct ScreenInfo;
 
+namespace Core::Frontend {
+class EmuWindow;
+}
+
 class RasterizerOpenGL : public VideoCore::RasterizerInterface {
 public:
-    explicit RasterizerOpenGL(EmuWindow& renderer);
+    explicit RasterizerOpenGL(Core::Frontend::EmuWindow& renderer);
     ~RasterizerOpenGL() override;
 
     void DrawArrays() override;
@@ -97,9 +101,10 @@ private:
      * @param entries Vector describing the buffers that are actually used in the guest shader.
      * @returns The next available bindpoint for use in the next shader stage.
      */
-    u32 SetupConstBuffers(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, GLuint program,
-                          u32 current_bindpoint,
-                          const std::vector<GLShader::ConstBufferEntry>& entries);
+    std::tuple<u8*, GLintptr, u32> SetupConstBuffers(
+        u8* buffer_ptr, GLintptr buffer_offset, Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
+        GLuint program, u32 current_bindpoint,
+        const std::vector<GLShader::ConstBufferEntry>& entries);
 
     /*
      * Configures the current textures to use for the draw command.
@@ -136,7 +141,6 @@ private:
     /// Syncs the blend state to match the guest state
     void SyncBlendState();
 
-    bool has_ARB_buffer_storage = false;
     bool has_ARB_direct_state_access = false;
     bool has_ARB_separate_shader_objects = false;
     bool has_ARB_vertex_attrib_binding = false;
@@ -145,29 +149,31 @@ private:
 
     RasterizerCacheOpenGL res_cache;
 
-    EmuWindow& emu_window;
+    Core::Frontend::EmuWindow& emu_window;
 
     std::unique_ptr<GLShader::ProgramManager> shader_program_manager;
     OGLVertexArray sw_vao;
     OGLVertexArray hw_vao;
 
     std::array<SamplerInfo, GLShader::NumTextureSamplers> texture_samplers;
-    std::array<std::array<OGLBuffer, Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers>,
-               Tegra::Engines::Maxwell3D::Regs::MaxShaderStage>
-        ssbos;
 
     static constexpr size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024;
-    std::unique_ptr<OGLStreamBuffer> stream_buffer;
+    OGLStreamBuffer stream_buffer;
     OGLBuffer uniform_buffer;
     OGLFramebuffer framebuffer;
+    GLint uniform_buffer_alignment;
 
     size_t CalculateVertexArraysSize() const;
 
     std::pair<u8*, GLintptr> SetupVertexArrays(u8* array_ptr, GLintptr buffer_offset);
 
-    std::array<OGLBuffer, Tegra::Engines::Maxwell3D::Regs::MaxShaderStage> uniform_buffers;
+    std::pair<u8*, GLintptr> SetupShaders(u8* buffer_ptr, GLintptr buffer_offset);
+
+    std::pair<u8*, GLintptr> AlignBuffer(u8* buffer_ptr, GLintptr buffer_offset, size_t alignment);
 
-    void SetupShaders(u8* buffer_ptr, GLintptr buffer_offset);
+    std::tuple<u8*, GLintptr, GLintptr> UploadMemory(u8* buffer_ptr, GLintptr buffer_offset,
+                                                     Tegra::GPUVAddr gpu_addr, size_t size,
+                                                     size_t alignment = 4);
 
     enum class AccelDraw { Disabled, Arrays, Indexed };
     AccelDraw accelerate_draw = AccelDraw::Disabled;
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
index c8f0c4e28..5d58ebd4f 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
@@ -46,6 +46,8 @@ struct FormatTuple {
     params.height = Common::AlignUp(config.tic.Height(), GetCompressionFactor(params.pixel_format));
     params.unaligned_height = config.tic.Height();
     params.size_in_bytes = params.SizeInBytes();
+    params.cache_width = Common::AlignUp(params.width, 16);
+    params.cache_height = Common::AlignUp(params.height, 16);
     return params;
 }
 
@@ -63,6 +65,8 @@ struct FormatTuple {
     params.height = config.height;
     params.unaligned_height = config.height;
     params.size_in_bytes = params.SizeInBytes();
+    params.cache_width = Common::AlignUp(params.width, 16);
+    params.cache_height = Common::AlignUp(params.height, 16);
     return params;
 }
 
@@ -82,17 +86,23 @@ struct FormatTuple {
     params.height = zeta_height;
     params.unaligned_height = zeta_height;
     params.size_in_bytes = params.SizeInBytes();
+    params.cache_width = Common::AlignUp(params.width, 16);
+    params.cache_height = Common::AlignUp(params.height, 16);
     return params;
 }
 
 static constexpr std::array<FormatTuple, SurfaceParams::MaxPixelFormat> tex_format_tuples = {{
-    {GL_RGBA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, ComponentType::UNorm, false}, // ABGR8
+    {GL_RGBA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, ComponentType::UNorm, false}, // ABGR8U
+    {GL_RGBA8, GL_RGBA, GL_BYTE, ComponentType::SNorm, false},                     // ABGR8S
     {GL_RGB, GL_RGB, GL_UNSIGNED_SHORT_5_6_5_REV, ComponentType::UNorm, false},    // B5G6R5
     {GL_RGB10_A2, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV, ComponentType::UNorm,
      false}, // A2B10G10R10
     {GL_RGB5_A1, GL_RGBA, GL_UNSIGNED_SHORT_1_5_5_5_REV, ComponentType::UNorm, false}, // A1B5G5R5
     {GL_R8, GL_RED, GL_UNSIGNED_BYTE, ComponentType::UNorm, false},                    // R8
+    {GL_R8UI, GL_RED_INTEGER, GL_UNSIGNED_BYTE, ComponentType::UInt, false},           // R8UI
     {GL_RGBA16F, GL_RGBA, GL_HALF_FLOAT, ComponentType::Float, false},                 // RGBA16F
+    {GL_RGBA16, GL_RGBA, GL_UNSIGNED_SHORT, ComponentType::UNorm, false},              // RGBA16U
+    {GL_RGBA16UI, GL_RGBA, GL_UNSIGNED_SHORT, ComponentType::UInt, false},             // RGBA16UI
     {GL_R11F_G11F_B10F, GL_RGB, GL_UNSIGNED_INT_10F_11F_11F_REV, ComponentType::Float,
      false},                                                                     // R11FG11FB10F
     {GL_RGBA32UI, GL_RGBA_INTEGER, GL_UNSIGNED_INT, ComponentType::UInt, false}, // RGBA32UI
@@ -103,7 +113,10 @@ static constexpr std::array<FormatTuple, SurfaceParams::MaxPixelFormat> tex_form
     {GL_COMPRESSED_RGBA_S3TC_DXT5_EXT, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, ComponentType::UNorm,
      true},                                                                                 // DXT45
     {GL_COMPRESSED_RED_RGTC1, GL_RED, GL_UNSIGNED_INT_8_8_8_8, ComponentType::UNorm, true}, // DXN1
-    {GL_COMPRESSED_RGBA_BPTC_UNORM_ARB, GL_RGB, GL_UNSIGNED_INT_8_8_8_8, ComponentType::UNorm,
+    {GL_COMPRESSED_RG_RGTC2, GL_RG, GL_UNSIGNED_INT_8_8_8_8, ComponentType::UNorm,
+     true},                                                                     // DXN2UNORM
+    {GL_COMPRESSED_SIGNED_RG_RGTC2, GL_RG, GL_INT, ComponentType::SNorm, true}, // DXN2SNORM
+    {GL_COMPRESSED_RGBA_BPTC_UNORM_ARB, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8, ComponentType::UNorm,
      true},                                                                    // BC7U
     {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false},        // ASTC_2D_4X4
     {GL_RG8, GL_RG, GL_UNSIGNED_BYTE, ComponentType::UNorm, false},            // G8R8
@@ -113,6 +126,9 @@ static constexpr std::array<FormatTuple, SurfaceParams::MaxPixelFormat> tex_form
     {GL_R32F, GL_RED, GL_FLOAT, ComponentType::Float, false},                  // R32F
     {GL_R16F, GL_RED, GL_HALF_FLOAT, ComponentType::Float, false},             // R16F
     {GL_R16, GL_RED, GL_UNSIGNED_SHORT, ComponentType::UNorm, false},          // R16UNORM
+    {GL_R16_SNORM, GL_RED, GL_SHORT, ComponentType::SNorm, false},             // R16S
+    {GL_R16UI, GL_RED_INTEGER, GL_UNSIGNED_SHORT, ComponentType::UInt, false}, // R16UI
+    {GL_R16I, GL_RED_INTEGER, GL_SHORT, ComponentType::SInt, false},           // R16I
     {GL_RG16, GL_RG, GL_UNSIGNED_SHORT, ComponentType::UNorm, false},          // RG16
     {GL_RG16F, GL_RG, GL_HALF_FLOAT, ComponentType::Float, false},             // RG16F
     {GL_RG16UI, GL_RG_INTEGER, GL_UNSIGNED_SHORT, ComponentType::UInt, false}, // RG16UI
@@ -120,6 +136,10 @@ static constexpr std::array<FormatTuple, SurfaceParams::MaxPixelFormat> tex_form
     {GL_RG16_SNORM, GL_RG, GL_SHORT, ComponentType::SNorm, false},             // RG16S
     {GL_RGB32F, GL_RGB, GL_FLOAT, ComponentType::Float, false},                // RGB32F
     {GL_SRGB8_ALPHA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, ComponentType::UNorm, false}, // SRGBA8
+    {GL_RG8, GL_RG, GL_UNSIGNED_BYTE, ComponentType::UNorm, false},                       // RG8U
+    {GL_RG8, GL_RG, GL_BYTE, ComponentType::SNorm, false},                                // RG8S
+    {GL_RG32UI, GL_RG_INTEGER, GL_UNSIGNED_INT, ComponentType::UInt, false},              // RG32UI
+    {GL_R32UI, GL_RED_INTEGER, GL_UNSIGNED_INT, ComponentType::UInt, false},              // R32UI
 
     // DepthStencil formats
     {GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, ComponentType::UNorm,
@@ -174,69 +194,121 @@ MathUtil::Rectangle<u32> SurfaceParams::GetRect() const {
     return {0, actual_height, width, 0};
 }
 
+/**
+ * Tells whether a pixel format belongs to the BCn family of formats.
+ * @param format Pixel format to classify.
+ * @returns True for DXT1, DXT23, DXT45, DXN1, DXN2UNORM, DXN2SNORM and BC7U; false otherwise.
+ */
+static bool IsFormatBCn(PixelFormat format) {
+    const bool is_dxt = format == PixelFormat::DXT1 || format == PixelFormat::DXT23 ||
+                        format == PixelFormat::DXT45;
+    const bool is_dxn = format == PixelFormat::DXN1 || format == PixelFormat::DXN2SNORM ||
+                        format == PixelFormat::DXN2UNORM;
+    const bool is_bc7 = format == PixelFormat::BC7U;
+
+    return is_dxt || is_dxn || is_bc7;
+}
+
 template <bool morton_to_gl, PixelFormat format>
-void MortonCopy(u32 stride, u32 block_height, u32 height, u8* gl_buffer, Tegra::GPUVAddr addr) {
+void MortonCopy(u32 stride, u32 block_height, u32 height, std::vector<u8>& gl_buffer,
+                Tegra::GPUVAddr addr) {
     constexpr u32 bytes_per_pixel = SurfaceParams::GetFormatBpp(format) / CHAR_BIT;
     constexpr u32 gl_bytes_per_pixel = CachedSurface::GetGLBytesPerPixel(format);
     const auto& gpu = Core::System::GetInstance().GPU();
 
     if (morton_to_gl) {
-        if (SurfaceParams::GetFormatType(format) == SurfaceType::ColorTexture) {
-            auto data = Tegra::Texture::UnswizzleTexture(
-                *gpu.memory_manager->GpuToCpuAddress(addr),
-                SurfaceParams::TextureFormatFromPixelFormat(format), stride, height, block_height);
-            std::memcpy(gl_buffer, data.data(), data.size());
-        } else {
-            auto data = Tegra::Texture::UnswizzleDepthTexture(
-                *gpu.memory_manager->GpuToCpuAddress(addr),
-                SurfaceParams::DepthFormatFromPixelFormat(format), stride, height, block_height);
-            std::memcpy(gl_buffer, data.data(), data.size());
-        }
+        // With the BCn formats (DXT and DXN), each 4x4 tile is swizzled instead of just individual
+        // pixel values.
+        const u32 tile_size{IsFormatBCn(format) ? 4U : 1U};
+        const std::vector<u8> data =
+            Tegra::Texture::UnswizzleTexture(*gpu.memory_manager->GpuToCpuAddress(addr), tile_size,
+                                             bytes_per_pixel, stride, height, block_height);
+        const size_t size_to_copy{std::min(gl_buffer.size(), data.size())};
+        gl_buffer.assign(data.begin(), data.begin() + size_to_copy);
     } else {
         // TODO(bunnei): Assumes the default rendering GOB size of 16 (128 lines). We should
         // check the configuration for this and perform more generic un/swizzle
         LOG_WARNING(Render_OpenGL, "need to use correct swizzle/GOB parameters!");
         VideoCore::MortonCopyPixels128(
             stride, height, bytes_per_pixel, gl_bytes_per_pixel,
-            Memory::GetPointer(*gpu.memory_manager->GpuToCpuAddress(addr)), gl_buffer,
+            Memory::GetPointer(*gpu.memory_manager->GpuToCpuAddress(addr)), gl_buffer.data(),
             morton_to_gl);
     }
 }
 
-static constexpr std::array<void (*)(u32, u32, u32, u8*, Tegra::GPUVAddr),
+static constexpr std::array<void (*)(u32, u32, u32, std::vector<u8>&, Tegra::GPUVAddr),
                             SurfaceParams::MaxPixelFormat>
     morton_to_gl_fns = {
-        MortonCopy<true, PixelFormat::ABGR8>,        MortonCopy<true, PixelFormat::B5G6R5>,
-        MortonCopy<true, PixelFormat::A2B10G10R10>,  MortonCopy<true, PixelFormat::A1B5G5R5>,
-        MortonCopy<true, PixelFormat::R8>,           MortonCopy<true, PixelFormat::RGBA16F>,
-        MortonCopy<true, PixelFormat::R11FG11FB10F>, MortonCopy<true, PixelFormat::RGBA32UI>,
-        MortonCopy<true, PixelFormat::DXT1>,         MortonCopy<true, PixelFormat::DXT23>,
-        MortonCopy<true, PixelFormat::DXT45>,        MortonCopy<true, PixelFormat::DXN1>,
-        MortonCopy<true, PixelFormat::BC7U>,         MortonCopy<true, PixelFormat::ASTC_2D_4X4>,
-        MortonCopy<true, PixelFormat::G8R8>,         MortonCopy<true, PixelFormat::BGRA8>,
-        MortonCopy<true, PixelFormat::RGBA32F>,      MortonCopy<true, PixelFormat::RG32F>,
-        MortonCopy<true, PixelFormat::R32F>,         MortonCopy<true, PixelFormat::R16F>,
-        MortonCopy<true, PixelFormat::R16UNORM>,     MortonCopy<true, PixelFormat::RG16>,
-        MortonCopy<true, PixelFormat::RG16F>,        MortonCopy<true, PixelFormat::RG16UI>,
-        MortonCopy<true, PixelFormat::RG16I>,        MortonCopy<true, PixelFormat::RG16S>,
-        MortonCopy<true, PixelFormat::RGB32F>,       MortonCopy<true, PixelFormat::SRGBA8>,
-        MortonCopy<true, PixelFormat::Z24S8>,        MortonCopy<true, PixelFormat::S8Z24>,
-        MortonCopy<true, PixelFormat::Z32F>,         MortonCopy<true, PixelFormat::Z16>,
+        // clang-format off
+        MortonCopy<true, PixelFormat::ABGR8U>,
+        MortonCopy<true, PixelFormat::ABGR8S>,
+        MortonCopy<true, PixelFormat::B5G6R5>,
+        MortonCopy<true, PixelFormat::A2B10G10R10>,
+        MortonCopy<true, PixelFormat::A1B5G5R5>,
+        MortonCopy<true, PixelFormat::R8>,
+        MortonCopy<true, PixelFormat::R8UI>,
+        MortonCopy<true, PixelFormat::RGBA16F>,
+        MortonCopy<true, PixelFormat::RGBA16U>,
+        MortonCopy<true, PixelFormat::RGBA16UI>,
+        MortonCopy<true, PixelFormat::R11FG11FB10F>,
+        MortonCopy<true, PixelFormat::RGBA32UI>,
+        MortonCopy<true, PixelFormat::DXT1>,
+        MortonCopy<true, PixelFormat::DXT23>,
+        MortonCopy<true, PixelFormat::DXT45>,
+        MortonCopy<true, PixelFormat::DXN1>,
+        MortonCopy<true, PixelFormat::DXN2UNORM>,
+        MortonCopy<true, PixelFormat::DXN2SNORM>,
+        MortonCopy<true, PixelFormat::BC7U>,
+        MortonCopy<true, PixelFormat::ASTC_2D_4X4>,
+        MortonCopy<true, PixelFormat::G8R8>,
+        MortonCopy<true, PixelFormat::BGRA8>,
+        MortonCopy<true, PixelFormat::RGBA32F>,
+        MortonCopy<true, PixelFormat::RG32F>,
+        MortonCopy<true, PixelFormat::R32F>,
+        MortonCopy<true, PixelFormat::R16F>,
+        MortonCopy<true, PixelFormat::R16UNORM>,
+        MortonCopy<true, PixelFormat::R16S>,
+        MortonCopy<true, PixelFormat::R16UI>,
+        MortonCopy<true, PixelFormat::R16I>,
+        MortonCopy<true, PixelFormat::RG16>,
+        MortonCopy<true, PixelFormat::RG16F>,
+        MortonCopy<true, PixelFormat::RG16UI>,
+        MortonCopy<true, PixelFormat::RG16I>,
+        MortonCopy<true, PixelFormat::RG16S>,
+        MortonCopy<true, PixelFormat::RGB32F>,
+        MortonCopy<true, PixelFormat::SRGBA8>,
+        MortonCopy<true, PixelFormat::RG8U>,
+        MortonCopy<true, PixelFormat::RG8S>,
+        MortonCopy<true, PixelFormat::RG32UI>,
+        MortonCopy<true, PixelFormat::R32UI>,
+        MortonCopy<true, PixelFormat::Z24S8>,
+        MortonCopy<true, PixelFormat::S8Z24>,
+        MortonCopy<true, PixelFormat::Z32F>,
+        MortonCopy<true, PixelFormat::Z16>,
         MortonCopy<true, PixelFormat::Z32FS8>,
+        // clang-format on
 };
 
-static constexpr std::array<void (*)(u32, u32, u32, u8*, Tegra::GPUVAddr),
+static constexpr std::array<void (*)(u32, u32, u32, std::vector<u8>&, Tegra::GPUVAddr),
                             SurfaceParams::MaxPixelFormat>
     gl_to_morton_fns = {
-        MortonCopy<false, PixelFormat::ABGR8>,
+        // clang-format off
+        MortonCopy<false, PixelFormat::ABGR8U>,
+        MortonCopy<false, PixelFormat::ABGR8S>,
         MortonCopy<false, PixelFormat::B5G6R5>,
         MortonCopy<false, PixelFormat::A2B10G10R10>,
         MortonCopy<false, PixelFormat::A1B5G5R5>,
         MortonCopy<false, PixelFormat::R8>,
+        MortonCopy<false, PixelFormat::R8UI>,
         MortonCopy<false, PixelFormat::RGBA16F>,
+        MortonCopy<false, PixelFormat::RGBA16U>,
+        MortonCopy<false, PixelFormat::RGBA16UI>,
         MortonCopy<false, PixelFormat::R11FG11FB10F>,
         MortonCopy<false, PixelFormat::RGBA32UI>,
-        // TODO(Subv): Swizzling DXT1/DXT23/DXT45/DXN1/BC7U/ASTC_2D_4X4 formats is not supported
+        // TODO(Subv): Swizzling DXT1/DXT23/DXT45/DXN1/DXN2/BC7U/ASTC_2D_4X4 formats is not
+        // supported
+        nullptr,
+        nullptr,
         nullptr,
         nullptr,
         nullptr,
@@ -250,6 +322,9 @@ static constexpr std::array<void (*)(u32, u32, u32, u8*, Tegra::GPUVAddr),
         MortonCopy<false, PixelFormat::R32F>,
         MortonCopy<false, PixelFormat::R16F>,
         MortonCopy<false, PixelFormat::R16UNORM>,
+        MortonCopy<false, PixelFormat::R16S>,
+        MortonCopy<false, PixelFormat::R16UI>,
+        MortonCopy<false, PixelFormat::R16I>,
         MortonCopy<false, PixelFormat::RG16>,
         MortonCopy<false, PixelFormat::RG16F>,
         MortonCopy<false, PixelFormat::RG16UI>,
@@ -257,11 +332,16 @@ static constexpr std::array<void (*)(u32, u32, u32, u8*, Tegra::GPUVAddr),
         MortonCopy<false, PixelFormat::RG16S>,
         MortonCopy<false, PixelFormat::RGB32F>,
         MortonCopy<false, PixelFormat::SRGBA8>,
+        MortonCopy<false, PixelFormat::RG8U>,
+        MortonCopy<false, PixelFormat::RG8S>,
+        MortonCopy<false, PixelFormat::RG32UI>,
+        MortonCopy<false, PixelFormat::R32UI>,
         MortonCopy<false, PixelFormat::Z24S8>,
         MortonCopy<false, PixelFormat::S8Z24>,
         MortonCopy<false, PixelFormat::Z32F>,
         MortonCopy<false, PixelFormat::Z16>,
         MortonCopy<false, PixelFormat::Z32FS8>,
+        // clang-format on
 };
 
 // Allocate an uninitialized texture of appropriate size and format for the surface
@@ -441,22 +521,24 @@ MICROPROFILE_DEFINE(OpenGL_SurfaceLoad, "OpenGL", "Surface Load", MP_RGB(128, 64
 void CachedSurface::LoadGLBuffer() {
     ASSERT(params.type != SurfaceType::Fill);
 
-    u8* const texture_src_data = Memory::GetPointer(params.GetCpuAddr());
+    const u8* const texture_src_data = Memory::GetPointer(params.GetCpuAddr());
 
     ASSERT(texture_src_data);
 
-    gl_buffer.resize(params.width * params.height * GetGLBytesPerPixel(params.pixel_format));
+    const u32 bytes_per_pixel = GetGLBytesPerPixel(params.pixel_format);
+    const u32 copy_size = params.width * params.height * bytes_per_pixel;
 
     MICROPROFILE_SCOPE(OpenGL_SurfaceLoad);
 
-    if (!params.is_tiled) {
-        const u32 bytes_per_pixel{params.GetFormatBpp() >> 3};
+    if (params.is_tiled) {
+        gl_buffer.resize(copy_size);
 
-        std::memcpy(gl_buffer.data(), texture_src_data,
-                    bytes_per_pixel * params.width * params.height);
-    } else {
         morton_to_gl_fns[static_cast<size_t>(params.pixel_format)](
-            params.width, params.block_height, params.height, gl_buffer.data(), params.addr);
+            params.width, params.block_height, params.height, gl_buffer, params.addr);
+    } else {
+        const u8* const texture_src_data_end = texture_src_data + copy_size;
+
+        gl_buffer.assign(texture_src_data, texture_src_data_end);
     }
 
     ConvertFormatAsNeeded_LoadGLBuffer(gl_buffer, params.pixel_format, params.width, params.height);
@@ -479,7 +561,7 @@ void CachedSurface::FlushGLBuffer() {
         std::memcpy(dst_buffer, gl_buffer.data(), params.size_in_bytes);
     } else {
         gl_to_morton_fns[static_cast<size_t>(params.pixel_format)](
-            params.width, params.block_height, params.height, gl_buffer.data(), params.addr);
+            params.width, params.block_height, params.height, gl_buffer, params.addr);
     }
 }
 
@@ -594,8 +676,8 @@ Surface RasterizerCacheOpenGL::GetTextureSurface(const Tegra::Texture::FullTextu
     return GetSurface(SurfaceParams::CreateForTexture(config));
 }
 
-SurfaceSurfaceRect_Tuple RasterizerCacheOpenGL::GetFramebufferSurfaces(
-    bool using_color_fb, bool using_depth_fb, const MathUtil::Rectangle<s32>& viewport) {
+SurfaceSurfaceRect_Tuple RasterizerCacheOpenGL::GetFramebufferSurfaces(bool using_color_fb,
+                                                                       bool using_depth_fb) {
     const auto& regs = Core::System::GetInstance().GPU().Maxwell3D().regs;
 
     // TODO(bunnei): This is hard corded to use just the first render buffer
@@ -680,12 +762,12 @@ Surface RasterizerCacheOpenGL::GetSurface(const SurfaceParams& params) {
             // If use_accurate_framebuffers is enabled, always load from memory
             FlushSurface(surface);
             UnregisterSurface(surface);
-        } else if (surface->GetSurfaceParams() != params) {
-            // If surface parameters changed, recreate the surface from the old one
-            return RecreateSurface(surface, params);
-        } else {
+        } else if (surface->GetSurfaceParams().IsCompatibleSurface(params)) {
             // Use the cached surface as-is
             return surface;
+        } else {
+            // If surface parameters changed, recreate the surface from the old one
+            return RecreateSurface(surface, params);
         }
     }
 
@@ -751,10 +833,12 @@ void RasterizerCacheOpenGL::FlushRegion(Tegra::GPUVAddr /*addr*/, size_t /*size*
 }
 
 void RasterizerCacheOpenGL::InvalidateRegion(Tegra::GPUVAddr addr, size_t size) {
-    for (const auto& pair : surface_cache) {
-        const auto& surface{pair.second};
+    for (auto iter = surface_cache.cbegin(); iter != surface_cache.cend();) {
+        const auto& surface{iter->second};
         const auto& params{surface->GetSurfaceParams()};
 
+        ++iter;
+
         if (params.IsOverlappingRegion(addr, size)) {
             UnregisterSurface(surface);
         }
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.h b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
index 4e1e18d9c..36a41522b 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
@@ -9,6 +9,7 @@
 #include <memory>
 #include <vector>
 #include <boost/icl/interval_map.hpp>
+
 #include "common/common_types.h"
 #include "common/math_util.h"
 #include "video_core/engines/maxwell_3d.h"
@@ -22,43 +23,56 @@ using PageMap = boost::icl::interval_map<u64, int>;
 
 struct SurfaceParams {
     enum class PixelFormat {
-        ABGR8 = 0,
-        B5G6R5 = 1,
-        A2B10G10R10 = 2,
-        A1B5G5R5 = 3,
-        R8 = 4,
-        RGBA16F = 5,
-        R11FG11FB10F = 6,
-        RGBA32UI = 7,
-        DXT1 = 8,
-        DXT23 = 9,
-        DXT45 = 10,
-        DXN1 = 11, // This is also known as BC4
-        BC7U = 12,
-        ASTC_2D_4X4 = 13,
-        G8R8 = 14,
-        BGRA8 = 15,
-        RGBA32F = 16,
-        RG32F = 17,
-        R32F = 18,
-        R16F = 19,
-        R16UNORM = 20,
-        RG16 = 21,
-        RG16F = 22,
-        RG16UI = 23,
-        RG16I = 24,
-        RG16S = 25,
-        RGB32F = 26,
-        SRGBA8 = 27,
+        ABGR8U = 0,
+        ABGR8S = 1,
+        B5G6R5 = 2,
+        A2B10G10R10 = 3,
+        A1B5G5R5 = 4,
+        R8 = 5,
+        R8UI = 6,
+        RGBA16F = 7,
+        RGBA16U = 8,
+        RGBA16UI = 9,
+        R11FG11FB10F = 10,
+        RGBA32UI = 11,
+        DXT1 = 12,
+        DXT23 = 13,
+        DXT45 = 14,
+        DXN1 = 15, // This is also known as BC4
+        DXN2UNORM = 16,
+        DXN2SNORM = 17,
+        BC7U = 18,
+        ASTC_2D_4X4 = 19,
+        G8R8 = 20,
+        BGRA8 = 21,
+        RGBA32F = 22,
+        RG32F = 23,
+        R32F = 24,
+        R16F = 25,
+        R16UNORM = 26,
+        R16S = 27,
+        R16UI = 28,
+        R16I = 29,
+        RG16 = 30,
+        RG16F = 31,
+        RG16UI = 32,
+        RG16I = 33,
+        RG16S = 34,
+        RGB32F = 35,
+        SRGBA8 = 36,
+        RG8U = 37,
+        RG8S = 38,
+        RG32UI = 39,
+        R32UI = 40,
 
         MaxColorFormat,
 
         // DepthStencil formats
-        Z24S8 = 28,
-        S8Z24 = 29,
-        Z32F = 30,
-        Z16 = 31,
-        Z32FS8 = 32,
+        Z24S8 = 41,
+        S8Z24 = 42,
+        Z32F = 43,
+        Z16 = 44,
+        Z32FS8 = 45,
 
         MaxDepthStencilFormat,
 
@@ -96,18 +110,24 @@ struct SurfaceParams {
             return 0;
 
         constexpr std::array<u32, MaxPixelFormat> compression_factor_table = {{
-            1, // ABGR8
+            1, // ABGR8U
+            1, // ABGR8S
             1, // B5G6R5
             1, // A2B10G10R10
             1, // A1B5G5R5
             1, // R8
+            1, // R8UI
             1, // RGBA16F
+            1, // RGBA16U
+            1, // RGBA16UI
             1, // R11FG11FB10F
             1, // RGBA32UI
             4, // DXT1
             4, // DXT23
             4, // DXT45
             4, // DXN1
+            4, // DXN2UNORM
+            4, // DXN2SNORM
             4, // BC7U
             4, // ASTC_2D_4X4
             1, // G8R8
@@ -117,6 +137,9 @@ struct SurfaceParams {
             1, // R32F
             1, // R16F
             1, // R16UNORM
+            1, // R16S
+            1, // R16UI
+            1, // R16I
             1, // RG16
             1, // RG16F
             1, // RG16UI
@@ -124,6 +147,10 @@ struct SurfaceParams {
             1, // RG16S
             1, // RGB32F
             1, // SRGBA8
+            1, // RG8U
+            1, // RG8S
+            1, // RG32UI
+            1, // R32UI
             1, // Z24S8
             1, // S8Z24
             1, // Z32F
@@ -140,18 +167,24 @@ struct SurfaceParams {
             return 0;
 
         constexpr std::array<u32, MaxPixelFormat> bpp_table = {{
-            32,  // ABGR8
+            32,  // ABGR8U
+            32,  // ABGR8S
             16,  // B5G6R5
             32,  // A2B10G10R10
             16,  // A1B5G5R5
             8,   // R8
+            8,   // R8UI
             64,  // RGBA16F
+            64,  // RGBA16U
+            64,  // RGBA16UI
             32,  // R11FG11FB10F
             128, // RGBA32UI
             64,  // DXT1
             128, // DXT23
             128, // DXT45
             64,  // DXN1
+            128, // DXN2UNORM
+            128, // DXN2SNORM
             128, // BC7U
             32,  // ASTC_2D_4X4
             16,  // G8R8
@@ -161,6 +194,9 @@ struct SurfaceParams {
             32,  // R32F
             16,  // R16F
             16,  // R16UNORM
+            16,  // R16S
+            16,  // R16UI
+            16,  // R16I
             32,  // RG16
             32,  // RG16F
             32,  // RG16UI
@@ -168,6 +204,10 @@ struct SurfaceParams {
             32,  // RG16S
             96,  // RGB32F
             32,  // SRGBA8
+            16,  // RG8U
+            16,  // RG8S
+            64,  // RG32UI
+            32,  // R32UI
             32,  // Z24S8
             32,  // S8Z24
             32,  // Z32F
@@ -203,26 +243,37 @@ struct SurfaceParams {
 
     static PixelFormat PixelFormatFromRenderTargetFormat(Tegra::RenderTargetFormat format) {
         switch (format) {
+        // TODO (Hexagon12): Converting SRGBA to RGBA is a hack and doesn't completely correct the
+        // gamma.
         case Tegra::RenderTargetFormat::RGBA8_SRGB:
-            return PixelFormat::SRGBA8;
         case Tegra::RenderTargetFormat::RGBA8_UNORM:
-            return PixelFormat::ABGR8;
+            return PixelFormat::ABGR8U;
+        case Tegra::RenderTargetFormat::RGBA8_SNORM:
+            return PixelFormat::ABGR8S;
         case Tegra::RenderTargetFormat::BGRA8_UNORM:
             return PixelFormat::BGRA8;
         case Tegra::RenderTargetFormat::RGB10_A2_UNORM:
             return PixelFormat::A2B10G10R10;
         case Tegra::RenderTargetFormat::RGBA16_FLOAT:
             return PixelFormat::RGBA16F;
+        case Tegra::RenderTargetFormat::RGBA16_UNORM:
+            return PixelFormat::RGBA16U;
+        case Tegra::RenderTargetFormat::RGBA16_UINT:
+            return PixelFormat::RGBA16UI;
         case Tegra::RenderTargetFormat::RGBA32_FLOAT:
             return PixelFormat::RGBA32F;
         case Tegra::RenderTargetFormat::RG32_FLOAT:
             return PixelFormat::RG32F;
         case Tegra::RenderTargetFormat::R11G11B10_FLOAT:
             return PixelFormat::R11FG11FB10F;
+        case Tegra::RenderTargetFormat::B5G6R5_UNORM:
+            return PixelFormat::B5G6R5;
         case Tegra::RenderTargetFormat::RGBA32_UINT:
             return PixelFormat::RGBA32UI;
         case Tegra::RenderTargetFormat::R8_UNORM:
             return PixelFormat::R8;
+        case Tegra::RenderTargetFormat::R8_UINT:
+            return PixelFormat::R8UI;
         case Tegra::RenderTargetFormat::RG16_FLOAT:
             return PixelFormat::RG16F;
         case Tegra::RenderTargetFormat::RG16_UINT:
@@ -233,10 +284,26 @@ struct SurfaceParams {
             return PixelFormat::RG16;
         case Tegra::RenderTargetFormat::RG16_SNORM:
             return PixelFormat::RG16S;
+        case Tegra::RenderTargetFormat::RG8_UNORM:
+            return PixelFormat::RG8U;
+        case Tegra::RenderTargetFormat::RG8_SNORM:
+            return PixelFormat::RG8S;
         case Tegra::RenderTargetFormat::R16_FLOAT:
             return PixelFormat::R16F;
+        case Tegra::RenderTargetFormat::R16_UNORM:
+            return PixelFormat::R16UNORM;
+        case Tegra::RenderTargetFormat::R16_SNORM:
+            return PixelFormat::R16S;
+        case Tegra::RenderTargetFormat::R16_UINT:
+            return PixelFormat::R16UI;
+        case Tegra::RenderTargetFormat::R16_SINT:
+            return PixelFormat::R16I;
         case Tegra::RenderTargetFormat::R32_FLOAT:
             return PixelFormat::R32F;
+        case Tegra::RenderTargetFormat::R32_UINT:
+            return PixelFormat::R32UI;
+        case Tegra::RenderTargetFormat::RG32_UINT:
+            return PixelFormat::RG32UI;
         default:
             LOG_CRITICAL(HW_GPU, "Unimplemented format={}", static_cast<u32>(format));
             UNREACHABLE();
@@ -248,7 +315,15 @@ struct SurfaceParams {
         // TODO(Subv): Properly implement this
         switch (format) {
         case Tegra::Texture::TextureFormat::A8R8G8B8:
-            return PixelFormat::ABGR8;
+            switch (component_type) {
+            case Tegra::Texture::ComponentType::UNORM:
+                return PixelFormat::ABGR8U;
+            case Tegra::Texture::ComponentType::SNORM:
+                return PixelFormat::ABGR8S;
+            }
+            LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}",
+                         static_cast<u32>(component_type));
+            UNREACHABLE();
         case Tegra::Texture::TextureFormat::B5G6R5:
             return PixelFormat::B5G6R5;
         case Tegra::Texture::TextureFormat::A2B10G10R10:
@@ -256,7 +331,15 @@ struct SurfaceParams {
         case Tegra::Texture::TextureFormat::A1B5G5R5:
             return PixelFormat::A1B5G5R5;
         case Tegra::Texture::TextureFormat::R8:
-            return PixelFormat::R8;
+            switch (component_type) {
+            case Tegra::Texture::ComponentType::UNORM:
+                return PixelFormat::R8;
+            case Tegra::Texture::ComponentType::UINT:
+                return PixelFormat::R8UI;
+            }
+            LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}",
+                         static_cast<u32>(component_type));
+            UNREACHABLE();
         case Tegra::Texture::TextureFormat::G8R8:
             return PixelFormat::G8R8;
         case Tegra::Texture::TextureFormat::R16_G16_B16_A16:
@@ -274,7 +357,15 @@ struct SurfaceParams {
                          static_cast<u32>(component_type));
             UNREACHABLE();
         case Tegra::Texture::TextureFormat::R32_G32:
-            return PixelFormat::RG32F;
+            switch (component_type) {
+            case Tegra::Texture::ComponentType::FLOAT:
+                return PixelFormat::RG32F;
+            case Tegra::Texture::ComponentType::UINT:
+                return PixelFormat::RG32UI;
+            }
+            LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}",
+                         static_cast<u32>(component_type));
+            UNREACHABLE();
         case Tegra::Texture::TextureFormat::R32_G32_B32:
             return PixelFormat::RGB32F;
         case Tegra::Texture::TextureFormat::R16:
@@ -283,12 +374,26 @@ struct SurfaceParams {
                 return PixelFormat::R16F;
             case Tegra::Texture::ComponentType::UNORM:
                 return PixelFormat::R16UNORM;
+            case Tegra::Texture::ComponentType::SNORM:
+                return PixelFormat::R16S;
+            case Tegra::Texture::ComponentType::UINT:
+                return PixelFormat::R16UI;
+            case Tegra::Texture::ComponentType::SINT:
+                return PixelFormat::R16I;
             }
             LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}",
                          static_cast<u32>(component_type));
             UNREACHABLE();
         case Tegra::Texture::TextureFormat::R32:
-            return PixelFormat::R32F;
+            switch (component_type) {
+            case Tegra::Texture::ComponentType::FLOAT:
+                return PixelFormat::R32F;
+            case Tegra::Texture::ComponentType::UINT:
+                return PixelFormat::R32UI;
+            }
+            LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}",
+                         static_cast<u32>(component_type));
+            UNREACHABLE();
         case Tegra::Texture::TextureFormat::ZF32:
             return PixelFormat::Z32F;
         case Tegra::Texture::TextureFormat::Z24S8:
@@ -301,6 +406,16 @@ struct SurfaceParams {
             return PixelFormat::DXT45;
         case Tegra::Texture::TextureFormat::DXN1:
             return PixelFormat::DXN1;
+        case Tegra::Texture::TextureFormat::DXN2:
+            switch (component_type) {
+            case Tegra::Texture::ComponentType::UNORM:
+                return PixelFormat::DXN2UNORM;
+            case Tegra::Texture::ComponentType::SNORM:
+                return PixelFormat::DXN2SNORM;
+            }
+            LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}",
+                         static_cast<u32>(component_type));
+            UNREACHABLE();
         case Tegra::Texture::TextureFormat::BC7U:
             return PixelFormat::BC7U;
         case Tegra::Texture::TextureFormat::ASTC_2D_4X4:
@@ -328,89 +443,6 @@ struct SurfaceParams {
         }
     }
 
-    static Tegra::Texture::TextureFormat TextureFormatFromPixelFormat(PixelFormat format) {
-        // TODO(Subv): Properly implement this
-        switch (format) {
-        case PixelFormat::ABGR8:
-        case PixelFormat::SRGBA8:
-            return Tegra::Texture::TextureFormat::A8R8G8B8;
-        case PixelFormat::B5G6R5:
-            return Tegra::Texture::TextureFormat::B5G6R5;
-        case PixelFormat::A2B10G10R10:
-            return Tegra::Texture::TextureFormat::A2B10G10R10;
-        case PixelFormat::A1B5G5R5:
-            return Tegra::Texture::TextureFormat::A1B5G5R5;
-        case PixelFormat::R8:
-            return Tegra::Texture::TextureFormat::R8;
-        case PixelFormat::G8R8:
-            return Tegra::Texture::TextureFormat::G8R8;
-        case PixelFormat::RGBA16F:
-            return Tegra::Texture::TextureFormat::R16_G16_B16_A16;
-        case PixelFormat::R11FG11FB10F:
-            return Tegra::Texture::TextureFormat::BF10GF11RF11;
-        case PixelFormat::RGBA32UI:
-            return Tegra::Texture::TextureFormat::R32_G32_B32_A32;
-        case PixelFormat::DXT1:
-            return Tegra::Texture::TextureFormat::DXT1;
-        case PixelFormat::DXT23:
-            return Tegra::Texture::TextureFormat::DXT23;
-        case PixelFormat::DXT45:
-            return Tegra::Texture::TextureFormat::DXT45;
-        case PixelFormat::DXN1:
-            return Tegra::Texture::TextureFormat::DXN1;
-        case PixelFormat::BC7U:
-            return Tegra::Texture::TextureFormat::BC7U;
-        case PixelFormat::ASTC_2D_4X4:
-            return Tegra::Texture::TextureFormat::ASTC_2D_4X4;
-        case PixelFormat::BGRA8:
-            // TODO(bunnei): This is fine for unswizzling (since we just need the right component
-            // sizes), but could be a bug if we used this function in different ways.
-            return Tegra::Texture::TextureFormat::A8R8G8B8;
-        case PixelFormat::RGBA32F:
-            return Tegra::Texture::TextureFormat::R32_G32_B32_A32;
-        case PixelFormat::RGB32F:
-            return Tegra::Texture::TextureFormat::R32_G32_B32;
-        case PixelFormat::RG32F:
-            return Tegra::Texture::TextureFormat::R32_G32;
-        case PixelFormat::R32F:
-            return Tegra::Texture::TextureFormat::R32;
-        case PixelFormat::R16F:
-        case PixelFormat::R16UNORM:
-            return Tegra::Texture::TextureFormat::R16;
-        case PixelFormat::Z32F:
-            return Tegra::Texture::TextureFormat::ZF32;
-        case PixelFormat::Z24S8:
-            return Tegra::Texture::TextureFormat::Z24S8;
-        case PixelFormat::RG16F:
-        case PixelFormat::RG16:
-        case PixelFormat::RG16UI:
-        case PixelFormat::RG16I:
-        case PixelFormat::RG16S:
-            return Tegra::Texture::TextureFormat::R16_G16;
-        default:
-            LOG_CRITICAL(HW_GPU, "Unimplemented format={}", static_cast<u32>(format));
-            UNREACHABLE();
-        }
-    }
-
-    static Tegra::DepthFormat DepthFormatFromPixelFormat(PixelFormat format) {
-        switch (format) {
-        case PixelFormat::S8Z24:
-            return Tegra::DepthFormat::S8_Z24_UNORM;
-        case PixelFormat::Z24S8:
-            return Tegra::DepthFormat::Z24_S8_UNORM;
-        case PixelFormat::Z32F:
-            return Tegra::DepthFormat::Z32_FLOAT;
-        case PixelFormat::Z16:
-            return Tegra::DepthFormat::Z16_UNORM;
-        case PixelFormat::Z32FS8:
-            return Tegra::DepthFormat::Z32_S8_X24_FLOAT;
-        default:
-            LOG_CRITICAL(HW_GPU, "Unimplemented format={}", static_cast<u32>(format));
-            UNREACHABLE();
-        }
-    }
-
     static ComponentType ComponentTypeFromTexture(Tegra::Texture::ComponentType type) {
         // TODO(Subv): Implement more component types
         switch (type) {
@@ -439,8 +471,15 @@ struct SurfaceParams {
         case Tegra::RenderTargetFormat::RGB10_A2_UNORM:
         case Tegra::RenderTargetFormat::R8_UNORM:
         case Tegra::RenderTargetFormat::RG16_UNORM:
+        case Tegra::RenderTargetFormat::R16_UNORM:
+        case Tegra::RenderTargetFormat::B5G6R5_UNORM:
+        case Tegra::RenderTargetFormat::RG8_UNORM:
+        case Tegra::RenderTargetFormat::RGBA16_UNORM:
             return ComponentType::UNorm;
+        case Tegra::RenderTargetFormat::RGBA8_SNORM:
         case Tegra::RenderTargetFormat::RG16_SNORM:
+        case Tegra::RenderTargetFormat::R16_SNORM:
+        case Tegra::RenderTargetFormat::RG8_SNORM:
             return ComponentType::SNorm;
         case Tegra::RenderTargetFormat::RGBA16_FLOAT:
         case Tegra::RenderTargetFormat::R11G11B10_FLOAT:
@@ -451,9 +490,15 @@ struct SurfaceParams {
         case Tegra::RenderTargetFormat::R32_FLOAT:
             return ComponentType::Float;
         case Tegra::RenderTargetFormat::RGBA32_UINT:
+        case Tegra::RenderTargetFormat::RGBA16_UINT:
         case Tegra::RenderTargetFormat::RG16_UINT:
+        case Tegra::RenderTargetFormat::R8_UINT:
+        case Tegra::RenderTargetFormat::R16_UINT:
+        case Tegra::RenderTargetFormat::RG32_UINT:
+        case Tegra::RenderTargetFormat::R32_UINT:
             return ComponentType::UInt;
         case Tegra::RenderTargetFormat::RG16_SINT:
+        case Tegra::RenderTargetFormat::R16_SINT:
             return ComponentType::SInt;
         default:
             LOG_CRITICAL(HW_GPU, "Unimplemented format={}", static_cast<u32>(format));
@@ -464,7 +509,7 @@ struct SurfaceParams {
     static PixelFormat PixelFormatFromGPUPixelFormat(Tegra::FramebufferConfig::PixelFormat format) {
         switch (format) {
         case Tegra::FramebufferConfig::PixelFormat::ABGR8:
-            return PixelFormat::ABGR8;
+            return PixelFormat::ABGR8U;
         default:
             LOG_CRITICAL(HW_GPU, "Unimplemented format={}", static_cast<u32>(format));
             UNREACHABLE();
@@ -546,6 +591,12 @@ struct SurfaceParams {
         return !operator==(other);
     }
 
+    /// Checks if surfaces are compatible for caching (same pixel format, type and cache size)
+    bool IsCompatibleSurface(const SurfaceParams& other) const {
+        return std::tie(pixel_format, type, cache_width, cache_height) ==
+               std::tie(other.pixel_format, other.type, other.cache_width, other.cache_height);
+    }
+
     Tegra::GPUVAddr addr;
     bool is_tiled;
     u32 block_height;
@@ -556,6 +607,10 @@ struct SurfaceParams {
     u32 height;
     u32 unaligned_height;
     size_t size_in_bytes;
+
+    // Parameters used for caching only
+    u32 cache_width;
+    u32 cache_height;
 };
 
 class CachedSurface final {
@@ -600,8 +655,7 @@ public:
     Surface GetTextureSurface(const Tegra::Texture::FullTextureInfo& config);
 
     /// Get the color and depth surfaces based on the framebuffer configuration
-    SurfaceSurfaceRect_Tuple GetFramebufferSurfaces(bool using_color_fb, bool using_depth_fb,
-                                                    const MathUtil::Rectangle<s32>& viewport);
+    SurfaceSurfaceRect_Tuple GetFramebufferSurfaces(bool using_color_fb, bool using_depth_fb);
 
     /// Flushes the surface to Switch memory
     void FlushSurface(const Surface& surface);
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index e3217db81..6834d7085 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -141,6 +141,15 @@ private:
                     ExitMethod jmp = Scan(target, end, labels);
                     return exit_method = ParallelExit(no_jmp, jmp);
                 }
+                case OpCode::Id::SSY: {
+                    // The SSY instruction uses an encoding similar to the BRA instruction.
+                    ASSERT_MSG(instr.bra.constant_buffer == 0,
+                               "Constant buffer SSY is not supported");
+                    u32 target = offset + instr.bra.GetBranchTarget();
+                    labels.insert(target);
+                    // Continue scanning for an exit method.
+                    break;
+                }
                 }
             }
         }
@@ -347,9 +356,14 @@ public:
      * @param reg The register to use as the source value.
      */
     void SetOutputAttributeToRegister(Attribute::Index attribute, u64 elem, const Register& reg) {
-        std::string dest = GetOutputAttribute(attribute) + GetSwizzle(elem);
+        std::string dest = GetOutputAttribute(attribute);
         std::string src = GetRegisterAsFloat(reg);
-        shader.AddLine(dest + " = " + src + ';');
+
+        if (!dest.empty()) {
+            // dest is empty for unknown/unimplemented output attributes; in that case we ignore
+            // the instruction for now and only emit the assignment otherwise.
+            shader.AddLine(dest + GetSwizzle(elem) + " = " + src + ';');
+        }
     }
 
     /// Generates code representing a uniform (C buffer) register, interpreted as the input type.
@@ -362,6 +376,8 @@ public:
             return value;
         } else if (type == GLSLRegister::Type::Integer) {
             return "floatBitsToInt(" + value + ')';
+        } else if (type == GLSLRegister::Type::UnsignedInteger) {
+            return "floatBitsToUint(" + value + ')';
         } else {
             UNREACHABLE();
         }
@@ -507,6 +523,8 @@ private:
 
     /// Build the GLSL register list.
     void BuildRegisterList() {
+        regs.reserve(Register::NumRegisters);
+
         for (size_t index = 0; index < Register::NumRegisters; ++index) {
             regs.emplace_back(index, suffix);
         }
@@ -526,14 +544,17 @@ private:
         default:
             const u32 index{static_cast<u32>(attribute) -
                             static_cast<u32>(Attribute::Index::Attribute_0)};
-            if (attribute >= Attribute::Index::Attribute_0) {
+            if (attribute >= Attribute::Index::Attribute_0 &&
+                attribute <= Attribute::Index::Attribute_31) {
                 declr_input_attribute.insert(attribute);
                 return "input_attribute_" + std::to_string(index);
             }
 
-            LOG_CRITICAL(HW_GPU, "Unhandled input attribute: {}", index);
+            LOG_CRITICAL(HW_GPU, "Unhandled input attribute: {}", static_cast<u32>(attribute));
             UNREACHABLE();
         }
+
+        return "vec4(0, 0, 0, 0)";
     }
 
     /// Generates code representing an output attribute register.
@@ -551,6 +572,7 @@ private:
 
             LOG_CRITICAL(HW_GPU, "Unhandled output attribute: {}", index);
             UNREACHABLE();
+            return {};
         }
     }
 
@@ -602,12 +624,12 @@ private:
 
     /// Generates code representing a 19-bit immediate value
     static std::string GetImmediate19(const Instruction& instr) {
-        return std::to_string(instr.alu.GetImm20_19());
+        return fmt::format("uintBitsToFloat({})", instr.alu.GetImm20_19());
     }
 
     /// Generates code representing a 32-bit immediate value
     static std::string GetImmediate32(const Instruction& instr) {
-        return std::to_string(instr.alu.GetImm20_32());
+        return fmt::format("uintBitsToFloat({})", instr.alu.GetImm20_32());
     }
 
     /// Generates code representing a texture sampler.
@@ -650,16 +672,17 @@ private:
      * @param instr Instruction to generate the if condition for.
      * @returns string containing the predicate condition.
      */
-    std::string GetPredicateCondition(u64 index, bool negate) const {
+    std::string GetPredicateCondition(u64 index, bool negate) {
         using Tegra::Shader::Pred;
         std::string variable;
 
         // Index 7 is used as an 'Always True' condition.
-        if (index == static_cast<u64>(Pred::UnusedIndex))
+        if (index == static_cast<u64>(Pred::UnusedIndex)) {
             variable = "true";
-        else
+        } else {
             variable = 'p' + std::to_string(index) + '_' + suffix;
-
+            declr_predicates.insert(variable);
+        }
         if (negate) {
             return "!(" + variable + ')';
         }
@@ -818,7 +841,11 @@ private:
         ASSERT_MSG(instr.pred.full_pred != Pred::NeverExecute,
                    "NeverExecute predicate not implemented");
 
-        if (instr.pred.pred_index != static_cast<u64>(Pred::UnusedIndex)) {
+        // Some instructions (like SSY) don't have a predicate field; they are always executed
+        // unconditionally.
+        bool can_be_predicated = OpCode::IsPredicatedInstruction(opcode->GetId());
+
+        if (can_be_predicated && instr.pred.pred_index != static_cast<u64>(Pred::UnusedIndex)) {
             shader.AddLine("if (" +
                            GetPredicateCondition(instr.pred.pred_index, instr.negate_pred != 0) +
                            ')');
@@ -1605,6 +1632,99 @@ private:
             }
             break;
         }
+        case OpCode::Type::Xmad: {
+            ASSERT_MSG(!instr.xmad.sign_a, "Unimplemented");
+            ASSERT_MSG(!instr.xmad.sign_b, "Unimplemented");
+
+            std::string op_a{regs.GetRegisterAsInteger(instr.gpr8, 0, instr.xmad.sign_a)};
+            std::string op_b;
+            std::string op_c;
+
+            // TODO(bunnei): Needs to be fixed once op_a or op_b is signed
+            ASSERT_MSG(instr.xmad.sign_a == instr.xmad.sign_b, "Unimplemented");
+            const bool is_signed{instr.xmad.sign_a == 1};
+
+            bool is_merge{};
+            switch (opcode->GetId()) {
+            case OpCode::Id::XMAD_CR: {
+                is_merge = instr.xmad.merge_56;
+                op_b += regs.GetUniform(instr.cbuf34.index, instr.cbuf34.offset,
+                                        instr.xmad.sign_b ? GLSLRegister::Type::Integer
+                                                          : GLSLRegister::Type::UnsignedInteger);
+                op_c += regs.GetRegisterAsInteger(instr.gpr39, 0, is_signed);
+                break;
+            }
+            case OpCode::Id::XMAD_RR: {
+                is_merge = instr.xmad.merge_37;
+                op_b += regs.GetRegisterAsInteger(instr.gpr20, 0, instr.xmad.sign_b);
+                op_c += regs.GetRegisterAsInteger(instr.gpr39, 0, is_signed);
+                break;
+            }
+            case OpCode::Id::XMAD_RC: {
+                op_b += regs.GetRegisterAsInteger(instr.gpr39, 0, instr.xmad.sign_b);
+                op_c += regs.GetUniform(instr.cbuf34.index, instr.cbuf34.offset,
+                                        is_signed ? GLSLRegister::Type::Integer
+                                                  : GLSLRegister::Type::UnsignedInteger);
+                break;
+            }
+            case OpCode::Id::XMAD_IMM: {
+                is_merge = instr.xmad.merge_37;
+                op_b += std::to_string(instr.xmad.imm20_16);
+                op_c += regs.GetRegisterAsInteger(instr.gpr39, 0, is_signed);
+                break;
+            }
+            default: {
+                LOG_CRITICAL(HW_GPU, "Unhandled XMAD instruction: {}", opcode->GetName());
+                UNREACHABLE();
+            }
+            }
+
+            // TODO(bunnei): Ensure this is right with signed operands
+            if (instr.xmad.high_a) {
+                op_a = "((" + op_a + ") >> 16)";
+            } else {
+                op_a = "((" + op_a + ") & 0xFFFF)";
+            }
+
+            std::string src2 = '(' + op_b + ')'; // Preserve original source 2
+            if (instr.xmad.high_b) {
+                op_b = '(' + src2 + " >> 16)";
+            } else {
+                op_b = '(' + src2 + " & 0xFFFF)";
+            }
+
+            std::string product = '(' + op_a + " * " + op_b + ')';
+            if (instr.xmad.product_shift_left) {
+                product = '(' + product + " << 16)";
+            }
+
+            switch (instr.xmad.mode) {
+            case Tegra::Shader::XmadMode::None:
+                break;
+            case Tegra::Shader::XmadMode::CLo:
+                op_c = "((" + op_c + ") & 0xFFFF)";
+                break;
+            case Tegra::Shader::XmadMode::CHi:
+                op_c = "((" + op_c + ") >> 16)";
+                break;
+            case Tegra::Shader::XmadMode::CBcc:
+                op_c = "((" + op_c + ") + (" + src2 + "<< 16))";
+                break;
+            default: {
+                LOG_CRITICAL(HW_GPU, "Unhandled XMAD mode: {}",
+                             static_cast<u32>(instr.xmad.mode.Value()));
+                UNREACHABLE();
+            }
+            }
+
+            std::string sum{'(' + product + " + " + op_c + ')'};
+            if (is_merge) {
+                sum = "((" + sum + " & 0xFFFF) | (" + src2 + "<< 16))";
+            }
+
+            regs.SetRegisterToInteger(instr.gpr0, is_signed, 0, sum, 1, 1);
+            break;
+        }
         default: {
             switch (opcode->GetId()) {
             case OpCode::Id::EXIT: {
@@ -1642,7 +1762,15 @@ private:
             }
             case OpCode::Id::KIL: {
                 ASSERT(instr.flow.cond == Tegra::Shader::FlowCondition::Always);
+
+                // Enclose "discard" in a conditional, so that GLSL compilation does not complain
+                // about unexecuted instructions that may follow this.
+                shader.AddLine("if (true) {");
+                ++shader.scope;
                 shader.AddLine("discard;");
+                --shader.scope;
+                shader.AddLine("}");
+
                 break;
             }
             case OpCode::Id::BRA: {
@@ -1658,16 +1786,25 @@ private:
                 break;
             }
             case OpCode::Id::SSY: {
-                // The SSY opcode tells the GPU where to re-converge divergent execution paths, we
-                // can ignore this when generating GLSL code.
+                // The SSY opcode tells the GPU where to re-converge divergent execution paths; it
+                // sets the target of the jump that the SYNC instruction will make. The SSY opcode
+                // has a similar structure to the BRA opcode.
+                ASSERT_MSG(instr.bra.constant_buffer == 0, "Constant buffer SSY is not supported");
+
+                u32 target = offset + instr.bra.GetBranchTarget();
+                shader.AddLine("ssy_target = " + std::to_string(target) + "u;");
                 break;
             }
-            case OpCode::Id::SYNC:
+            case OpCode::Id::SYNC: {
+                // The SYNC opcode jumps to the address previously set by the SSY opcode
                 ASSERT(instr.flow.cond == Tegra::Shader::FlowCondition::Always);
+                shader.AddLine("{ jmp_to = ssy_target; break; }");
+                break;
+            }
             case OpCode::Id::DEPBAR: {
-                // TODO(Subv): Find out if we actually have to care about these instructions or if
+                // TODO(Subv): Find out if we actually have to care about this instruction or if
                 // the GLSL compiler takes care of that for us.
-                LOG_WARNING(HW_GPU, "DEPBAR/SYNC instruction is stubbed");
+                LOG_WARNING(HW_GPU, "DEPBAR instruction is stubbed");
                 break;
             }
             default: {
@@ -1681,7 +1818,7 @@ private:
         }
 
         // Close the predicate condition scope.
-        if (instr.pred.pred_index != static_cast<u64>(Pred::UnusedIndex)) {
+        if (can_be_predicated && instr.pred.pred_index != static_cast<u64>(Pred::UnusedIndex)) {
             --shader.scope;
             shader.AddLine('}');
         }
@@ -1732,6 +1869,7 @@ private:
             } else {
                 labels.insert(subroutine.begin);
                 shader.AddLine("uint jmp_to = " + std::to_string(subroutine.begin) + "u;");
+                shader.AddLine("uint ssy_target = 0u;");
                 shader.AddLine("while (true) {");
                 ++shader.scope;
 
@@ -1747,7 +1885,7 @@ private:
                     u32 compile_end = CompileRange(label, next_label);
                     if (compile_end > next_label && compile_end != PROGRAM_END) {
                         // This happens only when there is a label inside a IF/LOOP block
-                        shader.AddLine("{ jmp_to = " + std::to_string(compile_end) + "u; break; }");
+                        shader.AddLine(" jmp_to = " + std::to_string(compile_end) + "u; break; }");
                         labels.emplace(compile_end);
                     }
 
diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp
index 68bacd4c5..1d1975179 100644
--- a/src/video_core/renderer_opengl/gl_state.cpp
+++ b/src/video_core/renderer_opengl/gl_state.cpp
@@ -203,21 +203,6 @@ void OpenGLState::Apply() const {
         }
     }
 
-    // Constbuffers
-    for (std::size_t stage = 0; stage < draw.const_buffers.size(); ++stage) {
-        for (std::size_t buffer_id = 0; buffer_id < draw.const_buffers[stage].size(); ++buffer_id) {
-            const auto& current = cur_state.draw.const_buffers[stage][buffer_id];
-            const auto& new_state = draw.const_buffers[stage][buffer_id];
-
-            if (current.enabled != new_state.enabled || current.bindpoint != new_state.bindpoint ||
-                current.ssbo != new_state.ssbo) {
-                if (new_state.enabled) {
-                    glBindBufferBase(GL_UNIFORM_BUFFER, new_state.bindpoint, new_state.ssbo);
-                }
-            }
-        }
-    }
-
     // Framebuffer
     if (draw.read_framebuffer != cur_state.draw.read_framebuffer) {
         glBindFramebuffer(GL_READ_FRAMEBUFFER, draw.read_framebuffer);
diff --git a/src/video_core/renderer_opengl/gl_state.h b/src/video_core/renderer_opengl/gl_state.h
index 24b1d956b..bdb02ba25 100644
--- a/src/video_core/renderer_opengl/gl_state.h
+++ b/src/video_core/renderer_opengl/gl_state.h
@@ -7,6 +7,10 @@
 #include <array>
 #include <glad/glad.h>
 
+#include "video_core/engines/maxwell_3d.h"
+
+using Regs = Tegra::Engines::Maxwell3D::Regs;
+
 namespace TextureUnits {
 
 struct TextureUnit {
@@ -115,12 +119,6 @@ public:
         GLuint uniform_buffer;   // GL_UNIFORM_BUFFER_BINDING
         GLuint shader_program;   // GL_CURRENT_PROGRAM
         GLuint program_pipeline; // GL_PROGRAM_PIPELINE_BINDING
-        struct ConstBufferConfig {
-            bool enabled = false;
-            GLuint bindpoint;
-            GLuint ssbo;
-        };
-        std::array<std::array<ConstBufferConfig, 16>, 5> const_buffers{};
     } draw;
 
     struct {
diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.cpp b/src/video_core/renderer_opengl/gl_stream_buffer.cpp
index a2713e9f0..03a8ed8b7 100644
--- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp
@@ -9,174 +9,98 @@
 #include "video_core/renderer_opengl/gl_state.h"
 #include "video_core/renderer_opengl/gl_stream_buffer.h"
 
-class OrphanBuffer : public OGLStreamBuffer {
-public:
-    explicit OrphanBuffer(GLenum target) : OGLStreamBuffer(target) {}
-    ~OrphanBuffer() override;
-
-private:
-    void Create(size_t size, size_t sync_subdivide) override;
-    void Release() override;
-
-    std::pair<u8*, GLintptr> Map(size_t size, size_t alignment) override;
-    void Unmap() override;
-
-    std::vector<u8> data;
-};
-
-class StorageBuffer : public OGLStreamBuffer {
-public:
-    explicit StorageBuffer(GLenum target) : OGLStreamBuffer(target) {}
-    ~StorageBuffer() override;
-
-private:
-    void Create(size_t size, size_t sync_subdivide) override;
-    void Release() override;
-
-    std::pair<u8*, GLintptr> Map(size_t size, size_t alignment) override;
-    void Unmap() override;
-
-    struct Fence {
-        OGLSync sync;
-        size_t offset;
-    };
-    std::deque<Fence> head;
-    std::deque<Fence> tail;
-
-    u8* mapped_ptr;
-};
-
-OGLStreamBuffer::OGLStreamBuffer(GLenum target) {
-    gl_target = target;
-}
-
-GLuint OGLStreamBuffer::GetHandle() const {
-    return gl_buffer.handle;
-}
+// Creates the GL buffer, binds it to "target" and allocates its storage. With
+// GL_ARB_buffer_storage it is mapped persistently up front; otherwise Map() maps on each call.
+OGLStreamBuffer::OGLStreamBuffer(GLenum target, GLsizeiptr size, bool prefer_coherent)
+    : gl_target(target), buffer_size(size) {
+    gl_buffer.Create();
+    glBindBuffer(gl_target, gl_buffer.handle);
 
-std::unique_ptr<OGLStreamBuffer> OGLStreamBuffer::MakeBuffer(bool storage_buffer, GLenum target) {
-    if (storage_buffer) {
-        return std::make_unique<StorageBuffer>(target);
+    GLsizeiptr allocate_size = size;
+    if (target == GL_ARRAY_BUFFER) {
+        // On AMD GPU there is a strange crash in indexed drawing. The crash happens when the buffer
+        // read position is near the end and is an out-of-bound access to the vertex buffer. This is
+        // probably a bug in the driver and is related to the usage of vec3<byte> attributes in the
+        // vertex array. Doubling the allocation size for the vertex buffer seems to avoid the
+        // crash.
+        allocate_size *= 2;
     }
-    return std::make_unique<OrphanBuffer>(target);
-}
 
-OrphanBuffer::~OrphanBuffer() {
-    Release();
+    if (GLAD_GL_ARB_buffer_storage) {
+        persistent = true;
+        coherent = prefer_coherent;
+        GLbitfield flags =
+            GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | (coherent ? GL_MAP_COHERENT_BIT : 0);
+        glBufferStorage(gl_target, allocate_size, nullptr, flags);
+        mapped_ptr = static_cast<u8*>(glMapBufferRange(
+            gl_target, 0, buffer_size, flags | (coherent ? 0 : GL_MAP_FLUSH_EXPLICIT_BIT)));
+    } else {
+        glBufferData(gl_target, allocate_size, nullptr, GL_STREAM_DRAW);
+    }
 }
 
-void OrphanBuffer::Create(size_t size, size_t /*sync_subdivide*/) {
-    buffer_pos = 0;
-    buffer_size = size;
-    data.resize(buffer_size);
-
-    if (gl_buffer.handle == 0) {
-        gl_buffer.Create();
+// Unmaps a persistent mapping (after rebinding the buffer) and releases the GL buffer.
+OGLStreamBuffer::~OGLStreamBuffer() {
+    if (persistent) {
         glBindBuffer(gl_target, gl_buffer.handle);
+        glUnmapBuffer(gl_target);
     }
-
-    glBufferData(gl_target, static_cast<GLsizeiptr>(buffer_size), nullptr, GL_STREAM_DRAW);
-}
-
-void OrphanBuffer::Release() {
     gl_buffer.Release();
 }
 
-std::pair<u8*, GLintptr> OrphanBuffer::Map(size_t size, size_t alignment) {
-    buffer_pos = Common::AlignUp(buffer_pos, alignment);
-
-    if (buffer_pos + size > buffer_size) {
-        Create(std::max(buffer_size, size), 0);
-    }
-
-    mapped_size = size;
-    return std::make_pair(&data[buffer_pos], static_cast<GLintptr>(buffer_pos));
-}
-
-void OrphanBuffer::Unmap() {
-    glBufferSubData(gl_target, static_cast<GLintptr>(buffer_pos),
-                    static_cast<GLsizeiptr>(mapped_size), &data[buffer_pos]);
-    buffer_pos += mapped_size;
-}
-
-StorageBuffer::~StorageBuffer() {
-    Release();
+GLuint OGLStreamBuffer::GetHandle() const {
+    return gl_buffer.handle;
 }
 
-void StorageBuffer::Create(size_t size, size_t sync_subdivide) {
-    if (gl_buffer.handle != 0)
-        return;
-
-    buffer_pos = 0;
-    buffer_size = size;
-    buffer_sync_subdivide = std::max<size_t>(sync_subdivide, 1);
-
-    gl_buffer.Create();
-    glBindBuffer(gl_target, gl_buffer.handle);
-
-    glBufferStorage(gl_target, static_cast<GLsizeiptr>(buffer_size), nullptr,
-                    GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT);
-    mapped_ptr = reinterpret_cast<u8*>(
-        glMapBufferRange(gl_target, 0, static_cast<GLsizeiptr>(buffer_size),
-                         GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_FLUSH_EXPLICIT_BIT));
+GLsizeiptr OGLStreamBuffer::GetSize() const {
+    return buffer_size;
 }
 
-void StorageBuffer::Release() {
-    if (gl_buffer.handle == 0)
-        return;
-
-    glUnmapBuffer(gl_target);
-
-    gl_buffer.Release();
-    head.clear();
-    tail.clear();
-}
-
-std::pair<u8*, GLintptr> StorageBuffer::Map(size_t size, size_t alignment) {
+std::tuple<u8*, GLintptr, bool> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr alignment) {
     ASSERT(size <= buffer_size);
+    ASSERT(alignment <= buffer_size);
+    mapped_size = size;
 
-    OGLSync sync;
-
-    buffer_pos = Common::AlignUp(buffer_pos, alignment);
-    size_t effective_offset = Common::AlignDown(buffer_pos, buffer_sync_subdivide);
-
-    if (!head.empty() &&
-        (effective_offset > head.back().offset || buffer_pos + size > buffer_size)) {
-        ASSERT(head.back().sync.handle == 0);
-        head.back().sync.Create();
+    if (alignment > 0) {
+        buffer_pos = Common::AlignUp<size_t>(buffer_pos, alignment);
     }
 
+    bool invalidate = false;
+    // Not enough room left: restart at offset 0 and flag earlier chunks as invalid.
     if (buffer_pos + size > buffer_size) {
-        if (!tail.empty()) {
-            std::swap(sync, tail.back().sync);
-            tail.clear();
-        }
-        std::swap(tail, head);
         buffer_pos = 0;
-        effective_offset = 0;
-    }
+        invalidate = true;
 
-    while (!tail.empty() && buffer_pos + size > tail.front().offset) {
-        std::swap(sync, tail.front().sync);
-        tail.pop_front();
+        if (persistent) {
+            glUnmapBuffer(gl_target);
+        }
     }
 
-    if (sync.handle != 0) {
-        glClientWaitSync(sync.handle, GL_SYNC_FLUSH_COMMANDS_BIT, GL_TIMEOUT_IGNORED);
-        sync.Release();
+    // A (re)map is required after a wrap, and on every call when not persistently mapped.
+    if (invalidate || !persistent) {
+        GLbitfield flags = GL_MAP_WRITE_BIT | (persistent ? GL_MAP_PERSISTENT_BIT : 0) |
+                           (coherent ? GL_MAP_COHERENT_BIT : GL_MAP_FLUSH_EXPLICIT_BIT) |
+                           (invalidate ? GL_MAP_INVALIDATE_BUFFER_BIT : GL_MAP_UNSYNCHRONIZED_BIT);
+        mapped_ptr = static_cast<u8*>(
+            glMapBufferRange(gl_target, buffer_pos, buffer_size - buffer_pos, flags));
+        mapped_offset = buffer_pos;
     }
 
-    if (head.empty() || effective_offset > head.back().offset) {
-        head.emplace_back();
-        head.back().offset = effective_offset;
+    return std::make_tuple(mapped_ptr + buffer_pos - mapped_offset, buffer_pos, invalidate);
+}
+
+// Flushes "size" bytes of the current chunk when the mapping is not coherent, unmaps a
+// non-persistent mapping and advances the write position by "size".
+void OGLStreamBuffer::Unmap(GLsizeiptr size) {
+    ASSERT(size <= mapped_size);
+
+    if (!coherent && size > 0) {
+        glFlushMappedBufferRange(gl_target, buffer_pos - mapped_offset, size);
     }
 
-    mapped_size = size;
-    return std::make_pair(&mapped_ptr[buffer_pos], static_cast<GLintptr>(buffer_pos));
-}
+    if (!persistent) {
+        glUnmapBuffer(gl_target);
+    }
 
-void StorageBuffer::Unmap() {
-    glFlushMappedBufferRange(gl_target, static_cast<GLintptr>(buffer_pos),
-                             static_cast<GLsizeiptr>(mapped_size));
-    buffer_pos += mapped_size;
+    buffer_pos += size;
 }
diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.h b/src/video_core/renderer_opengl/gl_stream_buffer.h
index e78dc5784..45592daaf 100644
--- a/src/video_core/renderer_opengl/gl_stream_buffer.h
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.h
@@ -2,35 +2,47 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
 #pragma once
 
-#include <memory>
+#include <tuple>
 #include <glad/glad.h>
 #include "common/common_types.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 
+/// Streams data into a fixed-size OpenGL buffer, wrapping to the start once it is full.
 class OGLStreamBuffer : private NonCopyable {
 public:
-    explicit OGLStreamBuffer(GLenum target);
-    virtual ~OGLStreamBuffer() = default;
-
-public:
-    static std::unique_ptr<OGLStreamBuffer> MakeBuffer(bool storage_buffer, GLenum target);
-
-    virtual void Create(size_t size, size_t sync_subdivide) = 0;
-    virtual void Release() {}
+    /// Creates the buffer for "target" with "size" usable bytes. "prefer_coherent" requests a
+    /// coherent mapping; it only takes effect when GL_ARB_buffer_storage is available.
+    explicit OGLStreamBuffer(GLenum target, GLsizeiptr size, bool prefer_coherent = false);
+    ~OGLStreamBuffer();
 
     GLuint GetHandle() const;
+    GLsizeiptr GetSize() const;
+
+    /*
+     * Allocates a linear chunk of memory in the GPU buffer with at least "size" bytes
+     * and the optional alignment requirement.
+     * If the buffer is full, the whole buffer is reallocated which invalidates old chunks.
+     * The return values are the pointer to the new chunk, the offset within the buffer,
+     * and the invalidation flag for previous chunks.
+     * The actual used size must be specified on unmapping the chunk.
+     */
+    std::tuple<u8*, GLintptr, bool> Map(GLsizeiptr size, GLintptr alignment = 0);
 
-    virtual std::pair<u8*, GLintptr> Map(size_t size, size_t alignment) = 0;
-    virtual void Unmap() = 0;
+    /// Ends the chunk from the last Map() call; "size" is the number of bytes actually used.
+    void Unmap(GLsizeiptr size);
 
-protected:
+private:
     OGLBuffer gl_buffer;
     GLenum gl_target;
 
-    size_t buffer_pos = 0;
-    size_t buffer_size = 0;
-    size_t buffer_sync_subdivide = 0;
-    size_t mapped_size = 0;
+    bool coherent = false;
+    bool persistent = false;
+
+    GLintptr buffer_pos = 0;
+    GLsizeiptr buffer_size = 0;
+    GLintptr mapped_offset = 0;
+    GLsizeiptr mapped_size = 0;
+    u8* mapped_ptr = nullptr;
 };
diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h
index 16b1bd606..83ea0cfc0 100644
--- a/src/video_core/renderer_opengl/maxwell_to_gl.h
+++ b/src/video_core/renderer_opengl/maxwell_to_gl.h
@@ -27,9 +27,12 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
     case Maxwell::VertexAttribute::Type::UnsignedNorm: {
 
         switch (attrib.size) {
+        case Maxwell::VertexAttribute::Size::Size_8:
+        case Maxwell::VertexAttribute::Size::Size_8_8:
         case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
             return GL_UNSIGNED_BYTE;
         case Maxwell::VertexAttribute::Size::Size_16_16:
+        case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
             return GL_UNSIGNED_SHORT;
         case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
             return GL_UNSIGNED_INT_2_10_10_10_REV;
@@ -43,6 +46,9 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
     case Maxwell::VertexAttribute::Type::SignedNorm: {
 
         switch (attrib.size) {
+        case Maxwell::VertexAttribute::Size::Size_32_32_32:
+            return GL_INT;
+        case Maxwell::VertexAttribute::Size::Size_8_8:
         case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
             return GL_BYTE;
         case Maxwell::VertexAttribute::Size::Size_16_16:
@@ -84,6 +90,10 @@ inline GLenum IndexFormat(Maxwell::IndexFormat index_format) {
 
 inline GLenum PrimitiveTopology(Maxwell::PrimitiveTopology topology) {
     switch (topology) {
+    case Maxwell::PrimitiveTopology::Points:
+        return GL_POINTS;
+    case Maxwell::PrimitiveTopology::LineStrip:
+        return GL_LINE_STRIP;
     case Maxwell::PrimitiveTopology::Triangles:
         return GL_TRIANGLES;
     case Maxwell::PrimitiveTopology::TriangleStrip:
@@ -149,42 +159,61 @@ inline GLenum BlendEquation(Maxwell::Blend::Equation equation) {
 inline GLenum BlendFunc(Maxwell::Blend::Factor factor) {
     switch (factor) {
     case Maxwell::Blend::Factor::Zero:
+    case Maxwell::Blend::Factor::ZeroGL:
         return GL_ZERO;
     case Maxwell::Blend::Factor::One:
+    case Maxwell::Blend::Factor::OneGL:
         return GL_ONE;
     case Maxwell::Blend::Factor::SourceColor:
+    case Maxwell::Blend::Factor::SourceColorGL:
         return GL_SRC_COLOR;
     case Maxwell::Blend::Factor::OneMinusSourceColor:
+    case Maxwell::Blend::Factor::OneMinusSourceColorGL:
         return GL_ONE_MINUS_SRC_COLOR;
     case Maxwell::Blend::Factor::SourceAlpha:
+    case Maxwell::Blend::Factor::SourceAlphaGL:
         return GL_SRC_ALPHA;
     case Maxwell::Blend::Factor::OneMinusSourceAlpha:
+    case Maxwell::Blend::Factor::OneMinusSourceAlphaGL:
         return GL_ONE_MINUS_SRC_ALPHA;
     case Maxwell::Blend::Factor::DestAlpha:
+    case Maxwell::Blend::Factor::DestAlphaGL:
         return GL_DST_ALPHA;
     case Maxwell::Blend::Factor::OneMinusDestAlpha:
+    case Maxwell::Blend::Factor::OneMinusDestAlphaGL:
         return GL_ONE_MINUS_DST_ALPHA;
     case Maxwell::Blend::Factor::DestColor:
+    case Maxwell::Blend::Factor::DestColorGL:
         return GL_DST_COLOR;
     case Maxwell::Blend::Factor::OneMinusDestColor:
+    case Maxwell::Blend::Factor::OneMinusDestColorGL:
         return GL_ONE_MINUS_DST_COLOR;
     case Maxwell::Blend::Factor::SourceAlphaSaturate:
+    case Maxwell::Blend::Factor::SourceAlphaSaturateGL:
         return GL_SRC_ALPHA_SATURATE;
     case Maxwell::Blend::Factor::Source1Color:
+    case Maxwell::Blend::Factor::Source1ColorGL:
         return GL_SRC1_COLOR;
     case Maxwell::Blend::Factor::OneMinusSource1Color:
+    case Maxwell::Blend::Factor::OneMinusSource1ColorGL:
         return GL_ONE_MINUS_SRC1_COLOR;
     case Maxwell::Blend::Factor::Source1Alpha:
+    case Maxwell::Blend::Factor::Source1AlphaGL:
         return GL_SRC1_ALPHA;
     case Maxwell::Blend::Factor::OneMinusSource1Alpha:
+    case Maxwell::Blend::Factor::OneMinusSource1AlphaGL:
         return GL_ONE_MINUS_SRC1_ALPHA;
     case Maxwell::Blend::Factor::ConstantColor:
+    case Maxwell::Blend::Factor::ConstantColorGL:
         return GL_CONSTANT_COLOR;
     case Maxwell::Blend::Factor::OneMinusConstantColor:
+    case Maxwell::Blend::Factor::OneMinusConstantColorGL:
         return GL_ONE_MINUS_CONSTANT_COLOR;
     case Maxwell::Blend::Factor::ConstantAlpha:
+    case Maxwell::Blend::Factor::ConstantAlphaGL:
         return GL_CONSTANT_ALPHA;
     case Maxwell::Blend::Factor::OneMinusConstantAlpha:
+    case Maxwell::Blend::Factor::OneMinusConstantAlphaGL:
         return GL_ONE_MINUS_CONSTANT_ALPHA;
     }
     LOG_CRITICAL(Render_OpenGL, "Unimplemented blend factor={}", static_cast<u32>(factor));
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index bf9131193..95f1aa0fe 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -18,7 +18,6 @@
 #include "core/tracer/recorder.h"
 #include "video_core/renderer_opengl/renderer_opengl.h"
 #include "video_core/utils.h"
-#include "video_core/video_core.h"
 
 static const char vertex_shader[] = R"(
 #version 150 core
@@ -92,7 +91,8 @@ static std::array<GLfloat, 3 * 2> MakeOrthographicMatrix(const float width, cons
     return matrix;
 }
 
-ScopeAcquireGLContext::ScopeAcquireGLContext(EmuWindow& emu_window_) : emu_window{emu_window_} {
+ScopeAcquireGLContext::ScopeAcquireGLContext(Core::Frontend::EmuWindow& emu_window_)
+    : emu_window{emu_window_} {
     if (Settings::values.use_multi_core) {
         emu_window.MakeCurrent();
     }
@@ -103,7 +103,9 @@ ScopeAcquireGLContext::~ScopeAcquireGLContext() {
     }
 }
 
-RendererOpenGL::RendererOpenGL(EmuWindow& window) : VideoCore::RendererBase{window} {}
+RendererOpenGL::RendererOpenGL(Core::Frontend::EmuWindow& window)
+    : VideoCore::RendererBase{window} {}
+
 RendererOpenGL::~RendererOpenGL() = default;
 
 /// Swap buffers (render frame)
@@ -430,7 +432,7 @@ static void APIENTRY DebugHandler(GLenum source, GLenum type, GLuint id, GLenum
         break;
     case GL_DEBUG_SEVERITY_NOTIFICATION:
     case GL_DEBUG_SEVERITY_LOW:
-        LOG_DEBUG(Render_OpenGL, format, str_source, str_type, id, message);
+        LOG_TRACE(Render_OpenGL, format, str_source, str_type, id, message);
         break;
     }
 }
diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h
index 428afa3b7..a5eab6997 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.h
+++ b/src/video_core/renderer_opengl/renderer_opengl.h
@@ -12,7 +12,9 @@
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 #include "video_core/renderer_opengl/gl_state.h"
 
+namespace Core::Frontend {
 class EmuWindow;
+}
 
 /// Structure used for storing information about the textures for the Switch screen
 struct TextureInfo {
@@ -34,16 +36,16 @@ struct ScreenInfo {
 /// Helper class to acquire/release OpenGL context within a given scope
 class ScopeAcquireGLContext : NonCopyable {
 public:
-    explicit ScopeAcquireGLContext(EmuWindow& window);
+    explicit ScopeAcquireGLContext(Core::Frontend::EmuWindow& window);
     ~ScopeAcquireGLContext();
 
 private:
-    EmuWindow& emu_window;
+    Core::Frontend::EmuWindow& emu_window;
 };
 
 class RendererOpenGL : public VideoCore::RendererBase {
 public:
-    explicit RendererOpenGL(EmuWindow& window);
+    explicit RendererOpenGL(Core::Frontend::EmuWindow& window);
     ~RendererOpenGL() override;
 
     /// Swap buffers (render frame)
diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp
index 65db84ad3..70746a34e 100644
--- a/src/video_core/textures/decoders.cpp
+++ b/src/video_core/textures/decoders.cpp
@@ -54,6 +54,7 @@ u32 BytesPerPixel(TextureFormat format) {
         return 8;
     case TextureFormat::DXT23:
     case TextureFormat::DXT45:
+    case TextureFormat::DXN2:
     case TextureFormat::BC7U:
         // In this case a 'pixel' actually refers to a 4x4 tile.
         return 16;
@@ -85,87 +86,11 @@ u32 BytesPerPixel(TextureFormat format) {
     }
 }
 
-static u32 DepthBytesPerPixel(DepthFormat format) {
-    switch (format) {
-    case DepthFormat::Z16_UNORM:
-        return 2;
-    case DepthFormat::S8_Z24_UNORM:
-    case DepthFormat::Z24_S8_UNORM:
-    case DepthFormat::Z32_FLOAT:
-        return 4;
-    case DepthFormat::Z32_S8_X24_FLOAT:
-        return 8;
-    default:
-        UNIMPLEMENTED_MSG("Format not implemented");
-        break;
-    }
-}
-
-std::vector<u8> UnswizzleTexture(VAddr address, TextureFormat format, u32 width, u32 height,
-                                 u32 block_height) {
-    u8* data = Memory::GetPointer(address);
-    u32 bytes_per_pixel = BytesPerPixel(format);
-
+std::vector<u8> UnswizzleTexture(VAddr address, u32 tile_size, u32 bytes_per_pixel, u32 width,
+                                 u32 height, u32 block_height) {
     std::vector<u8> unswizzled_data(width * height * bytes_per_pixel);
-
-    switch (format) {
-    case TextureFormat::DXT1:
-    case TextureFormat::DXT23:
-    case TextureFormat::DXT45:
-    case TextureFormat::DXN1:
-    case TextureFormat::BC7U:
-        // In the DXT and DXN formats, each 4x4 tile is swizzled instead of just individual pixel
-        // values.
-        CopySwizzledData(width / 4, height / 4, bytes_per_pixel, bytes_per_pixel, data,
-                         unswizzled_data.data(), true, block_height);
-        break;
-    case TextureFormat::A8R8G8B8:
-    case TextureFormat::A2B10G10R10:
-    case TextureFormat::A1B5G5R5:
-    case TextureFormat::B5G6R5:
-    case TextureFormat::R8:
-    case TextureFormat::G8R8:
-    case TextureFormat::R16_G16_B16_A16:
-    case TextureFormat::R32_G32_B32_A32:
-    case TextureFormat::R32_G32:
-    case TextureFormat::R32:
-    case TextureFormat::R16:
-    case TextureFormat::R16_G16:
-    case TextureFormat::BF10GF11RF11:
-    case TextureFormat::ASTC_2D_4X4:
-    case TextureFormat::R32_G32_B32:
-        CopySwizzledData(width, height, bytes_per_pixel, bytes_per_pixel, data,
-                         unswizzled_data.data(), true, block_height);
-        break;
-    default:
-        UNIMPLEMENTED_MSG("Format not implemented");
-        break;
-    }
-
-    return unswizzled_data;
-}
-
-std::vector<u8> UnswizzleDepthTexture(VAddr address, DepthFormat format, u32 width, u32 height,
-                                      u32 block_height) {
-    u8* data = Memory::GetPointer(address);
-    u32 bytes_per_pixel = DepthBytesPerPixel(format);
-
-    std::vector<u8> unswizzled_data(width * height * bytes_per_pixel);
-
-    switch (format) {
-    case DepthFormat::Z16_UNORM:
-    case DepthFormat::S8_Z24_UNORM:
-    case DepthFormat::Z24_S8_UNORM:
-    case DepthFormat::Z32_FLOAT:
-    case DepthFormat::Z32_S8_X24_FLOAT:
-        CopySwizzledData(width, height, bytes_per_pixel, bytes_per_pixel, data,
-                         unswizzled_data.data(), true, block_height);
-        break;
-    default:
-        UNIMPLEMENTED_MSG("Format not implemented");
-        break;
-    }
-
+    CopySwizzledData(width / tile_size, height / tile_size, bytes_per_pixel, bytes_per_pixel,
+                     Memory::GetPointer(address), unswizzled_data.data(), true, block_height);
     return unswizzled_data;
 }
 
@@ -179,6 +104,7 @@ std::vector<u8> DecodeTexture(const std::vector<u8>& texture_data, TextureFormat
     case TextureFormat::DXT23:
     case TextureFormat::DXT45:
     case TextureFormat::DXN1:
+    case TextureFormat::DXN2:
     case TextureFormat::BC7U:
     case TextureFormat::ASTC_2D_4X4:
     case TextureFormat::A8R8G8B8:
diff --git a/src/video_core/textures/decoders.h b/src/video_core/textures/decoders.h
index 73a4924d1..1f7b731be 100644
--- a/src/video_core/textures/decoders.h
+++ b/src/video_core/textures/decoders.h
@@ -13,8 +13,8 @@ namespace Tegra::Texture {
 /**
  * Unswizzles a swizzled texture without changing its format.
  */
-std::vector<u8> UnswizzleTexture(VAddr address, TextureFormat format, u32 width, u32 height,
-                                 u32 block_height = TICEntry::DefaultBlockHeight);
+std::vector<u8> UnswizzleTexture(VAddr address, u32 tile_size, u32 bytes_per_pixel, u32 width,
+                                 u32 height, u32 block_height = TICEntry::DefaultBlockHeight);
 
 /**
  * Unswizzles a swizzled depth texture without changing its format.
diff --git a/src/video_core/video_core.cpp b/src/video_core/video_core.cpp
index 5085ef96b..6780d1c16 100644
--- a/src/video_core/video_core.cpp
+++ b/src/video_core/video_core.cpp
@@ -9,9 +9,7 @@
 
 namespace VideoCore {
 
-std::atomic<bool> g_toggle_framelimit_enabled;
-
-std::unique_ptr<RendererBase> CreateRenderer(EmuWindow& emu_window) {
+std::unique_ptr<RendererBase> CreateRenderer(Core::Frontend::EmuWindow& emu_window) {
     return std::make_unique<RendererOpenGL>(emu_window);
 }
 
diff --git a/src/video_core/video_core.h b/src/video_core/video_core.h
index 7c01c0b8d..f79f85dfe 100644
--- a/src/video_core/video_core.h
+++ b/src/video_core/video_core.h
@@ -4,27 +4,22 @@
 
 #pragma once
 
-#include <atomic>
 #include <memory>
 
+namespace Core::Frontend {
 class EmuWindow;
+}
 
 namespace VideoCore {
 
 class RendererBase;
 
-enum class Renderer { Software, OpenGL };
-
-// TODO: Wrap these in a user settings struct along with any other graphics settings (often set from
-// qt ui)
-extern std::atomic<bool> g_toggle_framelimit_enabled;
-
 /**
  * Creates a renderer instance.
  *
  * @note The returned renderer instance is simply allocated. Its Init()
  *       function still needs to be called to fully complete its setup.
  */
-std::unique_ptr<RendererBase> CreateRenderer(EmuWindow& emu_window);
+std::unique_ptr<RendererBase> CreateRenderer(Core::Frontend::EmuWindow& emu_window);
 
 } // namespace VideoCore