61 files changed, 2825 insertions, 802 deletions
diff --git a/.gitmodules b/.gitmodules
index 2ec9dda62..9ba8fe207 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -13,6 +13,9 @@
 [submodule "soundtouch"]
     path = externals/soundtouch
     url = https://github.com/citra-emu/ext-soundtouch.git
+[submodule "libressl"]
+    path = externals/libressl
+    url = https://github.com/citra-emu/ext-libressl-portable.git
 [submodule "discord-rpc"]
     path = externals/discord-rpc
     url = https://github.com/discordapp/discord-rpc.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a9f669d56..b71071271 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -152,7 +152,6 @@ macro(yuzu_find_packages)
         "Boost             1.71        boost/1.72.0"
         "Catch2            2.11        catch2/2.11.0"
         "fmt               6.2         fmt/6.2.0"
-        "OpenSSL           1.1         openssl/1.1.1f"
     # can't use until https://github.com/bincrafters/community/issues/1173
         #"libzip            1.5         libzip/1.5.2@bincrafters/stable"
         "lz4               1.8         lz4/1.9.2"
@@ -312,15 +311,6 @@ elseif (TARGET Boost::boost)
     add_library(boost ALIAS Boost::boost)
 endif()
 
-if (NOT TARGET OpenSSL::SSL)
-    set_target_properties(OpenSSL::OpenSSL PROPERTIES IMPORTED_GLOBAL TRUE)
-    add_library(OpenSSL::SSL ALIAS OpenSSL::OpenSSL)
-endif()
-if (NOT TARGET OpenSSL::Crypto)
-    set_target_properties(OpenSSL::OpenSSL PROPERTIES IMPORTED_GLOBAL TRUE)
-    add_library(OpenSSL::Crypto ALIAS OpenSSL::OpenSSL)
-endif()
-
 if (TARGET sdl2::sdl2)
     # imported from the conan generated sdl2Config.cmake
     set_target_properties(sdl2::sdl2 PROPERTIES IMPORTED_GLOBAL TRUE)
diff --git a/CMakeModules/GenerateSCMRev.cmake b/CMakeModules/GenerateSCMRev.cmake
index 83e4e9df2..311ba1c2e 100644
--- a/CMakeModules/GenerateSCMRev.cmake
+++ b/CMakeModules/GenerateSCMRev.cmake
@@ -51,6 +51,8 @@ endif()
 # The variable SRC_DIR must be passed into the script (since it uses the current build directory for all values of CMAKE_*_DIR)
 set(VIDEO_CORE "${SRC_DIR}/src/video_core")
 set(HASH_FILES
+    "${VIDEO_CORE}/renderer_opengl/gl_arb_decompiler.cpp"
+    "${VIDEO_CORE}/renderer_opengl/gl_arb_decompiler.h"
     "${VIDEO_CORE}/renderer_opengl/gl_shader_cache.cpp"
     "${VIDEO_CORE}/renderer_opengl/gl_shader_cache.h"
     "${VIDEO_CORE}/renderer_opengl/gl_shader_decompiler.cpp"
diff --git a/dist/qt_themes/qdarkstyle/style.qss b/dist/qt_themes/qdarkstyle/style.qss
index 7d088a719..2d5c9761f 100644
--- a/dist/qt_themes/qdarkstyle/style.qss
+++ b/dist/qt_themes/qdarkstyle/style.qss
@@ -673,10 +673,6 @@ QTabWidget::pane {
     border-bottom-left-radius: 2px;
 }
 
-QTabWidget::tab-bar {
-    overflow: visible;
-}
-
 QTabBar {
     qproperty-drawBase: 0;
     border-radius: 3px;
diff --git a/externals/CMakeLists.txt b/externals/CMakeLists.txt
index df7a5e0a9..b80b27605 100644
--- a/externals/CMakeLists.txt
+++ b/externals/CMakeLists.txt
@@ -4,6 +4,13 @@ list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/CMakeModules")
 list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/externals/find-modules")
 include(DownloadExternals)
 
+# xbyak
+if (ARCHITECTURE_x86 OR ARCHITECTURE_x86_64)
+    add_library(xbyak INTERFACE)
+    target_include_directories(xbyak SYSTEM INTERFACE ./xbyak/xbyak)
+    target_compile_definitions(xbyak INTERFACE XBYAK_NO_OP_NAMES)
+endif()
+
 # Catch
 add_library(catch-single-include INTERFACE)
 target_include_directories(catch-single-include INTERFACE catch/single_include)
@@ -66,6 +73,15 @@ if (NOT LIBZIP_FOUND)
 endif()
 
 if (ENABLE_WEB_SERVICE)
+    # LibreSSL
+    set(LIBRESSL_SKIP_INSTALL ON CACHE BOOL "")
+    add_subdirectory(libressl EXCLUDE_FROM_ALL)
+    target_include_directories(ssl INTERFACE ./libressl/include)
+    target_compile_definitions(ssl PRIVATE -DHAVE_INET_NTOP)
+    get_directory_property(OPENSSL_LIBRARIES
+        DIRECTORY libressl
+        DEFINITION OPENSSL_LIBS)
+
     # lurlparser
     add_subdirectory(lurlparser EXCLUDE_FROM_ALL)
 
@@ -73,13 +89,5 @@ if (ENABLE_WEB_SERVICE)
     add_library(httplib INTERFACE)
     target_include_directories(httplib INTERFACE ./httplib)
     target_compile_definitions(httplib INTERFACE -DCPPHTTPLIB_OPENSSL_SUPPORT)
-    target_link_libraries(httplib INTERFACE OpenSSL::SSL OpenSSL::Crypto)
-endif()
-
-if (NOT TARGET xbyak)
-    if (ARCHITECTURE_x86 OR ARCHITECTURE_x86_64)
-        add_library(xbyak INTERFACE)
-        target_include_directories(xbyak SYSTEM INTERFACE ./xbyak/xbyak)
-        target_compile_definitions(xbyak INTERFACE XBYAK_NO_OP_NAMES)
-    endif()
+    target_link_libraries(httplib INTERFACE ${OPENSSL_LIBRARIES})
 endif()
diff --git a/externals/libressl b/externals/libressl
new file mode 160000
+Subproject 7d01cb01cb1a926ecb4c9c98b107ef3c26f59df
diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt
index 24b7a083c..0a3e2f4d1 100644
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -32,6 +32,8 @@ add_custom_command(OUTPUT scm_rev.cpp
     DEPENDS
       # WARNING! It was too much work to try and make a common location for this list,
       # so if you need to change it, please update CMakeModules/GenerateSCMRev.cmake as well
+      "${VIDEO_CORE}/renderer_opengl/gl_arb_decompiler.cpp"
+      "${VIDEO_CORE}/renderer_opengl/gl_arb_decompiler.h"
       "${VIDEO_CORE}/renderer_opengl/gl_shader_cache.cpp"
       "${VIDEO_CORE}/renderer_opengl/gl_shader_cache.h"
       "${VIDEO_CORE}/renderer_opengl/gl_shader_decompiler.cpp"
diff --git a/src/common/x64/xbyak_abi.h b/src/common/x64/xbyak_abi.h
index 794da8a52..a5f5d4fc1 100644
--- a/src/common/x64/xbyak_abi.h
+++ b/src/common/x64/xbyak_abi.h
@@ -11,7 +11,7 @@
 
 namespace Common::X64 {
 
-inline int RegToIndex(const Xbyak::Reg& reg) {
+inline std::size_t RegToIndex(const Xbyak::Reg& reg) {
     using Kind = Xbyak::Reg::Kind;
     ASSERT_MSG((reg.getKind() & (Kind::REG | Kind::XMM)) != 0,
                "RegSet only support GPRs and XMM registers.");
@@ -19,17 +19,17 @@ inline int RegToIndex(const Xbyak::Reg& reg) {
     return reg.getIdx() + (reg.getKind() == Kind::REG ? 0 : 16);
 }
 
-inline Xbyak::Reg64 IndexToReg64(int reg_index) {
+inline Xbyak::Reg64 IndexToReg64(std::size_t reg_index) {
     ASSERT(reg_index < 16);
-    return Xbyak::Reg64(reg_index);
+    return Xbyak::Reg64(static_cast<int>(reg_index));
 }
 
-inline Xbyak::Xmm IndexToXmm(int reg_index) {
+inline Xbyak::Xmm IndexToXmm(std::size_t reg_index) {
     ASSERT(reg_index >= 16 && reg_index < 32);
-    return Xbyak::Xmm(reg_index - 16);
+    return Xbyak::Xmm(static_cast<int>(reg_index - 16));
 }
 
-inline Xbyak::Reg IndexToReg(int reg_index) {
+inline Xbyak::Reg IndexToReg(std::size_t reg_index) {
     if (reg_index < 16) {
         return IndexToReg64(reg_index);
     } else {
@@ -151,9 +151,13 @@ constexpr size_t ABI_SHADOW_SPACE = 0;
 
 #endif
 
-inline void ABI_CalculateFrameSize(std::bitset<32> regs, size_t rsp_alignment,
-                                   size_t needed_frame_size, s32* out_subtraction,
-                                   s32* out_xmm_offset) {
+struct ABIFrameInfo {
+    s32 subtraction;
+    s32 xmm_offset;
+};
+
+inline ABIFrameInfo ABI_CalculateFrameSize(std::bitset<32> regs, size_t rsp_alignment,
+                                           size_t needed_frame_size) {
     const auto count = (regs & ABI_ALL_GPRS).count();
     rsp_alignment -= count * 8;
     size_t subtraction = 0;
@@ -170,33 +174,28 @@ inline void ABI_CalculateFrameSize(std::bitset<32> regs, size_t rsp_alignment,
     rsp_alignment -= subtraction;
     subtraction += rsp_alignment & 0xF;
 
-    *out_subtraction = (s32)subtraction;
-    *out_xmm_offset = (s32)(subtraction - xmm_base_subtraction);
+    return ABIFrameInfo{static_cast<s32>(subtraction),
+                        static_cast<s32>(subtraction - xmm_base_subtraction)};
 }
 
 inline size_t ABI_PushRegistersAndAdjustStack(Xbyak::CodeGenerator& code, std::bitset<32> regs,
                                               size_t rsp_alignment, size_t needed_frame_size = 0) {
-    s32 subtraction, xmm_offset;
-    ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size, &subtraction, &xmm_offset);
+    auto frame_info = ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size);
+
     for (std::size_t i = 0; i < regs.size(); ++i) {
         if (regs[i] && ABI_ALL_GPRS[i]) {
-            code.push(IndexToReg64(static_cast<int>(i)));
+            code.push(IndexToReg64(i));
         }
     }
-    if (subtraction != 0) {
-        code.sub(code.rsp, subtraction);
-    }
 
-    for (int i = 0; i < regs.count(); i++) {
-        if (regs.test(i) & ABI_ALL_GPRS.test(i)) {
-            code.push(IndexToReg64(i));
-        }
+    if (frame_info.subtraction != 0) {
+        code.sub(code.rsp, frame_info.subtraction);
     }
 
     for (std::size_t i = 0; i < regs.size(); ++i) {
         if (regs[i] && ABI_ALL_XMMS[i]) {
-            code.movaps(code.xword[code.rsp + xmm_offset], IndexToXmm(static_cast<int>(i)));
-            xmm_offset += 0x10;
+            code.movaps(code.xword[code.rsp + frame_info.xmm_offset], IndexToXmm(i));
+            frame_info.xmm_offset += 0x10;
         }
     }
 
@@ -205,59 +204,23 @@ inline size_t ABI_PushRegistersAndAdjustStack(Xbyak::CodeGenerator& code, std::b
 
 inline void ABI_PopRegistersAndAdjustStack(Xbyak::CodeGenerator& code, std::bitset<32> regs,
                                            size_t rsp_alignment, size_t needed_frame_size = 0) {
-    s32 subtraction, xmm_offset;
-    ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size, &subtraction, &xmm_offset);
+    auto frame_info = ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size);
 
     for (std::size_t i = 0; i < regs.size(); ++i) {
         if (regs[i] && ABI_ALL_XMMS[i]) {
-            code.movaps(IndexToXmm(static_cast<int>(i)), code.xword[code.rsp + xmm_offset]);
-            xmm_offset += 0x10;
+            code.movaps(IndexToXmm(i), code.xword[code.rsp + frame_info.xmm_offset]);
+            frame_info.xmm_offset += 0x10;
         }
     }
 
-    if (subtraction != 0) {
-        code.add(code.rsp, subtraction);
+    if (frame_info.subtraction != 0) {
+        code.add(code.rsp, frame_info.subtraction);
     }
 
     // GPRs need to be popped in reverse order
-    for (int i = 15; i >= 0; i--) {
-        if (regs[i]) {
-            code.pop(IndexToReg64(i));
-        }
-    }
-}
-
-inline size_t ABI_PushRegistersAndAdjustStackGPS(Xbyak::CodeGenerator& code, std::bitset<32> regs,
-                                                 size_t rsp_alignment,
-                                                 size_t needed_frame_size = 0) {
-    s32 subtraction, xmm_offset;
-    ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size, &subtraction, &xmm_offset);
-
-    for (std::size_t i = 0; i < regs.size(); ++i) {
+    for (std::size_t j = 0; j < regs.size(); ++j) {
+        const std::size_t i = regs.size() - j - 1;
         if (regs[i] && ABI_ALL_GPRS[i]) {
-            code.push(IndexToReg64(static_cast<int>(i)));
-        }
-    }
-
-    if (subtraction != 0) {
-        code.sub(code.rsp, subtraction);
-    }
-
-    return ABI_SHADOW_SPACE;
-}
-
-inline void ABI_PopRegistersAndAdjustStackGPS(Xbyak::CodeGenerator& code, std::bitset<32> regs,
-                                              size_t rsp_alignment, size_t needed_frame_size = 0) {
-    s32 subtraction, xmm_offset;
-    ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size, &subtraction, &xmm_offset);
-
-    if (subtraction != 0) {
-        code.add(code.rsp, subtraction);
-    }
-
-    // GPRs need to be popped in reverse order
-    for (int i = 15; i >= 0; i--) {
-        if (regs[i]) {
             code.pop(IndexToReg64(i));
         }
     }
diff --git a/src/core/arm/dynarmic/arm_dynarmic_32.cpp b/src/core/arm/dynarmic/arm_dynarmic_32.cpp
index 9bc86e3b9..19d798dc7 100644
--- a/src/core/arm/dynarmic/arm_dynarmic_32.cpp
+++ b/src/core/arm/dynarmic/arm_dynarmic_32.cpp
@@ -50,7 +50,8 @@ public:
     }
 
     void InterpreterFallback(u32 pc, std::size_t num_instructions) override {
-        UNIMPLEMENTED();
+        UNIMPLEMENTED_MSG("This should never happen, pc = {:08X}, code = {:08X}", pc,
+                          MemoryReadCode(pc));
     }
 
     void ExceptionRaised(u32 pc, Dynarmic::A32::Exception exception) override {
@@ -89,8 +90,6 @@ public:
 
     ARM_Dynarmic_32& parent;
     std::size_t num_interpreted_instructions{};
-    u64 tpidrro_el0{};
-    u64 tpidr_el0{};
 };
 
 std::shared_ptr<Dynarmic::A32::Jit> ARM_Dynarmic_32::MakeJit(Common::PageTable& page_table,
@@ -99,7 +98,7 @@ std::shared_ptr<Dynarmic::A32::Jit> ARM_Dynarmic_32::MakeJit(Common::PageTable&
     config.callbacks = cb.get();
     // TODO(bunnei): Implement page table for 32-bit
     // config.page_table = &page_table.pointers;
-    config.coprocessors[15] = std::make_shared<DynarmicCP15>((u32*)&CP15_regs[0]);
+    config.coprocessors[15] = cp15;
     config.define_unpredictable_behaviour = true;
     return std::make_unique<Dynarmic::A32::Jit>(config);
 }
@@ -112,13 +111,13 @@ void ARM_Dynarmic_32::Run() {
 }
 
 void ARM_Dynarmic_32::Step() {
-    cb->InterpreterFallback(jit->Regs()[15], 1);
+    jit->Step();
 }
 
 ARM_Dynarmic_32::ARM_Dynarmic_32(System& system, ExclusiveMonitor& exclusive_monitor,
                                  std::size_t core_index)
-    : ARM_Interface{system},
-      cb(std::make_unique<DynarmicCallbacks32>(*this)), core_index{core_index},
+    : ARM_Interface{system}, cb(std::make_unique<DynarmicCallbacks32>(*this)),
+      cp15(std::make_shared<DynarmicCP15>(*this)), core_index{core_index},
       exclusive_monitor{dynamic_cast<DynarmicExclusiveMonitor&>(exclusive_monitor)} {}
 
 ARM_Dynarmic_32::~ARM_Dynarmic_32() = default;
@@ -154,19 +153,19 @@ void ARM_Dynarmic_32::SetPSTATE(u32 cpsr) {
 }
 
 u64 ARM_Dynarmic_32::GetTlsAddress() const {
-    return CP15_regs[static_cast<std::size_t>(CP15Register::CP15_THREAD_URO)];
+    return cp15->uro;
 }
 
 void ARM_Dynarmic_32::SetTlsAddress(VAddr address) {
-    CP15_regs[static_cast<std::size_t>(CP15Register::CP15_THREAD_URO)] = static_cast<u32>(address);
+    cp15->uro = static_cast<u32>(address);
 }
 
 u64 ARM_Dynarmic_32::GetTPIDR_EL0() const {
-    return cb->tpidr_el0;
+    return cp15->uprw;
 }
 
 void ARM_Dynarmic_32::SetTPIDR_EL0(u64 value) {
-    cb->tpidr_el0 = value;
+    cp15->uprw = static_cast<u32>(value);
 }
 
 void ARM_Dynarmic_32::SaveContext(ThreadContext32& ctx) {
diff --git a/src/core/arm/dynarmic/arm_dynarmic_32.h b/src/core/arm/dynarmic/arm_dynarmic_32.h
index 8ba9cea8f..e5b92d7bb 100644
--- a/src/core/arm/dynarmic/arm_dynarmic_32.h
+++ b/src/core/arm/dynarmic/arm_dynarmic_32.h
@@ -22,6 +22,7 @@ class Memory;
 namespace Core {
 
 class DynarmicCallbacks32;
+class DynarmicCP15;
 class DynarmicExclusiveMonitor;
 class System;
 
@@ -66,12 +67,14 @@ private:
         std::unordered_map<JitCacheKey, std::shared_ptr<Dynarmic::A32::Jit>, Common::PairHash>;
 
     friend class DynarmicCallbacks32;
+    friend class DynarmicCP15;
+
     std::unique_ptr<DynarmicCallbacks32> cb;
     JitCacheType jit_cache;
     std::shared_ptr<Dynarmic::A32::Jit> jit;
+    std::shared_ptr<DynarmicCP15> cp15;
     std::size_t core_index;
     DynarmicExclusiveMonitor& exclusive_monitor;
-    std::array<u32, 84> CP15_regs{};
 };
 
 } // namespace Core
diff --git a/src/core/arm/dynarmic/arm_dynarmic_cp15.cpp b/src/core/arm/dynarmic/arm_dynarmic_cp15.cpp
index 3fdcdebde..d43e4dd70 100644
--- a/src/core/arm/dynarmic/arm_dynarmic_cp15.cpp
+++ b/src/core/arm/dynarmic/arm_dynarmic_cp15.cpp
@@ -2,79 +2,132 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include <fmt/format.h>
+#include "common/logging/log.h"
+#include "core/arm/dynarmic/arm_dynarmic_32.h"
 #include "core/arm/dynarmic/arm_dynarmic_cp15.h"
+#include "core/core.h"
+#include "core/core_timing.h"
+#include "core/core_timing_util.h"
 
 using Callback = Dynarmic::A32::Coprocessor::Callback;
 using CallbackOrAccessOneWord = Dynarmic::A32::Coprocessor::CallbackOrAccessOneWord;
 using CallbackOrAccessTwoWords = Dynarmic::A32::Coprocessor::CallbackOrAccessTwoWords;
 
+template <>
+struct fmt::formatter<Dynarmic::A32::CoprocReg> {
+    constexpr auto parse(format_parse_context& ctx) {
+        return ctx.begin();
+    }
+    template <typename FormatContext>
+    auto format(const Dynarmic::A32::CoprocReg& reg, FormatContext& ctx) {
+        return format_to(ctx.out(), "cp{}", static_cast<size_t>(reg));
+    }
+};
+
+namespace Core {
+
+static u32 dummy_value;
+
 std::optional<Callback> DynarmicCP15::CompileInternalOperation(bool two, unsigned opc1,
                                                                CoprocReg CRd, CoprocReg CRn,
                                                                CoprocReg CRm, unsigned opc2) {
+    LOG_CRITICAL(Core_ARM, "CP15: cdp{} p15, {}, {}, {}, {}, {}", two ? "2" : "", opc1, CRd, CRn,
+                 CRm, opc2);
     return {};
 }
 
 CallbackOrAccessOneWord DynarmicCP15::CompileSendOneWord(bool two, unsigned opc1, CoprocReg CRn,
                                                          CoprocReg CRm, unsigned opc2) {
-    // TODO(merry): Privileged CP15 registers
-
     if (!two && CRn == CoprocReg::C7 && opc1 == 0 && CRm == CoprocReg::C5 && opc2 == 4) {
+        // CP15_FLUSH_PREFETCH_BUFFER
         // This is a dummy write, we ignore the value written here.
-        return &CP15[static_cast<std::size_t>(CP15Register::CP15_FLUSH_PREFETCH_BUFFER)];
+        return &dummy_value;
     }
 
     if (!two && CRn == CoprocReg::C7 && opc1 == 0 && CRm == CoprocReg::C10) {
         switch (opc2) {
         case 4:
+            // CP15_DATA_SYNC_BARRIER
             // This is a dummy write, we ignore the value written here.
-            return &CP15[static_cast<std::size_t>(CP15Register::CP15_DATA_SYNC_BARRIER)];
+            return &dummy_value;
         case 5:
+            // CP15_DATA_MEMORY_BARRIER
             // This is a dummy write, we ignore the value written here.
-            return &CP15[static_cast<std::size_t>(CP15Register::CP15_DATA_MEMORY_BARRIER)];
-        default:
-            return {};
+            return &dummy_value;
         }
     }
 
     if (!two && CRn == CoprocReg::C13 && opc1 == 0 && CRm == CoprocReg::C0 && opc2 == 2) {
-        return &CP15[static_cast<std::size_t>(CP15Register::CP15_THREAD_UPRW)];
+        // CP15_THREAD_UPRW
+        return &uprw;
     }
 
+    LOG_CRITICAL(Core_ARM, "CP15: mcr{} p15, {}, <Rt>, {}, {}, {}", two ? "2" : "", opc1, CRn, CRm,
+                 opc2);
     return {};
 }
 
 CallbackOrAccessTwoWords DynarmicCP15::CompileSendTwoWords(bool two, unsigned opc, CoprocReg CRm) {
+    LOG_CRITICAL(Core_ARM, "CP15: mcrr{} p15, {}, <Rt>, <Rt2>, {}", two ? "2" : "", opc, CRm);
     return {};
 }
 
 CallbackOrAccessOneWord DynarmicCP15::CompileGetOneWord(bool two, unsigned opc1, CoprocReg CRn,
                                                         CoprocReg CRm, unsigned opc2) {
-    // TODO(merry): Privileged CP15 registers
-
     if (!two && CRn == CoprocReg::C13 && opc1 == 0 && CRm == CoprocReg::C0) {
         switch (opc2) {
         case 2:
-            return &CP15[static_cast<std::size_t>(CP15Register::CP15_THREAD_UPRW)];
+            // CP15_THREAD_UPRW
+            return &uprw;
         case 3:
-            return &CP15[static_cast<std::size_t>(CP15Register::CP15_THREAD_URO)];
-        default:
-            return {};
+            // CP15_THREAD_URO
+            return &uro;
         }
     }
 
+    LOG_CRITICAL(Core_ARM, "CP15: mrc{} p15, {}, <Rt>, {}, {}, {}", two ? "2" : "", opc1, CRn, CRm,
+                 opc2);
     return {};
 }
 
 CallbackOrAccessTwoWords DynarmicCP15::CompileGetTwoWords(bool two, unsigned opc, CoprocReg CRm) {
+    if (!two && opc == 0 && CRm == CoprocReg::C14) {
+        // CNTPCT
+        const auto callback = static_cast<u64 (*)(Dynarmic::A32::Jit*, void*, u32, u32)>(
+            [](Dynarmic::A32::Jit*, void* arg, u32, u32) -> u64 {
+                ARM_Dynarmic_32& parent = *(ARM_Dynarmic_32*)arg;
+                return Timing::CpuCyclesToClockCycles(parent.system.CoreTiming().GetTicks());
+            });
+        return Dynarmic::A32::Coprocessor::Callback{callback, (void*)&parent};
+    }
+
+    LOG_CRITICAL(Core_ARM, "CP15: mrrc{} p15, {}, <Rt>, <Rt2>, {}", two ? "2" : "", opc, CRm);
     return {};
 }
 
 std::optional<Callback> DynarmicCP15::CompileLoadWords(bool two, bool long_transfer, CoprocReg CRd,
                                                        std::optional<u8> option) {
+    if (option) {
+        LOG_CRITICAL(Core_ARM, "CP15: mrrc{}{} p15, {}, [...], {}", two ? "2" : "",
+                     long_transfer ? "l" : "", CRd, *option);
+    } else {
+        LOG_CRITICAL(Core_ARM, "CP15: mrrc{}{} p15, {}, [...]", two ? "2" : "",
+                     long_transfer ? "l" : "", CRd);
+    }
     return {};
 }
 
 std::optional<Callback> DynarmicCP15::CompileStoreWords(bool two, bool long_transfer, CoprocReg CRd,
                                                         std::optional<u8> option) {
+    if (option) {
+        LOG_CRITICAL(Core_ARM, "CP15: mrrc{}{} p15, {}, [...], {}", two ? "2" : "",
+                     long_transfer ? "l" : "", CRd, *option);
+    } else {
+        LOG_CRITICAL(Core_ARM, "CP15: mrrc{}{} p15, {}, [...]", two ? "2" : "",
+                     long_transfer ? "l" : "", CRd);
+    }
     return {};
 }
+
+} // namespace Core
diff --git a/src/core/arm/dynarmic/arm_dynarmic_cp15.h b/src/core/arm/dynarmic/arm_dynarmic_cp15.h
index 07bcde5f9..7356d252e 100644
--- a/src/core/arm/dynarmic/arm_dynarmic_cp15.h
+++ b/src/core/arm/dynarmic/arm_dynarmic_cp15.h
@@ -10,128 +10,15 @@
 #include <dynarmic/A32/coprocessor.h>
 #include "common/common_types.h"
 
-enum class CP15Register {
-    // c0 - Information registers
-    CP15_MAIN_ID,
-    CP15_CACHE_TYPE,
-    CP15_TCM_STATUS,
-    CP15_TLB_TYPE,
-    CP15_CPU_ID,
-    CP15_PROCESSOR_FEATURE_0,
-    CP15_PROCESSOR_FEATURE_1,
-    CP15_DEBUG_FEATURE_0,
-    CP15_AUXILIARY_FEATURE_0,
-    CP15_MEMORY_MODEL_FEATURE_0,
-    CP15_MEMORY_MODEL_FEATURE_1,
-    CP15_MEMORY_MODEL_FEATURE_2,
-    CP15_MEMORY_MODEL_FEATURE_3,
-    CP15_ISA_FEATURE_0,
-    CP15_ISA_FEATURE_1,
-    CP15_ISA_FEATURE_2,
-    CP15_ISA_FEATURE_3,
-    CP15_ISA_FEATURE_4,
+namespace Core {
 
-    // c1 - Control registers
-    CP15_CONTROL,
-    CP15_AUXILIARY_CONTROL,
-    CP15_COPROCESSOR_ACCESS_CONTROL,
-
-    // c2 - Translation table registers
-    CP15_TRANSLATION_BASE_TABLE_0,
-    CP15_TRANSLATION_BASE_TABLE_1,
-    CP15_TRANSLATION_BASE_CONTROL,
-    CP15_DOMAIN_ACCESS_CONTROL,
-    CP15_RESERVED,
-
-    // c5 - Fault status registers
-    CP15_FAULT_STATUS,
-    CP15_INSTR_FAULT_STATUS,
-    CP15_COMBINED_DATA_FSR = CP15_FAULT_STATUS,
-    CP15_INST_FSR,
-
-    // c6 - Fault Address registers
-    CP15_FAULT_ADDRESS,
-    CP15_COMBINED_DATA_FAR = CP15_FAULT_ADDRESS,
-    CP15_WFAR,
-    CP15_IFAR,
-
-    // c7 - Cache operation registers
-    CP15_WAIT_FOR_INTERRUPT,
-    CP15_PHYS_ADDRESS,
-    CP15_INVALIDATE_INSTR_CACHE,
-    CP15_INVALIDATE_INSTR_CACHE_USING_MVA,
-    CP15_INVALIDATE_INSTR_CACHE_USING_INDEX,
-    CP15_FLUSH_PREFETCH_BUFFER,
-    CP15_FLUSH_BRANCH_TARGET_CACHE,
-    CP15_FLUSH_BRANCH_TARGET_CACHE_ENTRY,
-    CP15_INVALIDATE_DATA_CACHE,
-    CP15_INVALIDATE_DATA_CACHE_LINE_USING_MVA,
-    CP15_INVALIDATE_DATA_CACHE_LINE_USING_INDEX,
-    CP15_INVALIDATE_DATA_AND_INSTR_CACHE,
-    CP15_CLEAN_DATA_CACHE,
-    CP15_CLEAN_DATA_CACHE_LINE_USING_MVA,
-    CP15_CLEAN_DATA_CACHE_LINE_USING_INDEX,
-    CP15_DATA_SYNC_BARRIER,
-    CP15_DATA_MEMORY_BARRIER,
-    CP15_CLEAN_AND_INVALIDATE_DATA_CACHE,
-    CP15_CLEAN_AND_INVALIDATE_DATA_CACHE_LINE_USING_MVA,
-    CP15_CLEAN_AND_INVALIDATE_DATA_CACHE_LINE_USING_INDEX,
-
-    // c8 - TLB operations
-    CP15_INVALIDATE_ITLB,
-    CP15_INVALIDATE_ITLB_SINGLE_ENTRY,
-    CP15_INVALIDATE_ITLB_ENTRY_ON_ASID_MATCH,
-    CP15_INVALIDATE_ITLB_ENTRY_ON_MVA,
-    CP15_INVALIDATE_DTLB,
-    CP15_INVALIDATE_DTLB_SINGLE_ENTRY,
-    CP15_INVALIDATE_DTLB_ENTRY_ON_ASID_MATCH,
-    CP15_INVALIDATE_DTLB_ENTRY_ON_MVA,
-    CP15_INVALIDATE_UTLB,
-    CP15_INVALIDATE_UTLB_SINGLE_ENTRY,
-    CP15_INVALIDATE_UTLB_ENTRY_ON_ASID_MATCH,
-    CP15_INVALIDATE_UTLB_ENTRY_ON_MVA,
-
-    // c9 - Data cache lockdown register
-    CP15_DATA_CACHE_LOCKDOWN,
-
-    // c10 - TLB/Memory map registers
-    CP15_TLB_LOCKDOWN,
-    CP15_PRIMARY_REGION_REMAP,
-    CP15_NORMAL_REGION_REMAP,
-
-    // c13 - Thread related registers
-    CP15_PID,
-    CP15_CONTEXT_ID,
-    CP15_THREAD_UPRW, // Thread ID register - User/Privileged Read/Write
-    CP15_THREAD_URO,  // Thread ID register - User Read Only (Privileged R/W)
-    CP15_THREAD_PRW,  // Thread ID register - Privileged R/W only.
-
-    // c15 - Performance and TLB lockdown registers
-    CP15_PERFORMANCE_MONITOR_CONTROL,
-    CP15_CYCLE_COUNTER,
-    CP15_COUNT_0,
-    CP15_COUNT_1,
-    CP15_READ_MAIN_TLB_LOCKDOWN_ENTRY,
-    CP15_WRITE_MAIN_TLB_LOCKDOWN_ENTRY,
-    CP15_MAIN_TLB_LOCKDOWN_VIRT_ADDRESS,
-    CP15_MAIN_TLB_LOCKDOWN_PHYS_ADDRESS,
-    CP15_MAIN_TLB_LOCKDOWN_ATTRIBUTE,
-    CP15_TLB_DEBUG_CONTROL,
-
-    // Skyeye defined
-    CP15_TLB_FAULT_ADDR,
-    CP15_TLB_FAULT_STATUS,
-
-    // Not an actual register.
-    // All registers should be defined above this.
-    CP15_REGISTER_COUNT,
-};
+class ARM_Dynarmic_32;
 
 class DynarmicCP15 final : public Dynarmic::A32::Coprocessor {
 public:
     using CoprocReg = Dynarmic::A32::CoprocReg;
 
-    explicit DynarmicCP15(u32* cp15) : CP15(cp15){};
+    explicit DynarmicCP15(ARM_Dynarmic_32& parent) : parent(parent) {}
 
     std::optional<Callback> CompileInternalOperation(bool two, unsigned opc1, CoprocReg CRd,
                                                      CoprocReg CRn, CoprocReg CRm,
@@ -147,6 +34,9 @@ public:
     std::optional<Callback> CompileStoreWords(bool two, bool long_transfer, CoprocReg CRd,
                                               std::optional<u8> option) override;
 
-private:
-    u32* CP15{};
+    ARM_Dynarmic_32& parent;
+    u32 uprw;
+    u32 uro;
 };
+
+} // namespace Core
diff --git a/src/core/hle/kernel/process.cpp b/src/core/hle/kernel/process.cpp
index 36724569f..c4c5199b1 100644
--- a/src/core/hle/kernel/process.cpp
+++ b/src/core/hle/kernel/process.cpp
@@ -132,7 +132,8 @@ std::shared_ptr<ResourceLimit> Process::GetResourceLimit() const {
 
 u64 Process::GetTotalPhysicalMemoryAvailable() const {
     const u64 capacity{resource_limit->GetCurrentResourceValue(ResourceType::PhysicalMemory) +
-                       page_table->GetTotalHeapSize() + image_size + main_thread_stack_size};
+                       page_table->GetTotalHeapSize() + GetSystemResourceSize() + image_size +
+                       main_thread_stack_size};
 
     if (capacity < memory_usage_capacity) {
         return capacity;
@@ -146,7 +147,8 @@ u64 Process::GetTotalPhysicalMemoryAvailableWithoutSystemResource() const {
 }
 
 u64 Process::GetTotalPhysicalMemoryUsed() const {
-    return image_size + main_thread_stack_size + page_table->GetTotalHeapSize();
+    return image_size + main_thread_stack_size + page_table->GetTotalHeapSize() +
+           GetSystemResourceSize();
 }
 
 u64 Process::GetTotalPhysicalMemoryUsedWithoutSystemResource() const {
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp b/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp
index cc2192e5c..0d913334e 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp
@@ -25,7 +25,7 @@ u32 nvhost_ctrl_gpu::ioctl(Ioctl command, const std::vector<u8>& input,
     case IoctlCommand::IocGetCharacteristicsCommand:
         return GetCharacteristics(input, output, output2, version);
     case IoctlCommand::IocGetTPCMasksCommand:
-        return GetTPCMasks(input, output);
+        return GetTPCMasks(input, output, output2, version);
     case IoctlCommand::IocGetActiveSlotMaskCommand:
         return GetActiveSlotMask(input, output);
     case IoctlCommand::IocZcullGetCtxSizeCommand:
@@ -98,17 +98,22 @@ u32 nvhost_ctrl_gpu::GetCharacteristics(const std::vector<u8>& input, std::vecto
     return 0;
 }
 
-u32 nvhost_ctrl_gpu::GetTPCMasks(const std::vector<u8>& input, std::vector<u8>& output) {
+u32 nvhost_ctrl_gpu::GetTPCMasks(const std::vector<u8>& input, std::vector<u8>& output,
+                                 std::vector<u8>& output2, IoctlVersion version) {
     IoctlGpuGetTpcMasksArgs params{};
     std::memcpy(&params, input.data(), input.size());
-    LOG_INFO(Service_NVDRV, "called, mask=0x{:X}, mask_buf_addr=0x{:X}", params.mask_buf_size,
-             params.mask_buf_addr);
-    // TODO(ogniK): Confirm value on hardware
-    if (params.mask_buf_size)
-        params.tpc_mask_size = 4 * 1; // 4 * num_gpc
-    else
-        params.tpc_mask_size = 0;
-    std::memcpy(output.data(), &params, sizeof(params));
+    LOG_DEBUG(Service_NVDRV, "called, mask_buffer_size=0x{:X}", params.mask_buffer_size);
+    if (params.mask_buffer_size != 0) {
+        params.tcp_mask = 3;
+    }
+
+    if (version == IoctlVersion::Version3) {
+        std::memcpy(output.data(), input.data(), output.size());
+        std::memcpy(output2.data(), &params.tcp_mask, output2.size());
+    } else {
+        std::memcpy(output.data(), &params, output.size());
+    }
+
     return 0;
 }
 
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.h b/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.h
index 07b644ec5..ef60f72ce 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.h
+++ b/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.h
@@ -92,16 +92,11 @@ private:
                   "IoctlCharacteristics is incorrect size");
 
     struct IoctlGpuGetTpcMasksArgs {
-        /// [in]  TPC mask buffer size reserved by userspace. Should be at least
-        /// sizeof(__u32) * fls(gpc_mask) to receive TPC mask for each GPC.
-        /// [out] full kernel buffer size
-        u32_le mask_buf_size;
-        u32_le reserved;
-
-        /// [in]  pointer to TPC mask buffer. It will receive one 32-bit TPC mask per GPC or 0 if
-        /// GPC is not enabled or not present. This parameter is ignored if mask_buf_size is 0.
-        u64_le mask_buf_addr;
-        u64_le tpc_mask_size; // Nintendo add this?
+        u32_le mask_buffer_size{};
+        INSERT_PADDING_WORDS(1);
+        u64_le mask_buffer_address{};
+        u32_le tcp_mask{};
+        INSERT_PADDING_WORDS(1);
     };
     static_assert(sizeof(IoctlGpuGetTpcMasksArgs) == 24,
                   "IoctlGpuGetTpcMasksArgs is incorrect size");
@@ -166,7 +161,8 @@ private:
 
     u32 GetCharacteristics(const std::vector<u8>& input, std::vector<u8>& output,
                            std::vector<u8>& output2, IoctlVersion version);
-    u32 GetTPCMasks(const std::vector<u8>& input, std::vector<u8>& output);
+    u32 GetTPCMasks(const std::vector<u8>& input, std::vector<u8>& output, std::vector<u8>& output2,
+                    IoctlVersion version);
     u32 GetActiveSlotMask(const std::vector<u8>& input, std::vector<u8>& output);
     u32 ZCullGetCtxSize(const std::vector<u8>& input, std::vector<u8>& output);
     u32 ZCullGetInfo(const std::vector<u8>& input, std::vector<u8>& output);
diff --git a/src/core/settings.h b/src/core/settings.h
index 36cd66fd4..33e1e06cd 100644
--- a/src/core/settings.h
+++ b/src/core/settings.h
@@ -437,7 +437,7 @@ struct Values {
     bool renderer_debug;
     int vulkan_device;
 
-    float resolution_factor;
+    u16 resolution_factor{1};
     int aspect_ratio;
     int max_anisotropy;
     bool use_frame_limit;
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 39d5d8401..099bb446e 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -52,6 +52,8 @@ add_library(video_core STATIC
     rasterizer_interface.h
     renderer_base.cpp
     renderer_base.h
+    renderer_opengl/gl_arb_decompiler.cpp
+    renderer_opengl/gl_arb_decompiler.h
     renderer_opengl/gl_buffer_cache.cpp
     renderer_opengl/gl_buffer_cache.h
     renderer_opengl/gl_device.cpp
diff --git a/src/video_core/buffer_cache/buffer_block.h b/src/video_core/buffer_cache/buffer_block.h
index e35ee0b67..e64170e66 100644
--- a/src/video_core/buffer_cache/buffer_block.h
+++ b/src/video_core/buffer_cache/buffer_block.h
@@ -15,48 +15,47 @@ namespace VideoCommon {
 
 class BufferBlock {
 public:
-    bool Overlaps(const VAddr start, const VAddr end) const {
+    bool Overlaps(VAddr start, VAddr end) const {
         return (cpu_addr < end) && (cpu_addr_end > start);
     }
 
-    bool IsInside(const VAddr other_start, const VAddr other_end) const {
+    bool IsInside(VAddr other_start, VAddr other_end) const {
         return cpu_addr <= other_start && other_end <= cpu_addr_end;
     }
 
-    std::size_t GetOffset(const VAddr in_addr) {
+    std::size_t Offset(VAddr in_addr) const {
         return static_cast<std::size_t>(in_addr - cpu_addr);
     }
 
-    VAddr GetCpuAddr() const {
+    VAddr CpuAddr() const {
         return cpu_addr;
     }
 
-    VAddr GetCpuAddrEnd() const {
+    VAddr CpuAddrEnd() const {
         return cpu_addr_end;
     }
 
-    void SetCpuAddr(const VAddr new_addr) {
+    void SetCpuAddr(VAddr new_addr) {
         cpu_addr = new_addr;
         cpu_addr_end = new_addr + size;
     }
 
-    std::size_t GetSize() const {
+    std::size_t Size() const {
         return size;
     }
 
-    void SetEpoch(u64 new_epoch) {
-        epoch = new_epoch;
+    u64 Epoch() const {
+        return epoch;
     }
 
-    u64 GetEpoch() {
-        return epoch;
+    void SetEpoch(u64 new_epoch) {
+        epoch = new_epoch;
     }
 
 protected:
-    explicit BufferBlock(VAddr cpu_addr, const std::size_t size) : size{size} {
-        SetCpuAddr(cpu_addr);
+    explicit BufferBlock(VAddr cpu_addr_, std::size_t size_) : size{size_} {
+        SetCpuAddr(cpu_addr_);
     }
-    ~BufferBlock() = default;
 
 private:
     VAddr cpu_addr{};
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index b88fce2cd..308d8b55f 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -30,12 +30,16 @@
 
 namespace VideoCommon {
 
-template <typename OwnerBuffer, typename BufferType, typename StreamBuffer>
+template <typename Buffer, typename BufferType, typename StreamBuffer>
 class BufferCache {
     using IntervalSet = boost::icl::interval_set<VAddr>;
     using IntervalType = typename IntervalSet::interval_type;
     using VectorMapInterval = boost::container::small_vector<MapInterval*, 1>;
 
+    static constexpr u64 WRITE_PAGE_BIT = 11;
+    static constexpr u64 BLOCK_PAGE_BITS = 21;
+    static constexpr u64 BLOCK_PAGE_SIZE = 1ULL << BLOCK_PAGE_BITS;
+
 public:
     using BufferInfo = std::pair<BufferType, u64>;
 
@@ -82,7 +86,7 @@ public:
             }
         }
 
-        OwnerBuffer block = GetBlock(cpu_addr, size);
+        Buffer* const block = GetBlock(cpu_addr, size);
         MapInterval* const map = MapAddress(block, gpu_addr, cpu_addr, size);
         if (!map) {
             return {GetEmptyBuffer(size), 0};
@@ -98,7 +102,7 @@ public:
             }
         }
 
-        return {ToHandle(block), static_cast<u64>(block->GetOffset(cpu_addr))};
+        return {block->Handle(), static_cast<u64>(block->Offset(cpu_addr))};
     }
 
     /// Uploads from a host memory. Returns the OpenGL buffer where it's located and its offset.
@@ -110,31 +114,37 @@ public:
         });
     }
 
-    void Map(std::size_t max_size) {
+    /// Prepares the buffer cache for data uploading
+    /// @param max_size Maximum number of bytes that will be uploaded
+    /// @return True when a stream buffer invalidation was required, false otherwise
+    bool Map(std::size_t max_size) {
         std::lock_guard lock{mutex};
 
+        bool invalidated;
         std::tie(buffer_ptr, buffer_offset_base, invalidated) = stream_buffer->Map(max_size, 4);
         buffer_offset = buffer_offset_base;
+
+        return invalidated;
     }
 
-    /// Finishes the upload stream, returns true on bindings invalidation.
-    bool Unmap() {
+    /// Finishes the upload stream
+    void Unmap() {
         std::lock_guard lock{mutex};
-
         stream_buffer->Unmap(buffer_offset - buffer_offset_base);
-        return std::exchange(invalidated, false);
     }
 
+    /// Function called at the end of each frame, inteded for deferred operations
     void TickFrame() {
         ++epoch;
+
         while (!pending_destruction.empty()) {
             // Delay at least 4 frames before destruction.
             // This is due to triple buffering happening on some drivers.
             static constexpr u64 epochs_to_destroy = 5;
-            if (pending_destruction.front()->GetEpoch() + epochs_to_destroy > epoch) {
+            if (pending_destruction.front()->Epoch() + epochs_to_destroy > epoch) {
                 break;
             }
-            pending_destruction.pop_front();
+            pending_destruction.pop();
         }
     }
 
@@ -249,23 +259,21 @@ public:
 
 protected:
     explicit BufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system,
-                         std::unique_ptr<StreamBuffer> stream_buffer)
-        : rasterizer{rasterizer}, system{system}, stream_buffer{std::move(stream_buffer)},
-          stream_buffer_handle{this->stream_buffer->GetHandle()} {}
+                         std::unique_ptr<StreamBuffer> stream_buffer_)
+        : rasterizer{rasterizer}, system{system}, stream_buffer{std::move(stream_buffer_)},
+          stream_buffer_handle{stream_buffer->Handle()} {}
 
     ~BufferCache() = default;
 
-    virtual BufferType ToHandle(const OwnerBuffer& storage) = 0;
+    virtual std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) = 0;
 
-    virtual OwnerBuffer CreateBlock(VAddr cpu_addr, std::size_t size) = 0;
-
-    virtual void UploadBlockData(const OwnerBuffer& buffer, std::size_t offset, std::size_t size,
+    virtual void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
                                  const u8* data) = 0;
 
-    virtual void DownloadBlockData(const OwnerBuffer& buffer, std::size_t offset, std::size_t size,
+    virtual void DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
                                    u8* data) = 0;
 
-    virtual void CopyBlock(const OwnerBuffer& src, const OwnerBuffer& dst, std::size_t src_offset,
+    virtual void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
                            std::size_t dst_offset, std::size_t size) = 0;
 
     virtual BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) {
@@ -321,7 +329,7 @@ protected:
     }
 
 private:
-    MapInterval* MapAddress(const OwnerBuffer& block, GPUVAddr gpu_addr, VAddr cpu_addr,
+    MapInterval* MapAddress(const Buffer* block, GPUVAddr gpu_addr, VAddr cpu_addr,
                             std::size_t size) {
         const VectorMapInterval overlaps = GetMapsInRange(cpu_addr, size);
         if (overlaps.empty()) {
@@ -329,11 +337,11 @@ private:
             const VAddr cpu_addr_end = cpu_addr + size;
             if (memory_manager.IsGranularRange(gpu_addr, size)) {
                 u8* host_ptr = memory_manager.GetPointer(gpu_addr);
-                UploadBlockData(block, block->GetOffset(cpu_addr), size, host_ptr);
+                UploadBlockData(*block, block->Offset(cpu_addr), size, host_ptr);
             } else {
                 staging_buffer.resize(size);
                 memory_manager.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size);
-                UploadBlockData(block, block->GetOffset(cpu_addr), size, staging_buffer.data());
+                UploadBlockData(*block, block->Offset(cpu_addr), size, staging_buffer.data());
             }
             return Register(MapInterval(cpu_addr, cpu_addr_end, gpu_addr));
         }
@@ -376,7 +384,7 @@ private:
         return map;
     }
 
-    void UpdateBlock(const OwnerBuffer& block, VAddr start, VAddr end,
+    void UpdateBlock(const Buffer* block, VAddr start, VAddr end,
                      const VectorMapInterval& overlaps) {
         const IntervalType base_interval{start, end};
         IntervalSet interval_set{};
@@ -386,13 +394,13 @@ private:
             interval_set.subtract(subtract);
         }
         for (auto& interval : interval_set) {
-            std::size_t size = interval.upper() - interval.lower();
-            if (size > 0) {
-                staging_buffer.resize(size);
-                system.Memory().ReadBlockUnsafe(interval.lower(), staging_buffer.data(), size);
-                UploadBlockData(block, block->GetOffset(interval.lower()), size,
-                                staging_buffer.data());
+            const std::size_t size = interval.upper() - interval.lower();
+            if (size == 0) {
+                continue;
             }
+            staging_buffer.resize(size);
+            system.Memory().ReadBlockUnsafe(interval.lower(), staging_buffer.data(), size);
+            UploadBlockData(*block, block->Offset(interval.lower()), size, staging_buffer.data());
         }
     }
 
@@ -422,10 +430,14 @@ private:
     }
 
     void FlushMap(MapInterval* map) {
+        const auto it = blocks.find(map->start >> BLOCK_PAGE_BITS);
+        ASSERT_OR_EXECUTE(it != blocks.end(), return;);
+
+        std::shared_ptr<Buffer> block = it->second;
+
         const std::size_t size = map->end - map->start;
-        OwnerBuffer block = blocks[map->start >> block_page_bits];
         staging_buffer.resize(size);
-        DownloadBlockData(block, block->GetOffset(map->start), size, staging_buffer.data());
+        DownloadBlockData(*block, block->Offset(map->start), size, staging_buffer.data());
         system.Memory().WriteBlockUnsafe(map->start, staging_buffer.data(), size);
         map->MarkAsModified(false, 0);
     }
@@ -448,97 +460,89 @@ private:
         buffer_offset = offset_aligned;
     }
 
-    OwnerBuffer EnlargeBlock(OwnerBuffer buffer) {
-        const std::size_t old_size = buffer->GetSize();
-        const std::size_t new_size = old_size + block_page_size;
-        const VAddr cpu_addr = buffer->GetCpuAddr();
-        OwnerBuffer new_buffer = CreateBlock(cpu_addr, new_size);
-        CopyBlock(buffer, new_buffer, 0, 0, old_size);
-        buffer->SetEpoch(epoch);
-        pending_destruction.push_back(buffer);
+    std::shared_ptr<Buffer> EnlargeBlock(std::shared_ptr<Buffer> buffer) {
+        const std::size_t old_size = buffer->Size();
+        const std::size_t new_size = old_size + BLOCK_PAGE_SIZE;
+        const VAddr cpu_addr = buffer->CpuAddr();
+        std::shared_ptr<Buffer> new_buffer = CreateBlock(cpu_addr, new_size);
+        CopyBlock(*buffer, *new_buffer, 0, 0, old_size);
+        QueueDestruction(std::move(buffer));
+
         const VAddr cpu_addr_end = cpu_addr + new_size - 1;
-        u64 page_start = cpu_addr >> block_page_bits;
-        const u64 page_end = cpu_addr_end >> block_page_bits;
-        while (page_start <= page_end) {
-            blocks[page_start] = new_buffer;
-            ++page_start;
+        const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS;
+        for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) {
+            blocks.insert_or_assign(page_start, new_buffer);
         }
+
         return new_buffer;
     }
 
-    OwnerBuffer MergeBlocks(OwnerBuffer first, OwnerBuffer second) {
-        const std::size_t size_1 = first->GetSize();
-        const std::size_t size_2 = second->GetSize();
-        const VAddr first_addr = first->GetCpuAddr();
-        const VAddr second_addr = second->GetCpuAddr();
+    std::shared_ptr<Buffer> MergeBlocks(std::shared_ptr<Buffer> first,
+                                        std::shared_ptr<Buffer> second) {
+        const std::size_t size_1 = first->Size();
+        const std::size_t size_2 = second->Size();
+        const VAddr first_addr = first->CpuAddr();
+        const VAddr second_addr = second->CpuAddr();
         const VAddr new_addr = std::min(first_addr, second_addr);
         const std::size_t new_size = size_1 + size_2;
-        OwnerBuffer new_buffer = CreateBlock(new_addr, new_size);
-        CopyBlock(first, new_buffer, 0, new_buffer->GetOffset(first_addr), size_1);
-        CopyBlock(second, new_buffer, 0, new_buffer->GetOffset(second_addr), size_2);
-        first->SetEpoch(epoch);
-        second->SetEpoch(epoch);
-        pending_destruction.push_back(first);
-        pending_destruction.push_back(second);
+
+        std::shared_ptr<Buffer> new_buffer = CreateBlock(new_addr, new_size);
+        CopyBlock(*first, *new_buffer, 0, new_buffer->Offset(first_addr), size_1);
+        CopyBlock(*second, *new_buffer, 0, new_buffer->Offset(second_addr), size_2);
+        QueueDestruction(std::move(first));
+        QueueDestruction(std::move(second));
+
         const VAddr cpu_addr_end = new_addr + new_size - 1;
-        u64 page_start = new_addr >> block_page_bits;
-        const u64 page_end = cpu_addr_end >> block_page_bits;
-        while (page_start <= page_end) {
-            blocks[page_start] = new_buffer;
-            ++page_start;
+        const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS;
+        for (u64 page_start = new_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) {
+            blocks.insert_or_assign(page_start, new_buffer);
         }
         return new_buffer;
     }
 
-    OwnerBuffer GetBlock(const VAddr cpu_addr, const std::size_t size) {
-        OwnerBuffer found;
+    Buffer* GetBlock(VAddr cpu_addr, std::size_t size) {
+        std::shared_ptr<Buffer> found;
+
         const VAddr cpu_addr_end = cpu_addr + size - 1;
-        u64 page_start = cpu_addr >> block_page_bits;
-        const u64 page_end = cpu_addr_end >> block_page_bits;
-        while (page_start <= page_end) {
+        const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS;
+        for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) {
             auto it = blocks.find(page_start);
             if (it == blocks.end()) {
                 if (found) {
                     found = EnlargeBlock(found);
-                } else {
-                    const VAddr start_addr = (page_start << block_page_bits);
-                    found = CreateBlock(start_addr, block_page_size);
-                    blocks[page_start] = found;
-                }
-            } else {
-                if (found) {
-                    if (found == it->second) {
-                        ++page_start;
-                        continue;
-                    }
-                    found = MergeBlocks(found, it->second);
-                } else {
-                    found = it->second;
+                    continue;
                 }
+                const VAddr start_addr = page_start << BLOCK_PAGE_BITS;
+                found = CreateBlock(start_addr, BLOCK_PAGE_SIZE);
+                blocks.insert_or_assign(page_start, found);
+                continue;
+            }
+            if (!found) {
+                found = it->second;
+                continue;
+            }
+            if (found != it->second) {
+                found = MergeBlocks(std::move(found), it->second);
             }
-            ++page_start;
         }
-        return found;
+        return found.get();
     }
 
-    void MarkRegionAsWritten(const VAddr start, const VAddr end) {
-        u64 page_start = start >> write_page_bit;
-        const u64 page_end = end >> write_page_bit;
-        while (page_start <= page_end) {
+    void MarkRegionAsWritten(VAddr start, VAddr end) {
+        const u64 page_end = end >> WRITE_PAGE_BIT;
+        for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
             auto it = written_pages.find(page_start);
             if (it != written_pages.end()) {
                 it->second = it->second + 1;
             } else {
-                written_pages[page_start] = 1;
+                written_pages.insert_or_assign(page_start, 1);
             }
-            ++page_start;
         }
     }
 
-    void UnmarkRegionAsWritten(const VAddr start, const VAddr end) {
-        u64 page_start = start >> write_page_bit;
-        const u64 page_end = end >> write_page_bit;
-        while (page_start <= page_end) {
+    void UnmarkRegionAsWritten(VAddr start, VAddr end) {
+        const u64 page_end = end >> WRITE_PAGE_BIT;
+        for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
             auto it = written_pages.find(page_start);
             if (it != written_pages.end()) {
                 if (it->second > 1) {
@@ -547,22 +551,24 @@ private:
                     written_pages.erase(it);
                 }
             }
-            ++page_start;
         }
     }
 
-    bool IsRegionWritten(const VAddr start, const VAddr end) const {
-        u64 page_start = start >> write_page_bit;
-        const u64 page_end = end >> write_page_bit;
-        while (page_start <= page_end) {
+    bool IsRegionWritten(VAddr start, VAddr end) const {
+        const u64 page_end = end >> WRITE_PAGE_BIT;
+        for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
             if (written_pages.count(page_start) > 0) {
                 return true;
             }
-            ++page_start;
         }
         return false;
     }
 
+    void QueueDestruction(std::shared_ptr<Buffer> buffer) {
+        buffer->SetEpoch(epoch);
+        pending_destruction.push(std::move(buffer));
+    }
+
     void MarkForAsyncFlush(MapInterval* map) {
         if (!uncommitted_flushes) {
             uncommitted_flushes = std::make_shared<std::unordered_set<MapInterval*>>();
@@ -574,9 +580,7 @@ private:
     Core::System& system;
 
     std::unique_ptr<StreamBuffer> stream_buffer;
-    BufferType stream_buffer_handle{};
-
-    bool invalidated = false;
+    BufferType stream_buffer_handle;
 
     u8* buffer_ptr = nullptr;
     u64 buffer_offset = 0;
@@ -586,18 +590,15 @@ private:
     boost::intrusive::set<MapInterval, boost::intrusive::compare<MapIntervalCompare>>
         mapped_addresses;
 
-    static constexpr u64 write_page_bit = 11;
     std::unordered_map<u64, u32> written_pages;
+    std::unordered_map<u64, std::shared_ptr<Buffer>> blocks;
 
-    static constexpr u64 block_page_bits = 21;
-    static constexpr u64 block_page_size = 1ULL << block_page_bits;
-    std::unordered_map<u64, OwnerBuffer> blocks;
-
-    std::list<OwnerBuffer> pending_destruction;
+    std::queue<std::shared_ptr<Buffer>> pending_destruction;
     u64 epoch = 0;
     u64 modified_ticks = 0;
 
     std::vector<u8> staging_buffer;
+
     std::list<MapInterval*> marked_for_unregister;
 
     std::shared_ptr<std::unordered_set<MapInterval*>> uncommitted_flushes;
diff --git a/src/video_core/engines/const_buffer_engine_interface.h b/src/video_core/engines/const_buffer_engine_interface.h
index ebe139504..f46e81bb7 100644
--- a/src/video_core/engines/const_buffer_engine_interface.h
+++ b/src/video_core/engines/const_buffer_engine_interface.h
@@ -93,6 +93,7 @@ public:
     virtual SamplerDescriptor AccessBoundSampler(ShaderType stage, u64 offset) const = 0;
     virtual SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer,
                                                     u64 offset) const = 0;
+    virtual SamplerDescriptor AccessSampler(u32 handle) const = 0;
     virtual u32 GetBoundBuffer() const = 0;
 
     virtual VideoCore::GuestDriverProfile& AccessGuestDriverProfile() = 0;
diff --git a/src/video_core/engines/kepler_compute.cpp b/src/video_core/engines/kepler_compute.cpp
index f6237fc6a..a82b06a38 100644
--- a/src/video_core/engines/kepler_compute.cpp
+++ b/src/video_core/engines/kepler_compute.cpp
@@ -92,8 +92,11 @@ SamplerDescriptor KeplerCompute::AccessBindlessSampler(ShaderType stage, u64 con
     ASSERT(stage == ShaderType::Compute);
     const auto& tex_info_buffer = launch_description.const_buffer_config[const_buffer];
     const GPUVAddr tex_info_address = tex_info_buffer.Address() + offset;
+    return AccessSampler(memory_manager.Read<u32>(tex_info_address));
+}
 
-    const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(tex_info_address)};
+SamplerDescriptor KeplerCompute::AccessSampler(u32 handle) const {
+    const Texture::TextureHandle tex_handle{handle};
     const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle);
     SamplerDescriptor result = SamplerDescriptor::FromTIC(tex_info.tic);
     result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value());
diff --git a/src/video_core/engines/kepler_compute.h b/src/video_core/engines/kepler_compute.h
index 18ceedfaf..b7f668d88 100644
--- a/src/video_core/engines/kepler_compute.h
+++ b/src/video_core/engines/kepler_compute.h
@@ -219,6 +219,8 @@ public:
     SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer,
                                             u64 offset) const override;
 
+    SamplerDescriptor AccessSampler(u32 handle) const override;
+
     u32 GetBoundBuffer() const override {
         return regs.tex_cb_index;
     }
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index e46b153f9..ea3c8a963 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -740,8 +740,11 @@ SamplerDescriptor Maxwell3D::AccessBindlessSampler(ShaderType stage, u64 const_b
     const auto& shader = state.shader_stages[static_cast<std::size_t>(stage)];
     const auto& tex_info_buffer = shader.const_buffers[const_buffer];
     const GPUVAddr tex_info_address = tex_info_buffer.address + offset;
+    return AccessSampler(memory_manager.Read<u32>(tex_info_address));
+}
 
-    const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(tex_info_address)};
+SamplerDescriptor Maxwell3D::AccessSampler(u32 handle) const {
+    const Texture::TextureHandle tex_handle{handle};
     const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle);
     SamplerDescriptor result = SamplerDescriptor::FromTIC(tex_info.tic);
     result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value());
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index 79fc9bbea..d5fe25065 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -1404,6 +1404,8 @@ public:
     SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer,
                                             u64 offset) const override;
 
+    SamplerDescriptor AccessSampler(u32 handle) const override;
+
     u32 GetBoundBuffer() const override {
         return regs.tex_cb_index;
     }
diff --git a/src/video_core/macro/macro_jit_x64.cpp b/src/video_core/macro/macro_jit_x64.cpp
index 11c1cc3be..bee34a7c0 100644
--- a/src/video_core/macro/macro_jit_x64.cpp
+++ b/src/video_core/macro/macro_jit_x64.cpp
@@ -14,22 +14,16 @@ MICROPROFILE_DEFINE(MacroJitCompile, "GPU", "Compile macro JIT", MP_RGB(173, 255
 MICROPROFILE_DEFINE(MacroJitExecute, "GPU", "Execute macro JIT", MP_RGB(255, 255, 0));
 
 namespace Tegra {
-static const Xbyak::Reg64 PARAMETERS = Xbyak::util::r9;
-static const Xbyak::Reg64 REGISTERS = Xbyak::util::r10;
-static const Xbyak::Reg64 STATE = Xbyak::util::r11;
-static const Xbyak::Reg64 NEXT_PARAMETER = Xbyak::util::r12;
-static const Xbyak::Reg32 RESULT = Xbyak::util::r13d;
-static const Xbyak::Reg64 RESULT_64 = Xbyak::util::r13;
+static const Xbyak::Reg64 STATE = Xbyak::util::rbx;
+static const Xbyak::Reg32 RESULT = Xbyak::util::ebp;
+static const Xbyak::Reg64 PARAMETERS = Xbyak::util::r12;
 static const Xbyak::Reg32 METHOD_ADDRESS = Xbyak::util::r14d;
-static const Xbyak::Reg64 METHOD_ADDRESS_64 = Xbyak::util::r14;
 static const Xbyak::Reg64 BRANCH_HOLDER = Xbyak::util::r15;
 
 static const std::bitset<32> PERSISTENT_REGISTERS = Common::X64::BuildRegSet({
-    PARAMETERS,
-    REGISTERS,
     STATE,
-    NEXT_PARAMETER,
     RESULT,
+    PARAMETERS,
     METHOD_ADDRESS,
     BRANCH_HOLDER,
 });
@@ -53,8 +47,7 @@ void MacroJITx64Impl::Execute(const std::vector<u32>& parameters, u32 method) {
     JITState state{};
     state.maxwell3d = &maxwell3d;
     state.registers = {};
-    state.parameters = parameters.data();
-    program(&state);
+    program(&state, parameters.data());
 }
 
 void MacroJITx64Impl::Compile_ALU(Macro::Opcode opcode) {
@@ -63,19 +56,21 @@ void MacroJITx64Impl::Compile_ALU(Macro::Opcode opcode) {
     const bool valid_operation = !is_a_zero && !is_b_zero;
     const bool is_move_operation = !is_a_zero && is_b_zero;
     const bool has_zero_register = is_a_zero || is_b_zero;
+    const bool no_zero_reg_skip = opcode.alu_operation == Macro::ALUOperation::AddWithCarry ||
+                                  opcode.alu_operation == Macro::ALUOperation::SubtractWithBorrow;
 
-    Xbyak::Reg64 src_a;
+    Xbyak::Reg32 src_a;
     Xbyak::Reg32 src_b;
 
-    if (!optimizer.zero_reg_skip) {
-        src_a = Compile_GetRegister(opcode.src_a, RESULT_64);
-        src_b = Compile_GetRegister(opcode.src_b, ebx);
+    if (!optimizer.zero_reg_skip || no_zero_reg_skip) {
+        src_a = Compile_GetRegister(opcode.src_a, RESULT);
+        src_b = Compile_GetRegister(opcode.src_b, eax);
     } else {
         if (!is_a_zero) {
-            src_a = Compile_GetRegister(opcode.src_a, RESULT_64);
+            src_a = Compile_GetRegister(opcode.src_a, RESULT);
         }
         if (!is_b_zero) {
-            src_b = Compile_GetRegister(opcode.src_b, ebx);
+            src_b = Compile_GetRegister(opcode.src_b, eax);
         }
     }
     Xbyak::Label skip_carry{};
@@ -190,7 +185,8 @@ void MacroJITx64Impl::Compile_AddImmediate(Macro::Opcode opcode) {
         opcode.result_operation == Macro::ResultOperation::MoveAndSetMethod) {
         if (next_opcode.has_value()) {
             const auto next = *next_opcode;
-            if (next.result_operation == Macro::ResultOperation::MoveAndSetMethod) {
+            if (next.result_operation == Macro::ResultOperation::MoveAndSetMethod &&
+                opcode.dst == next.dst) {
                 return;
             }
         }
@@ -302,22 +298,22 @@ void MacroJITx64Impl::Compile_Read(Macro::Opcode opcode) {
             sub(result, opcode.immediate * -1);
         }
     }
-    Common::X64::ABI_PushRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0);
+    Common::X64::ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
     mov(Common::X64::ABI_PARAM1, qword[STATE]);
     mov(Common::X64::ABI_PARAM2, RESULT);
     Common::X64::CallFarFunction(*this, &Read);
-    Common::X64::ABI_PopRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0);
+    Common::X64::ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
     mov(RESULT, Common::X64::ABI_RETURN.cvt32());
     Compile_ProcessResult(opcode.result_operation, opcode.dst);
 }
 
 void Tegra::MacroJITx64Impl::Compile_Send(Xbyak::Reg32 value) {
-    Common::X64::ABI_PushRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0);
+    Common::X64::ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
     mov(Common::X64::ABI_PARAM1, qword[STATE]);
     mov(Common::X64::ABI_PARAM2, METHOD_ADDRESS);
     mov(Common::X64::ABI_PARAM3, value);
     Common::X64::CallFarFunction(*this, &Send);
-    Common::X64::ABI_PopRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0);
+    Common::X64::ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
 
     Xbyak::Label dont_process{};
     // Get increment
@@ -329,7 +325,7 @@ void Tegra::MacroJITx64Impl::Compile_Send(Xbyak::Reg32 value) {
     and_(METHOD_ADDRESS, 0xfff);
     shr(ecx, 12);
     and_(ecx, 0x3f);
-    lea(eax, ptr[rcx + METHOD_ADDRESS_64]);
+    lea(eax, ptr[rcx + METHOD_ADDRESS.cvt64()]);
     sal(ecx, 12);
     or_(eax, ecx);
 
@@ -421,19 +417,15 @@ void MacroJITx64Impl::Compile() {
     bool keep_executing = true;
     labels.fill(Xbyak::Label());
 
-    Common::X64::ABI_PushRegistersAndAdjustStackGPS(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8);
+    Common::X64::ABI_PushRegistersAndAdjustStack(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8);
     // JIT state
     mov(STATE, Common::X64::ABI_PARAM1);
-    mov(PARAMETERS, qword[Common::X64::ABI_PARAM1 +
-                          static_cast<Xbyak::uint32>(offsetof(JITState, parameters))]);
-    mov(REGISTERS, Common::X64::ABI_PARAM1);
-    add(REGISTERS, static_cast<Xbyak::uint32>(offsetof(JITState, registers)));
+    mov(PARAMETERS, Common::X64::ABI_PARAM2);
     xor_(RESULT, RESULT);
     xor_(METHOD_ADDRESS, METHOD_ADDRESS);
-    xor_(NEXT_PARAMETER, NEXT_PARAMETER);
     xor_(BRANCH_HOLDER, BRANCH_HOLDER);
 
-    mov(dword[REGISTERS + 4], Compile_FetchParameter());
+    mov(dword[STATE + offsetof(JITState, registers) + 4], Compile_FetchParameter());
 
     // Track get register for zero registers and mark it as no-op
     optimizer.zero_reg_skip = true;
@@ -463,7 +455,7 @@ void MacroJITx64Impl::Compile() {
 
     L(end_of_code);
 
-    Common::X64::ABI_PopRegistersAndAdjustStackGPS(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8);
+    Common::X64::ABI_PopRegistersAndAdjustStack(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8);
     ret();
     ready();
     program = getCode<ProgramType>();
@@ -537,8 +529,8 @@ bool MacroJITx64Impl::Compile_NextInstruction() {
 }
 
 Xbyak::Reg32 Tegra::MacroJITx64Impl::Compile_FetchParameter() {
-    mov(eax, dword[PARAMETERS + NEXT_PARAMETER * sizeof(u32)]);
-    inc(NEXT_PARAMETER);
+    mov(eax, dword[PARAMETERS]);
+    add(PARAMETERS, sizeof(u32));
     return eax;
 }
 
@@ -547,31 +539,12 @@ Xbyak::Reg32 MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg32 dst) {
         // Register 0 is always zero
         xor_(dst, dst);
     } else {
-        mov(dst, dword[REGISTERS + index * sizeof(u32)]);
+        mov(dst, dword[STATE + offsetof(JITState, registers) + index * sizeof(u32)]);
     }
 
     return dst;
 }
 
-Xbyak::Reg64 Tegra::MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg64 dst) {
-    if (index == 0) {
-        // Register 0 is always zero
-        xor_(dst, dst);
-    } else {
-        mov(dst, dword[REGISTERS + index * sizeof(u32)]);
-    }
-
-    return dst;
-}
-
-void Tegra::MacroJITx64Impl::Compile_WriteCarry(Xbyak::Reg64 dst) {
-    Xbyak::Label zero{}, end{};
-    xor_(ecx, ecx);
-    shr(dst, 32);
-    setne(cl);
-    mov(dword[STATE + offsetof(JITState, carry_flag)], ecx);
-}
-
 void MacroJITx64Impl::Compile_ProcessResult(Macro::ResultOperation operation, u32 reg) {
     auto SetRegister = [=](u32 reg, Xbyak::Reg32 result) {
         // Register 0 is supposed to always return 0. NOP is implemented as a store to the zero
@@ -579,7 +552,7 @@ void MacroJITx64Impl::Compile_ProcessResult(Macro::ResultOperation operation, u3
         if (reg == 0) {
             return;
         }
-        mov(dword[REGISTERS + reg * sizeof(u32)], result);
+        mov(dword[STATE + offsetof(JITState, registers) + reg * sizeof(u32)], result);
     };
     auto SetMethodAddress = [=](Xbyak::Reg32 reg) { mov(METHOD_ADDRESS, reg); };
 
diff --git a/src/video_core/macro/macro_jit_x64.h b/src/video_core/macro/macro_jit_x64.h
index 21ee157cf..51ec090b8 100644
--- a/src/video_core/macro/macro_jit_x64.h
+++ b/src/video_core/macro/macro_jit_x64.h
@@ -55,8 +55,6 @@ private:
 
     Xbyak::Reg32 Compile_FetchParameter();
     Xbyak::Reg32 Compile_GetRegister(u32 index, Xbyak::Reg32 dst);
-    Xbyak::Reg64 Compile_GetRegister(u32 index, Xbyak::Reg64 dst);
-    void Compile_WriteCarry(Xbyak::Reg64 dst);
 
     void Compile_ProcessResult(Macro::ResultOperation operation, u32 reg);
     void Compile_Send(Xbyak::Reg32 value);
@@ -67,11 +65,10 @@ private:
     struct JITState {
         Engines::Maxwell3D* maxwell3d{};
         std::array<u32, Macro::NUM_MACRO_REGISTERS> registers{};
-        const u32* parameters{};
         u32 carry_flag{};
     };
     static_assert(offsetof(JITState, maxwell3d) == 0, "Maxwell3D is not at 0x0");
-    using ProgramType = void (*)(JITState*);
+    using ProgramType = void (*)(JITState*, const u32*);
 
     struct OptimizerState {
         bool can_skip_carry{};
@@ -85,8 +82,8 @@ private:
     std::optional<Macro::Opcode> next_opcode{};
     ProgramType program{nullptr};
 
-    std::array<Xbyak::Label, MAX_CODE_SIZE> labels{};
-    std::array<Xbyak::Label, MAX_CODE_SIZE> delay_skip{};
+    std::array<Xbyak::Label, MAX_CODE_SIZE> labels;
+    std::array<Xbyak::Label, MAX_CODE_SIZE> delay_skip;
     Xbyak::Label end_of_code{};
 
     bool is_delay_slot{};
diff --git a/src/video_core/renderer_opengl/gl_arb_decompiler.cpp b/src/video_core/renderer_opengl/gl_arb_decompiler.cpp
new file mode 100644
index 000000000..1e96b0310
--- /dev/null
+++ b/src/video_core/renderer_opengl/gl_arb_decompiler.cpp
@@ -0,0 +1,2074 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <variant>
+
+#include <fmt/format.h>
+
+#include "common/alignment.h"
+#include "common/assert.h"
+#include "common/common_types.h"
+#include "video_core/renderer_opengl/gl_arb_decompiler.h"
+#include "video_core/renderer_opengl/gl_device.h"
+#include "video_core/shader/registry.h"
+#include "video_core/shader/shader_ir.h"
+
+// Predicates in the decompiled code follow the convention that -1 means true and 0 means false.
+// GLASM lacks booleans, so they have to be implemented as integers.
+// Using -1 for true is useful because both CMP.S and NOT.U can negate it, and CMP.S can be used to
+// select between two values, because -1 will be evaluated as true and 0 as false.
+
+namespace OpenGL {
+
+namespace {
+
+using Tegra::Engines::ShaderType;
+using Tegra::Shader::Attribute;
+using Tegra::Shader::PixelImap;
+using Tegra::Shader::Register;
+using namespace VideoCommon::Shader;
+using Operation = const OperationNode&;
+
+constexpr std::array INTERNAL_FLAG_NAMES = {"ZERO", "SIGN", "CARRY", "OVERFLOW"};
+
+char Swizzle(std::size_t component) {
+    ASSERT(component < 4);
+    return component["xyzw"];
+}
+
+constexpr bool IsGenericAttribute(Attribute::Index index) {
+    return index >= Attribute::Index::Attribute_0 && index <= Attribute::Index::Attribute_31;
+}
+
+u32 GetGenericAttributeIndex(Attribute::Index index) {
+    ASSERT(IsGenericAttribute(index));
+    return static_cast<u32>(index) - static_cast<u32>(Attribute::Index::Attribute_0);
+}
+
+std::string_view Modifiers(Operation operation) {
+    const auto meta = std::get_if<MetaArithmetic>(&operation.GetMeta());
+    if (meta && meta->precise) {
+        return ".PREC";
+    }
+    return "";
+}
+
+std::string_view GetInputFlags(PixelImap attribute) {
+    switch (attribute) {
+    case PixelImap::Perspective:
+        return "";
+    case PixelImap::Constant:
+        return "FLAT ";
+    case PixelImap::ScreenLinear:
+        return "NOPERSPECTIVE ";
+    case PixelImap::Unused:
+        break;
+    }
+    UNIMPLEMENTED_MSG("Unknown attribute usage index={}", static_cast<int>(attribute));
+    return {};
+}
+
+std::string_view ImageType(Tegra::Shader::ImageType image_type) {
+    switch (image_type) {
+    case Tegra::Shader::ImageType::Texture1D:
+        return "1D";
+    case Tegra::Shader::ImageType::TextureBuffer:
+        return "BUFFER";
+    case Tegra::Shader::ImageType::Texture1DArray:
+        return "ARRAY1D";
+    case Tegra::Shader::ImageType::Texture2D:
+        return "2D";
+    case Tegra::Shader::ImageType::Texture2DArray:
+        return "ARRAY2D";
+    case Tegra::Shader::ImageType::Texture3D:
+        return "3D";
+    }
+    UNREACHABLE();
+    return {};
+}
+
+std::string_view StackName(MetaStackClass stack) {
+    switch (stack) {
+    case MetaStackClass::Ssy:
+        return "SSY";
+    case MetaStackClass::Pbk:
+        return "PBK";
+    }
+    UNREACHABLE();
+    return "";
+};
+
+std::string_view PrimitiveDescription(Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology topology) {
+    switch (topology) {
+    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::Points:
+        return "POINTS";
+    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::Lines:
+    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::LineStrip:
+        return "LINES";
+    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::LinesAdjacency:
+    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::LineStripAdjacency:
+        return "LINES_ADJACENCY";
+    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::Triangles:
+    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TriangleStrip:
+    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TriangleFan:
+        return "TRIANGLES";
+    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TrianglesAdjacency:
+    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TriangleStripAdjacency:
+        return "TRIANGLES_ADJACENCY";
+    default:
+        UNIMPLEMENTED_MSG("topology={}", static_cast<int>(topology));
+        return "POINTS";
+    }
+}
+
+std::string_view TopologyName(Tegra::Shader::OutputTopology topology) {
+    switch (topology) {
+    case Tegra::Shader::OutputTopology::PointList:
+        return "POINTS";
+    case Tegra::Shader::OutputTopology::LineStrip:
+        return "LINE_STRIP";
+    case Tegra::Shader::OutputTopology::TriangleStrip:
+        return "TRIANGLE_STRIP";
+    default:
+        UNIMPLEMENTED_MSG("Unknown output topology: {}", static_cast<u32>(topology));
+        return "points";
+    }
+}
+
+std::string_view StageInputName(ShaderType stage) {
+    switch (stage) {
+    case ShaderType::Vertex:
+    case ShaderType::Geometry:
+        return "vertex";
+    case ShaderType::Fragment:
+        return "fragment";
+    case ShaderType::Compute:
+        return "invocation";
+    default:
+        UNREACHABLE();
+        return "";
+    }
+}
+
+std::string TextureType(const MetaTexture& meta) {
+    if (meta.sampler.is_buffer) {
+        return "BUFFER";
+    }
+    std::string type;
+    if (meta.sampler.is_shadow) {
+        type += "SHADOW";
+    }
+    if (meta.sampler.is_array) {
+        type += "ARRAY";
+    }
+    type += [&meta] {
+        switch (meta.sampler.type) {
+        case Tegra::Shader::TextureType::Texture1D:
+            return "1D";
+        case Tegra::Shader::TextureType::Texture2D:
+            return "2D";
+        case Tegra::Shader::TextureType::Texture3D:
+            return "3D";
+        case Tegra::Shader::TextureType::TextureCube:
+            return "CUBE";
+        }
+        UNREACHABLE();
+        return "2D";
+    }();
+    return type;
+}
+
+std::string GlobalMemoryName(const GlobalMemoryBase& base) {
+    return fmt::format("gmem{}_{}", base.cbuf_index, base.cbuf_offset);
+}
+
+class ARBDecompiler final {
+public:
+    explicit ARBDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry,
+                           ShaderType stage, std::string_view identifier);
+
+    std::string Code() const {
+        return shader_source;
+    }
+
+private:
+    void DeclareHeader();
+    void DeclareVertex();
+    void DeclareGeometry();
+    void DeclareFragment();
+    void DeclareCompute();
+    void DeclareInputAttributes();
+    void DeclareOutputAttributes();
+    void DeclareLocalMemory();
+    void DeclareGlobalMemory();
+    void DeclareConstantBuffers();
+    void DeclareRegisters();
+    void DeclareTemporaries();
+    void DeclarePredicates();
+    void DeclareInternalFlags();
+
+    void InitializeVariables();
+
+    void DecompileAST();
+    void DecompileBranchMode();
+
+    void VisitAST(const ASTNode& node);
+    std::string VisitExpression(const Expr& node);
+
+    void VisitBlock(const NodeBlock& bb);
+
+    std::string Visit(const Node& node);
+
+    std::pair<std::string, std::size_t> BuildCoords(Operation);
+    std::string BuildAoffi(Operation);
+    void Exit();
+
+    std::string Assign(Operation);
+    std::string Select(Operation);
+    std::string FClamp(Operation);
+    std::string FCastHalf0(Operation);
+    std::string FCastHalf1(Operation);
+    std::string FSqrt(Operation);
+    std::string FSwizzleAdd(Operation);
+    std::string HAdd2(Operation);
+    std::string HMul2(Operation);
+    std::string HFma2(Operation);
+    std::string HAbsolute(Operation);
+    std::string HNegate(Operation);
+    std::string HClamp(Operation);
+    std::string HCastFloat(Operation);
+    std::string HUnpack(Operation);
+    std::string HMergeF32(Operation);
+    std::string HMergeH0(Operation);
+    std::string HMergeH1(Operation);
+    std::string HPack2(Operation);
+    std::string LogicalAssign(Operation);
+    std::string LogicalPick2(Operation);
+    std::string LogicalAnd2(Operation);
+    std::string FloatOrdered(Operation);
+    std::string FloatUnordered(Operation);
+    std::string LogicalAddCarry(Operation);
+    std::string Texture(Operation);
+    std::string TextureGather(Operation);
+    std::string TextureQueryDimensions(Operation);
+    std::string TextureQueryLod(Operation);
+    std::string TexelFetch(Operation);
+    std::string TextureGradient(Operation);
+    std::string ImageLoad(Operation);
+    std::string ImageStore(Operation);
+    std::string Branch(Operation);
+    std::string BranchIndirect(Operation);
+    std::string PushFlowStack(Operation);
+    std::string PopFlowStack(Operation);
+    std::string Exit(Operation);
+    std::string Discard(Operation);
+    std::string EmitVertex(Operation);
+    std::string EndPrimitive(Operation);
+    std::string InvocationId(Operation);
+    std::string YNegate(Operation);
+    std::string ThreadId(Operation);
+    std::string ShuffleIndexed(Operation);
+    std::string Barrier(Operation);
+    std::string MemoryBarrierGroup(Operation);
+    std::string MemoryBarrierGlobal(Operation);
+
+    template <const std::string_view& op>
+    std::string Unary(Operation operation) {
+        const std::string temporary = AllocTemporary();
+        AddLine("{}{} {}, {};", op, Modifiers(operation), temporary, Visit(operation[0]));
+        return temporary;
+    }
+
+    template <const std::string_view& op>
+    std::string Binary(Operation operation) {
+        const std::string temporary = AllocTemporary();
+        AddLine("{}{} {}, {}, {};", op, Modifiers(operation), temporary, Visit(operation[0]),
+                Visit(operation[1]));
+        return temporary;
+    }
+
+    template <const std::string_view& op>
+    std::string Trinary(Operation operation) {
+        const std::string temporary = AllocTemporary();
+        AddLine("{}{} {}, {}, {}, {};", op, Modifiers(operation), temporary, Visit(operation[0]),
+                Visit(operation[1]), Visit(operation[2]));
+        return temporary;
+    }
+
+    template <const std::string_view& op, bool unordered>
+    std::string FloatComparison(Operation operation) {
+        const std::string temporary = AllocTemporary();
+        AddLine("TRUNC.U.CC RC.x, {};", Binary<op>(operation));
+        AddLine("MOV.S {}, 0;", temporary);
+        AddLine("MOV.S {} (NE.x), -1;", temporary);
+
+        const std::string op_a = Visit(operation[0]);
+        const std::string op_b = Visit(operation[1]);
+        if constexpr (unordered) {
+            AddLine("SNE.F RC.x, {}, {};", op_a, op_a);
+            AddLine("TRUNC.U.CC RC.x, RC.x;");
+            AddLine("MOV.S {} (NE.x), -1;", temporary);
+            AddLine("SNE.F RC.x, {}, {};", op_b, op_b);
+            AddLine("TRUNC.U.CC RC.x, RC.x;");
+            AddLine("MOV.S {} (NE.x), -1;", temporary);
+        } else if (op == SNE_F) {
+            AddLine("SNE.F RC.x, {}, {};", op_a, op_a);
+            AddLine("TRUNC.U.CC RC.x, RC.x;");
+            AddLine("MOV.S {} (NE.x), 0;", temporary);
+            AddLine("SNE.F RC.x, {}, {};", op_b, op_b);
+            AddLine("TRUNC.U.CC RC.x, RC.x;");
+            AddLine("MOV.S {} (NE.x), 0;", temporary);
+        }
+        return temporary;
+    }
+
+    template <const std::string_view& op, bool is_nan>
+    std::string HalfComparison(Operation operation) {
+        const std::string tmp1 = AllocVectorTemporary();
+        const std::string tmp2 = AllocVectorTemporary();
+        const std::string op_a = Visit(operation[0]);
+        const std::string op_b = Visit(operation[1]);
+        AddLine("UP2H.F {}, {};", tmp1, op_a);
+        AddLine("UP2H.F {}, {};", tmp2, op_b);
+        AddLine("{} {}, {}, {};", op, tmp1, tmp1, tmp2);
+        AddLine("TRUNC.U.CC RC.xy, {};", tmp1);
+        AddLine("MOV.S {}.xy, {{0, 0, 0, 0}};", tmp1);
+        AddLine("MOV.S {}.x (NE.x), -1;", tmp1);
+        AddLine("MOV.S {}.y (NE.y), -1;", tmp1);
+        if constexpr (is_nan) {
+            AddLine("MOVC.F RC.x, {};", op_a);
+            AddLine("MOV.S {}.x (NAN.x), -1;", tmp1);
+            AddLine("MOVC.F RC.x, {};", op_b);
+            AddLine("MOV.S {}.y (NAN.x), -1;", tmp1);
+        }
+        return tmp1;
+    }
+
+    template <const std::string_view& op, const std::string_view& type>
+    std::string AtomicImage(Operation operation) {
+        const auto& meta = std::get<MetaImage>(operation.GetMeta());
+        const u32 image_id = device.GetBaseBindings(stage).image + meta.image.index;
+        const std::size_t num_coords = operation.GetOperandsCount();
+        const std::size_t num_values = meta.values.size();
+
+        const std::string coord = AllocVectorTemporary();
+        const std::string value = AllocVectorTemporary();
+        for (std::size_t i = 0; i < num_coords; ++i) {
+            AddLine("MOV.S {}.{}, {};", coord, Swizzle(i), Visit(operation[i]));
+        }
+        for (std::size_t i = 0; i < num_values; ++i) {
+            AddLine("MOV.F {}.{}, {};", value, Swizzle(i), Visit(meta.values[i]));
+        }
+
+        const std::string result = coord;
+        AddLine("ATOMIM.{}.{} {}.x, {}, {}, image[{}], {};", op, type, result, value, coord,
+                image_id, ImageType(meta.image.type));
+        return fmt::format("{}.x", result);
+    }
+
+    template <const std::string_view& op, const std::string_view& type>
+    std::string Atomic(Operation operation) {
+        const std::string temporary = AllocTemporary();
+        std::string address;
+        std::string_view opname;
+        if (const auto gmem = std::get_if<GmemNode>(&*operation[0])) {
+            AddLine("SUB.U {}, {}, {};", temporary, Visit(gmem->GetRealAddress()),
+                    Visit(gmem->GetBaseAddress()));
+            address = fmt::format("{}[{}]", GlobalMemoryName(gmem->GetDescriptor()), temporary);
+            opname = "ATOMB";
+        } else if (const auto smem = std::get_if<SmemNode>(&*operation[0])) {
+            address = fmt::format("shared_mem[{}]", Visit(smem->GetAddress()));
+            opname = "ATOMS";
+        } else {
+            UNREACHABLE();
+            return "{0, 0, 0, 0}";
+        }
+        AddLine("{}.{}.{} {}, {}, {};", opname, op, type, temporary, Visit(operation[1]), address);
+        return temporary;
+    }
+
+    template <char type>
+    std::string Negate(Operation operation) {
+        const std::string temporary = AllocTemporary();
+        if constexpr (type == 'F') {
+            AddLine("MOV.F32 {}, -{};", temporary, Visit(operation[0]));
+        } else {
+            AddLine("MOV.{} {}, -{};", type, temporary, Visit(operation[0]));
+        }
+        return temporary;
+    }
+
+    template <char type>
+    std::string Absolute(Operation operation) {
+        const std::string temporary = AllocTemporary();
+        AddLine("MOV.{} {}, |{}|;", type, temporary, Visit(operation[0]));
+        return temporary;
+    }
+
+    template <char type>
+    std::string BitfieldInsert(Operation operation) {
+        const std::string temporary = AllocVectorTemporary();
+        AddLine("MOV.{} {}.x, {};", type, temporary, Visit(operation[3]));
+        AddLine("MOV.{} {}.y, {};", type, temporary, Visit(operation[2]));
+        AddLine("BFI.{} {}.x, {}, {}, {};", type, temporary, temporary, Visit(operation[1]),
+                Visit(operation[0]));
+        return fmt::format("{}.x", temporary);
+    }
+
+    template <char type>
+    std::string BitfieldExtract(Operation operation) {
+        const std::string temporary = AllocVectorTemporary();
+        AddLine("MOV.{} {}.x, {};", type, temporary, Visit(operation[2]));
+        AddLine("MOV.{} {}.y, {};", type, temporary, Visit(operation[1]));
+        AddLine("BFE.{} {}.x, {}, {};", type, temporary, temporary, Visit(operation[0]));
+        return fmt::format("{}.x", temporary);
+    }
+
+    template <char swizzle>
+    std::string LocalInvocationId(Operation) {
+        return fmt::format("invocation.localid.{}", swizzle);
+    }
+
+    template <char swizzle>
+    std::string WorkGroupId(Operation) {
+        return fmt::format("invocation.groupid.{}", swizzle);
+    }
+
+    template <char c1, char c2>
+    std::string ThreadMask(Operation) {
+        return fmt::format("{}.thread{}{}mask", StageInputName(stage), c1, c2);
+    }
+
+    template <typename... Args>
+    void AddExpression(std::string_view text, Args&&... args) {
+        shader_source += fmt::format(text, std::forward<Args>(args)...);
+    }
+
+    template <typename... Args>
+    void AddLine(std::string_view text, Args&&... args) {
+        AddExpression(text, std::forward<Args>(args)...);
+        shader_source += '\n';
+    }
+
+    std::string AllocTemporary() {
+        max_temporaries = std::max(max_temporaries, num_temporaries + 1);
+        return fmt::format("T{}.x", num_temporaries++);
+    }
+
+    std::string AllocVectorTemporary() {
+        max_temporaries = std::max(max_temporaries, num_temporaries + 1);
+        return fmt::format("T{}", num_temporaries++);
+    }
+
+    void ResetTemporaries() noexcept {
+        num_temporaries = 0;
+    }
+
+    const Device& device;
+    const ShaderIR& ir;
+    const Registry& registry;
+    const ShaderType stage;
+
+    std::size_t num_temporaries = 0;
+    std::size_t max_temporaries = 0;
+
+    std::string shader_source;
+
+    static constexpr std::string_view ADD_F32 = "ADD.F32";
+    static constexpr std::string_view ADD_S = "ADD.S";
+    static constexpr std::string_view ADD_U = "ADD.U";
+    static constexpr std::string_view MUL_F32 = "MUL.F32";
+    static constexpr std::string_view MUL_S = "MUL.S";
+    static constexpr std::string_view MUL_U = "MUL.U";
+    static constexpr std::string_view DIV_F32 = "DIV.F32";
+    static constexpr std::string_view DIV_S = "DIV.S";
+    static constexpr std::string_view DIV_U = "DIV.U";
+    static constexpr std::string_view MAD_F32 = "MAD.F32";
+    static constexpr std::string_view RSQ_F32 = "RSQ.F32";
+    static constexpr std::string_view COS_F32 = "COS.F32";
+    static constexpr std::string_view SIN_F32 = "SIN.F32";
+    static constexpr std::string_view EX2_F32 = "EX2.F32";
+    static constexpr std::string_view LG2_F32 = "LG2.F32";
+    static constexpr std::string_view SLT_F = "SLT.F32";
+    static constexpr std::string_view SLT_S = "SLT.S";
+    static constexpr std::string_view SLT_U = "SLT.U";
+    static constexpr std::string_view SEQ_F = "SEQ.F32";
+    static constexpr std::string_view SEQ_S = "SEQ.S";
+    static constexpr std::string_view SEQ_U = "SEQ.U";
+    static constexpr std::string_view SLE_F = "SLE.F32";
+    static constexpr std::string_view SLE_S = "SLE.S";
+    static constexpr std::string_view SLE_U = "SLE.U";
+    static constexpr std::string_view SGT_F = "SGT.F32";
+    static constexpr std::string_view SGT_S = "SGT.S";
+    static constexpr std::string_view SGT_U = "SGT.U";
+    static constexpr std::string_view SNE_F = "SNE.F32";
+    static constexpr std::string_view SNE_S = "SNE.S";
+    static constexpr std::string_view SNE_U = "SNE.U";
+    static constexpr std::string_view SGE_F = "SGE.F32";
+    static constexpr std::string_view SGE_S = "SGE.S";
+    static constexpr std::string_view SGE_U = "SGE.U";
+    static constexpr std::string_view AND_S = "AND.S";
+    static constexpr std::string_view AND_U = "AND.U";
+    static constexpr std::string_view TRUNC_F = "TRUNC.F";
+    static constexpr std::string_view TRUNC_S = "TRUNC.S";
+    static constexpr std::string_view TRUNC_U = "TRUNC.U";
+    static constexpr std::string_view SHL_S = "SHL.S";
+    static constexpr std::string_view SHL_U = "SHL.U";
+    static constexpr std::string_view SHR_S = "SHR.S";
+    static constexpr std::string_view SHR_U = "SHR.U";
+    static constexpr std::string_view OR_S = "OR.S";
+    static constexpr std::string_view OR_U = "OR.U";
+    static constexpr std::string_view XOR_S = "XOR.S";
+    static constexpr std::string_view XOR_U = "XOR.U";
+    static constexpr std::string_view NOT_S = "NOT.S";
+    static constexpr std::string_view NOT_U = "NOT.U";
+    static constexpr std::string_view BTC_S = "BTC.S";
+    static constexpr std::string_view BTC_U = "BTC.U";
+    static constexpr std::string_view BTFM_S = "BTFM.S";
+    static constexpr std::string_view BTFM_U = "BTFM.U";
+    static constexpr std::string_view ROUND_F = "ROUND.F";
+    static constexpr std::string_view CEIL_F = "CEIL.F";
+    static constexpr std::string_view FLR_F = "FLR.F";
+    static constexpr std::string_view I2F_S = "I2F.S";
+    static constexpr std::string_view I2F_U = "I2F.U";
+    static constexpr std::string_view MIN_F = "MIN.F";
+    static constexpr std::string_view MIN_S = "MIN.S";
+    static constexpr std::string_view MIN_U = "MIN.U";
+    static constexpr std::string_view MAX_F = "MAX.F";
+    static constexpr std::string_view MAX_S = "MAX.S";
+    static constexpr std::string_view MAX_U = "MAX.U";
+    static constexpr std::string_view MOV_U = "MOV.U";
+    static constexpr std::string_view TGBALLOT_U = "TGBALLOT.U";
+    static constexpr std::string_view TGALL_U = "TGALL.U";
+    static constexpr std::string_view TGANY_U = "TGANY.U";
+    static constexpr std::string_view TGEQ_U = "TGEQ.U";
+    static constexpr std::string_view EXCH = "EXCH";
+    static constexpr std::string_view ADD = "ADD";
+    static constexpr std::string_view MIN = "MIN";
+    static constexpr std::string_view MAX = "MAX";
+    static constexpr std::string_view AND = "AND";
+    static constexpr std::string_view OR = "OR";
+    static constexpr std::string_view XOR = "XOR";
+    static constexpr std::string_view U32 = "U32";
+    static constexpr std::string_view S32 = "S32";
+
+    static constexpr std::size_t NUM_ENTRIES = static_cast<std::size_t>(OperationCode::Amount);
+    using DecompilerType = std::string (ARBDecompiler::*)(Operation);
+    static constexpr std::array<DecompilerType, NUM_ENTRIES> OPERATION_DECOMPILERS = {
+        &ARBDecompiler::Assign,
+
+        &ARBDecompiler::Select,
+
+        &ARBDecompiler::Binary<ADD_F32>,
+        &ARBDecompiler::Binary<MUL_F32>,
+        &ARBDecompiler::Binary<DIV_F32>,
+        &ARBDecompiler::Trinary<MAD_F32>,
+        &ARBDecompiler::Negate<'F'>,
+        &ARBDecompiler::Absolute<'F'>,
+        &ARBDecompiler::FClamp,
+        &ARBDecompiler::FCastHalf0,
+        &ARBDecompiler::FCastHalf1,
+        &ARBDecompiler::Binary<MIN_F>,
+        &ARBDecompiler::Binary<MAX_F>,
+        &ARBDecompiler::Unary<COS_F32>,
+        &ARBDecompiler::Unary<SIN_F32>,
+        &ARBDecompiler::Unary<EX2_F32>,
+        &ARBDecompiler::Unary<LG2_F32>,
+        &ARBDecompiler::Unary<RSQ_F32>,
+        &ARBDecompiler::FSqrt,
+        &ARBDecompiler::Unary<ROUND_F>,
+        &ARBDecompiler::Unary<FLR_F>,
+        &ARBDecompiler::Unary<CEIL_F>,
+        &ARBDecompiler::Unary<TRUNC_F>,
+        &ARBDecompiler::Unary<I2F_S>,
+        &ARBDecompiler::Unary<I2F_U>,
+        &ARBDecompiler::FSwizzleAdd,
+
+        &ARBDecompiler::Binary<ADD_S>,
+        &ARBDecompiler::Binary<MUL_S>,
+        &ARBDecompiler::Binary<DIV_S>,
+        &ARBDecompiler::Negate<'S'>,
+        &ARBDecompiler::Absolute<'S'>,
+        &ARBDecompiler::Binary<MIN_S>,
+        &ARBDecompiler::Binary<MAX_S>,
+
+        &ARBDecompiler::Unary<TRUNC_S>,
+        &ARBDecompiler::Unary<MOV_U>,
+        &ARBDecompiler::Binary<SHL_S>,
+        &ARBDecompiler::Binary<SHR_U>,
+        &ARBDecompiler::Binary<SHR_S>,
+        &ARBDecompiler::Binary<AND_S>,
+        &ARBDecompiler::Binary<OR_S>,
+        &ARBDecompiler::Binary<XOR_S>,
+        &ARBDecompiler::Unary<NOT_S>,
+        &ARBDecompiler::BitfieldInsert<'S'>,
+        &ARBDecompiler::BitfieldExtract<'S'>,
+        &ARBDecompiler::Unary<BTC_S>,
+        &ARBDecompiler::Unary<BTFM_S>,
+
+        &ARBDecompiler::Binary<ADD_U>,
+        &ARBDecompiler::Binary<MUL_U>,
+        &ARBDecompiler::Binary<DIV_U>,
+        &ARBDecompiler::Binary<MIN_U>,
+        &ARBDecompiler::Binary<MAX_U>,
+        &ARBDecompiler::Unary<TRUNC_U>,
+        &ARBDecompiler::Unary<MOV_U>,
+        &ARBDecompiler::Binary<SHL_U>,
+        &ARBDecompiler::Binary<SHR_U>,
+        &ARBDecompiler::Binary<SHR_U>,
+        &ARBDecompiler::Binary<AND_U>,
+        &ARBDecompiler::Binary<OR_U>,
+        &ARBDecompiler::Binary<XOR_U>,
+        &ARBDecompiler::Unary<NOT_U>,
+        &ARBDecompiler::BitfieldInsert<'U'>,
+        &ARBDecompiler::BitfieldExtract<'U'>,
+        &ARBDecompiler::Unary<BTC_U>,
+        &ARBDecompiler::Unary<BTFM_U>,
+
+        &ARBDecompiler::HAdd2,
+        &ARBDecompiler::HMul2,
+        &ARBDecompiler::HFma2,
+        &ARBDecompiler::HAbsolute,
+        &ARBDecompiler::HNegate,
+        &ARBDecompiler::HClamp,
+        &ARBDecompiler::HCastFloat,
+        &ARBDecompiler::HUnpack,
+        &ARBDecompiler::HMergeF32,
+        &ARBDecompiler::HMergeH0,
+        &ARBDecompiler::HMergeH1,
+        &ARBDecompiler::HPack2,
+
+        &ARBDecompiler::LogicalAssign,
+        &ARBDecompiler::Binary<AND_U>,
+        &ARBDecompiler::Binary<OR_U>,
+        &ARBDecompiler::Binary<XOR_U>,
+        &ARBDecompiler::Unary<NOT_U>,
+        &ARBDecompiler::LogicalPick2,
+        &ARBDecompiler::LogicalAnd2,
+
+        &ARBDecompiler::FloatComparison<SLT_F, false>,
+        &ARBDecompiler::FloatComparison<SEQ_F, false>,
+        &ARBDecompiler::FloatComparison<SLE_F, false>,
+        &ARBDecompiler::FloatComparison<SGT_F, false>,
+        &ARBDecompiler::FloatComparison<SNE_F, false>,
+        &ARBDecompiler::FloatComparison<SGE_F, false>,
+        &ARBDecompiler::FloatOrdered,
+        &ARBDecompiler::FloatUnordered,
+        &ARBDecompiler::FloatComparison<SLT_F, true>,
+        &ARBDecompiler::FloatComparison<SEQ_F, true>,
+        &ARBDecompiler::FloatComparison<SLE_F, true>,
+        &ARBDecompiler::FloatComparison<SGT_F, true>,
+        &ARBDecompiler::FloatComparison<SNE_F, true>,
+        &ARBDecompiler::FloatComparison<SGE_F, true>,
+
+        &ARBDecompiler::Binary<SLT_S>,
+        &ARBDecompiler::Binary<SEQ_S>,
+        &ARBDecompiler::Binary<SLE_S>,
+        &ARBDecompiler::Binary<SGT_S>,
+        &ARBDecompiler::Binary<SNE_S>,
+        &ARBDecompiler::Binary<SGE_S>,
+
+        &ARBDecompiler::Binary<SLT_U>,
+        &ARBDecompiler::Binary<SEQ_U>,
+        &ARBDecompiler::Binary<SLE_U>,
+        &ARBDecompiler::Binary<SGT_U>,
+        &ARBDecompiler::Binary<SNE_U>,
+        &ARBDecompiler::Binary<SGE_U>,
+
+        &ARBDecompiler::LogicalAddCarry,
+
+        &ARBDecompiler::HalfComparison<SLT_F, false>,
+        &ARBDecompiler::HalfComparison<SEQ_F, false>,
+        &ARBDecompiler::HalfComparison<SLE_F, false>,
+        &ARBDecompiler::HalfComparison<SGT_F, false>,
+        &ARBDecompiler::HalfComparison<SNE_F, false>,
+        &ARBDecompiler::HalfComparison<SGE_F, false>,
+        &ARBDecompiler::HalfComparison<SLT_F, true>,
+        &ARBDecompiler::HalfComparison<SEQ_F, true>,
+        &ARBDecompiler::HalfComparison<SLE_F, true>,
+        &ARBDecompiler::HalfComparison<SGT_F, true>,
+        &ARBDecompiler::HalfComparison<SNE_F, true>,
+        &ARBDecompiler::HalfComparison<SGE_F, true>,
+
+        &ARBDecompiler::Texture,
+        &ARBDecompiler::Texture,
+        &ARBDecompiler::TextureGather,
+        &ARBDecompiler::TextureQueryDimensions,
+        &ARBDecompiler::TextureQueryLod,
+        &ARBDecompiler::TexelFetch,
+        &ARBDecompiler::TextureGradient,
+
+        &ARBDecompiler::ImageLoad,
+        &ARBDecompiler::ImageStore,
+
+        &ARBDecompiler::AtomicImage<ADD, U32>,
+        &ARBDecompiler::AtomicImage<AND, U32>,
+        &ARBDecompiler::AtomicImage<OR, U32>,
+        &ARBDecompiler::AtomicImage<XOR, U32>,
+        &ARBDecompiler::AtomicImage<EXCH, U32>,
+
+        &ARBDecompiler::Atomic<EXCH, U32>,
+        &ARBDecompiler::Atomic<ADD, U32>,
+        &ARBDecompiler::Atomic<MIN, U32>,
+        &ARBDecompiler::Atomic<MAX, U32>,
+        &ARBDecompiler::Atomic<AND, U32>,
+        &ARBDecompiler::Atomic<OR, U32>,
+        &ARBDecompiler::Atomic<XOR, U32>,
+
+        &ARBDecompiler::Atomic<EXCH, S32>,
+        &ARBDecompiler::Atomic<ADD, S32>,
+        &ARBDecompiler::Atomic<MIN, S32>,
+        &ARBDecompiler::Atomic<MAX, S32>,
+        &ARBDecompiler::Atomic<AND, S32>,
+        &ARBDecompiler::Atomic<OR, S32>,
+        &ARBDecompiler::Atomic<XOR, S32>,
+
+        &ARBDecompiler::Atomic<ADD, U32>,
+        &ARBDecompiler::Atomic<MIN, U32>,
+        &ARBDecompiler::Atomic<MAX, U32>,
+        &ARBDecompiler::Atomic<AND, U32>,
+        &ARBDecompiler::Atomic<OR, U32>,
+        &ARBDecompiler::Atomic<XOR, U32>,
+
+        &ARBDecompiler::Atomic<ADD, S32>,
+        &ARBDecompiler::Atomic<MIN, S32>,
+        &ARBDecompiler::Atomic<MAX, S32>,
+        &ARBDecompiler::Atomic<AND, S32>,
+        &ARBDecompiler::Atomic<OR, S32>,
+        &ARBDecompiler::Atomic<XOR, S32>,
+
+        &ARBDecompiler::Branch,
+        &ARBDecompiler::BranchIndirect,
+        &ARBDecompiler::PushFlowStack,
+        &ARBDecompiler::PopFlowStack,
+        &ARBDecompiler::Exit,
+        &ARBDecompiler::Discard,
+
+        &ARBDecompiler::EmitVertex,
+        &ARBDecompiler::EndPrimitive,
+
+        &ARBDecompiler::InvocationId,
+        &ARBDecompiler::YNegate,
+        &ARBDecompiler::LocalInvocationId<'x'>,
+        &ARBDecompiler::LocalInvocationId<'y'>,
+        &ARBDecompiler::LocalInvocationId<'z'>,
+        &ARBDecompiler::WorkGroupId<'x'>,
+        &ARBDecompiler::WorkGroupId<'y'>,
+        &ARBDecompiler::WorkGroupId<'z'>,
+
+        &ARBDecompiler::Unary<TGBALLOT_U>,
+        &ARBDecompiler::Unary<TGALL_U>,
+        &ARBDecompiler::Unary<TGANY_U>,
+        &ARBDecompiler::Unary<TGEQ_U>,
+
+        &ARBDecompiler::ThreadId,
+        &ARBDecompiler::ThreadMask<'e', 'q'>,
+        &ARBDecompiler::ThreadMask<'g', 'e'>,
+        &ARBDecompiler::ThreadMask<'g', 't'>,
+        &ARBDecompiler::ThreadMask<'l', 'e'>,
+        &ARBDecompiler::ThreadMask<'l', 't'>,
+        &ARBDecompiler::ShuffleIndexed,
+
+        &ARBDecompiler::Barrier,
+        &ARBDecompiler::MemoryBarrierGroup,
+        &ARBDecompiler::MemoryBarrierGlobal,
+    };
+};
+
+ARBDecompiler::ARBDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry,
+                             ShaderType stage, std::string_view identifier)
+    : device{device}, ir{ir}, registry{registry}, stage{stage} {
+    AddLine("TEMP RC;");
+    AddLine("TEMP FSWZA[4];");
+    AddLine("TEMP FSWZB[4];");
+    if (ir.IsDecompiled()) {
+        DecompileAST();
+    } else {
+        DecompileBranchMode();
+    }
+    AddLine("END");
+
+    const std::string code = std::move(shader_source);
+    DeclareHeader();
+    DeclareVertex();
+    DeclareGeometry();
+    DeclareFragment();
+    DeclareCompute();
+    DeclareInputAttributes();
+    DeclareOutputAttributes();
+    DeclareLocalMemory();
+    DeclareGlobalMemory();
+    DeclareConstantBuffers();
+    DeclareRegisters();
+    DeclareTemporaries();
+    DeclarePredicates();
+    DeclareInternalFlags();
+
+    shader_source += code;
+}
+
+std::string_view HeaderStageName(ShaderType stage) {
+    switch (stage) {
+    case ShaderType::Vertex:
+        return "vp";
+    case ShaderType::Geometry:
+        return "gp";
+    case ShaderType::Fragment:
+        return "fp";
+    case ShaderType::Compute:
+        return "cp";
+    default:
+        UNREACHABLE();
+        return "";
+    }
+}
+
+void ARBDecompiler::DeclareHeader() {
+    AddLine("!!NV{}5.0", HeaderStageName(stage));
+    // Enabling this allows us to cheat on some instructions like TXL with SHADOWARRAY2D
+    AddLine("OPTION NV_internal;");
+    AddLine("OPTION NV_gpu_program_fp64;");
+    AddLine("OPTION NV_shader_storage_buffer;");
+    AddLine("OPTION NV_shader_thread_group;");
+    if (ir.UsesWarps() && device.HasWarpIntrinsics()) {
+        AddLine("OPTION NV_shader_thread_shuffle;");
+    }
+    if (stage == ShaderType::Vertex) {
+        if (device.HasNvViewportArray2()) {
+            AddLine("OPTION NV_viewport_array2;");
+        }
+    }
+    if (stage == ShaderType::Fragment) {
+        AddLine("OPTION ARB_draw_buffers;");
+    }
+    if (device.HasImageLoadFormatted()) {
+        AddLine("OPTION EXT_shader_image_load_formatted;");
+    }
+}
+
+void ARBDecompiler::DeclareVertex() {
+    if (stage != ShaderType::Vertex) {
+        return;
+    }
+    AddLine("OUTPUT result_clip[] = {{ result.clip[0..7] }};");
+}
+
+void ARBDecompiler::DeclareGeometry() {
+    if (stage != ShaderType::Geometry) {
+        return;
+    }
+    const auto& info = registry.GetGraphicsInfo();
+    const auto& header = ir.GetHeader();
+    AddLine("PRIMITIVE_IN {};", PrimitiveDescription(info.primitive_topology));
+    AddLine("PRIMITIVE_OUT {};", TopologyName(header.common3.output_topology));
+    AddLine("VERTICES_OUT {};", header.common4.max_output_vertices.Value());
+    AddLine("ATTRIB vertex_position = vertex.position;");
+}
+
+void ARBDecompiler::DeclareFragment() {
+    if (stage != ShaderType::Fragment) {
+        return;
+    }
+    AddLine("OUTPUT result_color7 = result.color[7];");
+    AddLine("OUTPUT result_color6 = result.color[6];");
+    AddLine("OUTPUT result_color5 = result.color[5];");
+    AddLine("OUTPUT result_color4 = result.color[4];");
+    AddLine("OUTPUT result_color3 = result.color[3];");
+    AddLine("OUTPUT result_color2 = result.color[2];");
+    AddLine("OUTPUT result_color1 = result.color[1];");
+    AddLine("OUTPUT result_color0 = result.color;");
+}
+
+void ARBDecompiler::DeclareCompute() {
+    if (stage != ShaderType::Compute) {
+        return;
+    }
+    const ComputeInfo& info = registry.GetComputeInfo();
+    AddLine("GROUP_SIZE {} {} {};", info.workgroup_size[0], info.workgroup_size[1],
+            info.workgroup_size[2]);
+    if (info.shared_memory_size_in_words > 0) {
+        const u32 size_in_bytes = info.shared_memory_size_in_words * 4;
+        AddLine("SHARED_MEMORY {};", size_in_bytes);
+        AddLine("SHARED shared_mem[] = {{program.sharedmem}};");
+    }
+}
+
+void ARBDecompiler::DeclareInputAttributes() {
+    if (stage == ShaderType::Compute) {
+        return;
+    }
+    const std::string_view stage_name = StageInputName(stage);
+    for (const auto attribute : ir.GetInputAttributes()) {
+        if (!IsGenericAttribute(attribute)) {
+            continue;
+        }
+        const u32 index = GetGenericAttributeIndex(attribute);
+
+        std::string_view suffix;
+        if (stage == ShaderType::Fragment) {
+            const auto input_mode{ir.GetHeader().ps.GetPixelImap(index)};
+            if (input_mode == PixelImap::Unused) {
+                return;
+            }
+            suffix = GetInputFlags(input_mode);
+        }
+        AddLine("{}ATTRIB in_attr{}[] = {{ {}.attrib[{}..{}] }};", suffix, index, stage_name, index,
+                index);
+    }
+}
+
+void ARBDecompiler::DeclareOutputAttributes() {
+    if (stage == ShaderType::Compute) {
+        return;
+    }
+    for (const auto attribute : ir.GetOutputAttributes()) {
+        if (!IsGenericAttribute(attribute)) {
+            continue;
+        }
+        const u32 index = GetGenericAttributeIndex(attribute);
+        AddLine("OUTPUT out_attr{}[] = {{ result.attrib[{}..{}] }};", index, index, index);
+    }
+}
+
+void ARBDecompiler::DeclareLocalMemory() {
+    u64 size = 0;
+    if (stage == ShaderType::Compute) {
+        size = registry.GetComputeInfo().local_memory_size_in_words * 4ULL;
+    } else {
+        size = ir.GetHeader().GetLocalMemorySize();
+    }
+    if (size == 0) {
+        return;
+    }
+    const u64 element_count = Common::AlignUp(size, 4) / 4;
+    AddLine("TEMP lmem[{}];", element_count);
+}
+
+void ARBDecompiler::DeclareGlobalMemory() {
+    u32 binding = 0; // device.GetBaseBindings(stage).shader_storage_buffer;
+    for (const auto& pair : ir.GetGlobalMemory()) {
+        const auto& base = pair.first;
+        AddLine("STORAGE {}[] = {{ program.storage[{}] }};", GlobalMemoryName(base), binding);
+        ++binding;
+    }
+}
+
+void ARBDecompiler::DeclareConstantBuffers() {
+    u32 binding = 0;
+    for (const auto& cbuf : ir.GetConstantBuffers()) {
+        AddLine("CBUFFER cbuf{}[] = {{ program.buffer[{}] }};", cbuf.first, binding);
+        ++binding;
+    }
+}
+
+void ARBDecompiler::DeclareRegisters() {
+    for (const u32 gpr : ir.GetRegisters()) {
+        AddLine("TEMP R{};", gpr);
+    }
+}
+
+void ARBDecompiler::DeclareTemporaries() {
+    for (std::size_t i = 0; i < max_temporaries; ++i) {
+        AddLine("TEMP T{};", i);
+    }
+}
+
+void ARBDecompiler::DeclarePredicates() {
+    for (const Tegra::Shader::Pred pred : ir.GetPredicates()) {
+        AddLine("TEMP P{};", static_cast<u64>(pred));
+    }
+}
+
+void ARBDecompiler::DeclareInternalFlags() {
+    for (const char* name : INTERNAL_FLAG_NAMES) {
+        AddLine("TEMP {};", name);
+    }
+}
+
+void ARBDecompiler::InitializeVariables() {
+    AddLine("MOV.F32 FSWZA[0], -1;");
+    AddLine("MOV.F32 FSWZA[1], 1;");
+    AddLine("MOV.F32 FSWZA[2], -1;");
+    AddLine("MOV.F32 FSWZA[3], 0;");
+    AddLine("MOV.F32 FSWZB[0], -1;");
+    AddLine("MOV.F32 FSWZB[1], -1;");
+    AddLine("MOV.F32 FSWZB[2], 1;");
+    AddLine("MOV.F32 FSWZB[3], -1;");
+
+    if (stage == ShaderType::Vertex || stage == ShaderType::Geometry) {
+        AddLine("MOV.F result.position, {{0, 0, 0, 1}};");
+    }
+    for (const auto attribute : ir.GetOutputAttributes()) {
+        if (!IsGenericAttribute(attribute)) {
+            continue;
+        }
+        const u32 index = GetGenericAttributeIndex(attribute);
+        AddLine("MOV.F result.attrib[{}], {{0, 0, 0, 1}};", index);
+    }
+    for (const u32 gpr : ir.GetRegisters()) {
+        AddLine("MOV.F R{}, {{0, 0, 0, 0}};", gpr);
+    }
+    for (const Tegra::Shader::Pred pred : ir.GetPredicates()) {
+        AddLine("MOV.U P{}, {{0, 0, 0, 0}};", static_cast<u64>(pred));
+    }
+}
+
+void ARBDecompiler::DecompileAST() {
+    const u32 num_flow_variables = ir.GetASTNumVariables();
+    for (u32 i = 0; i < num_flow_variables; ++i) {
+        AddLine("TEMP F{};", i);
+    }
+    for (u32 i = 0; i < num_flow_variables; ++i) {
+        AddLine("MOV.U F{}, {{0, 0, 0, 0}};", i);
+    }
+
+    InitializeVariables();
+
+    VisitAST(ir.GetASTProgram());
+}
+
+void ARBDecompiler::DecompileBranchMode() {
+    static constexpr u32 FLOW_STACK_SIZE = 20;
+    if (!ir.IsFlowStackDisabled()) {
+        AddLine("TEMP SSY[{}];", FLOW_STACK_SIZE);
+        AddLine("TEMP PBK[{}];", FLOW_STACK_SIZE);
+        AddLine("TEMP SSY_TOP;");
+        AddLine("TEMP PBK_TOP;");
+    }
+
+    AddLine("TEMP PC;");
+
+    if (!ir.IsFlowStackDisabled()) {
+        AddLine("MOV.U SSY_TOP.x, 0;");
+        AddLine("MOV.U PBK_TOP.x, 0;");
+    }
+
+    InitializeVariables();
+
+    const auto basic_block_end = ir.GetBasicBlocks().end();
+    auto basic_block_it = ir.GetBasicBlocks().begin();
+    const u32 first_address = basic_block_it->first;
+    AddLine("MOV.U PC.x, {};", first_address);
+
+    AddLine("REP;");
+
+    std::size_t num_blocks = 0;
+    while (basic_block_it != basic_block_end) {
+        const auto& [address, bb] = *basic_block_it;
+        ++num_blocks;
+
+        AddLine("SEQ.S.CC RC.x, PC.x, {};", address);
+        AddLine("IF NE.x;");
+
+        VisitBlock(bb);
+
+        ++basic_block_it;
+
+        if (basic_block_it != basic_block_end) {
+            const auto op = std::get_if<OperationNode>(&*bb[bb.size() - 1]);
+            if (!op || op->GetCode() != OperationCode::Branch) {
+                const u32 next_address = basic_block_it->first;
+                AddLine("MOV.U PC.x, {};", next_address);
+                AddLine("CONT;");
+            }
+        }
+
+        AddLine("ELSE;");
+    }
+    AddLine("RET;");
+    while (num_blocks--) {
+        AddLine("ENDIF;");
+    }
+
+    AddLine("ENDREP;");
+}
+
+void ARBDecompiler::VisitAST(const ASTNode& node) {
+    if (const auto ast = std::get_if<ASTProgram>(&*node->GetInnerData())) {
+        for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) {
+            VisitAST(current);
+        }
+    } else if (const auto ast = std::get_if<ASTIfThen>(&*node->GetInnerData())) {
+        const std::string condition = VisitExpression(ast->condition);
+        ResetTemporaries();
+
+        AddLine("MOVC.U RC.x, {};", condition);
+        AddLine("IF NE.x;");
+        for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) {
+            VisitAST(current);
+        }
+        AddLine("ENDIF;");
+    } else if (const auto ast = std::get_if<ASTIfElse>(&*node->GetInnerData())) {
+        AddLine("ELSE;");
+        for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) {
+            VisitAST(current);
+        }
+    } else if (const auto ast = std::get_if<ASTBlockDecoded>(&*node->GetInnerData())) {
+        VisitBlock(ast->nodes);
+    } else if (const auto ast = std::get_if<ASTVarSet>(&*node->GetInnerData())) {
+        AddLine("MOV.U F{}, {};", ast->index, VisitExpression(ast->condition));
+        ResetTemporaries();
+    } else if (const auto ast = std::get_if<ASTDoWhile>(&*node->GetInnerData())) {
+        const std::string condition = VisitExpression(ast->condition);
+        ResetTemporaries();
+        AddLine("REP;");
+        for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) {
+            VisitAST(current);
+        }
+        AddLine("MOVC.U RC.x, {};", condition);
+        AddLine("BRK (NE.x);");
+        AddLine("ENDREP;");
+    } else if (const auto ast = std::get_if<ASTReturn>(&*node->GetInnerData())) {
+        const bool is_true = ExprIsTrue(ast->condition);
+        if (!is_true) {
+            AddLine("MOVC.U RC.x, {};", VisitExpression(ast->condition));
+            AddLine("IF NE.x;");
+            ResetTemporaries();
+        }
+        if (ast->kills) {
+            AddLine("KIL TR;");
+        } else {
+            Exit();
+        }
+        if (!is_true) {
+            AddLine("ENDIF;");
+        }
+    } else if (const auto ast = std::get_if<ASTBreak>(&*node->GetInnerData())) {
+        if (ExprIsTrue(ast->condition)) {
+            AddLine("BRK;");
+        } else {
+            AddLine("MOVC.U RC.x, {};", VisitExpression(ast->condition));
+            AddLine("BRK (NE.x);");
+            ResetTemporaries();
+        }
+    } else if (std::holds_alternative<ASTLabel>(*node->GetInnerData())) {
+        // Nothing to do
+    } else {
+        UNREACHABLE();
+    }
+}
+
+std::string ARBDecompiler::VisitExpression(const Expr& node) {
+    const std::string result = AllocTemporary();
+    if (const auto expr = std::get_if<ExprAnd>(&*node)) {
+        AddLine("AND.U {}, {}, {};", result, VisitExpression(expr->operand1),
+                VisitExpression(expr->operand2));
+        return result;
+    }
+    if (const auto expr = std::get_if<ExprOr>(&*node)) {
+        const std::string result = AllocTemporary();
+        AddLine("OR.U {}, {}, {};", result, VisitExpression(expr->operand1),
+                VisitExpression(expr->operand2));
+        return result;
+    }
+    if (const auto expr = std::get_if<ExprNot>(&*node)) {
+        const std::string result = AllocTemporary();
+        AddLine("CMP.S {}, {}, 0, -1;", result, VisitExpression(expr->operand1));
+        return result;
+    }
+    if (const auto expr = std::get_if<ExprPredicate>(&*node)) {
+        return fmt::format("P{}.x", static_cast<u64>(expr->predicate));
+    }
+    if (const auto expr = std::get_if<ExprCondCode>(&*node)) {
+        return Visit(ir.GetConditionCode(expr->cc));
+    }
+    if (const auto expr = std::get_if<ExprVar>(&*node)) {
+        return fmt::format("F{}.x", expr->var_index);
+    }
+    if (const auto expr = std::get_if<ExprBoolean>(&*node)) {
+        return expr->value ? "0xffffffff" : "0";
+    }
+    if (const auto expr = std::get_if<ExprGprEqual>(&*node)) {
+        const std::string result = AllocTemporary();
+        AddLine("SEQ.U {}, R{}.x, {};", result, expr->gpr, expr->value);
+        return result;
+    }
+    UNREACHABLE();
+    return "0";
+}
+
+void ARBDecompiler::VisitBlock(const NodeBlock& bb) {
+    for (const auto& node : bb) {
+        Visit(node);
+    }
+}
+
+std::string ARBDecompiler::Visit(const Node& node) {
+    if (const auto operation = std::get_if<OperationNode>(&*node)) {
+        if (const auto amend_index = operation->GetAmendIndex()) {
+            Visit(ir.GetAmendNode(*amend_index));
+        }
+        const std::size_t index = static_cast<std::size_t>(operation->GetCode());
+        if (index >= OPERATION_DECOMPILERS.size()) {
+            UNREACHABLE_MSG("Out of bounds operation: {}", index);
+            return {};
+        }
+        const auto decompiler = OPERATION_DECOMPILERS[index];
+        if (decompiler == nullptr) {
+            UNREACHABLE_MSG("Undefined operation: {}", index);
+            return {};
+        }
+        return (this->*decompiler)(*operation);
+    }
+
+    if (const auto gpr = std::get_if<GprNode>(&*node)) {
+        const u32 index = gpr->GetIndex();
+        if (index == Register::ZeroIndex) {
+            return "{0, 0, 0, 0}.x";
+        }
+        return fmt::format("R{}.x", index);
+    }
+
+    if (const auto cv = std::get_if<CustomVarNode>(&*node)) {
+        return fmt::format("CV{}.x", cv->GetIndex());
+    }
+
+    if (const auto immediate = std::get_if<ImmediateNode>(&*node)) {
+        const std::string temporary = AllocTemporary();
+        AddLine("MOV.U {}, {};", temporary, immediate->GetValue());
+        return temporary;
+    }
+
+    if (const auto predicate = std::get_if<PredicateNode>(&*node)) {
+        const std::string temporary = AllocTemporary();
+        switch (const auto index = predicate->GetIndex(); index) {
+        case Tegra::Shader::Pred::UnusedIndex:
+            AddLine("MOV.S {}, -1;", temporary);
+            break;
+        case Tegra::Shader::Pred::NeverExecute:
+            AddLine("MOV.S {}, 0;", temporary);
+            break;
+        default:
+            AddLine("MOV.S {}, P{}.x;", temporary, static_cast<u64>(index));
+            break;
+        }
+        if (predicate->IsNegated()) {
+            AddLine("CMP.S {}, {}, 0, -1;", temporary, temporary);
+        }
+        return temporary;
+    }
+
+    if (const auto abuf = std::get_if<AbufNode>(&*node)) {
+        if (abuf->IsPhysicalBuffer()) {
+            UNIMPLEMENTED_MSG("Physical buffers are not implemented");
+            return "{0, 0, 0, 0}.x";
+        }
+
+        const auto buffer_index = [this, &abuf]() -> std::string {
+            if (stage != ShaderType::Geometry) {
+                return "";
+            }
+            return fmt::format("[{}]", Visit(abuf->GetBuffer()));
+        };
+
+        const Attribute::Index index = abuf->GetIndex();
+        const u32 element = abuf->GetElement();
+        const char swizzle = Swizzle(element);
+        switch (index) {
+        case Attribute::Index::Position: {
+            if (stage == ShaderType::Geometry) {
+                return fmt::format("{}_position[{}].{}", StageInputName(stage),
+                                   Visit(abuf->GetBuffer()), swizzle);
+            } else {
+                return fmt::format("{}.position.{}", StageInputName(stage), swizzle);
+            }
+        }
+        case Attribute::Index::TessCoordInstanceIDVertexID:
+            ASSERT(stage == ShaderType::Vertex);
+            switch (element) {
+            case 2:
+                return "vertex.instance";
+            case 3:
+                return "vertex.id";
+            }
+            UNIMPLEMENTED_MSG("Unmanaged TessCoordInstanceIDVertexID element={}", element);
+            break;
+        case Attribute::Index::PointCoord:
+            switch (element) {
+            case 0:
+                return "fragment.pointcoord.x";
+            case 1:
+                return "fragment.pointcoord.y";
+            }
+            UNIMPLEMENTED();
+            break;
+        case Attribute::Index::FrontFacing: {
+            ASSERT(stage == ShaderType::Fragment);
+            ASSERT(element == 3);
+            const std::string temporary = AllocVectorTemporary();
+            AddLine("SGT.S RC.x, fragment.facing, {{0, 0, 0, 0}};");
+            AddLine("MOV.U.CC RC.x, -RC;");
+            AddLine("MOV.S {}.x, 0;", temporary);
+            AddLine("MOV.S {}.x (NE.x), -1;", temporary);
+            return fmt::format("{}.x", temporary);
+        }
+        default:
+            if (IsGenericAttribute(index)) {
+                if (stage == ShaderType::Geometry) {
+                    return fmt::format("in_attr{}[{}][0].{}", GetGenericAttributeIndex(index),
+                                       Visit(abuf->GetBuffer()), swizzle);
+                } else {
+                    return fmt::format("{}.attrib[{}].{}", StageInputName(stage),
+                                       GetGenericAttributeIndex(index), swizzle);
+                }
+            }
+            UNIMPLEMENTED_MSG("Unimplemented input attribute={}", static_cast<int>(index));
+            break;
+        }
+        return "{0, 0, 0, 0}.x";
+    }
+
+    if (const auto cbuf = std::get_if<CbufNode>(&*node)) {
+        std::string offset_string;
+        const auto& offset = cbuf->GetOffset();
+        if (const auto imm = std::get_if<ImmediateNode>(&*offset)) {
+            offset_string = std::to_string(imm->GetValue());
+        } else {
+            offset_string = Visit(offset);
+        }
+        const std::string temporary = AllocTemporary();
+        AddLine("LDC.F32 {}, cbuf{}[{}];", temporary, cbuf->GetIndex(), offset_string);
+        return temporary;
+    }
+
+    if (const auto gmem = std::get_if<GmemNode>(&*node)) {
+        const std::string temporary = AllocTemporary();
+        AddLine("SUB.U {}, {}, {};", temporary, Visit(gmem->GetRealAddress()),
+                Visit(gmem->GetBaseAddress()));
+        AddLine("LDB.U32 {}, {}[{}];", temporary, GlobalMemoryName(gmem->GetDescriptor()),
+                temporary);
+        return temporary;
+    }
+
+    if (const auto lmem = std::get_if<LmemNode>(&*node)) {
+        const std::string temporary = Visit(lmem->GetAddress());
+        AddLine("SHR.U {}, {}, 2;", temporary, temporary);
+        AddLine("MOV.U {}, lmem[{}].x;", temporary, temporary);
+        return temporary;
+    }
+
+    if (const auto smem = std::get_if<SmemNode>(&*node)) {
+        const std::string temporary = Visit(smem->GetAddress());
+        AddLine("LDS.U32 {}, shared_mem[{}];", temporary, temporary);
+        return temporary;
+    }
+
+    if (const auto internal_flag = std::get_if<InternalFlagNode>(&*node)) {
+        const std::size_t index = static_cast<std::size_t>(internal_flag->GetFlag());
+        return fmt::format("{}.x", INTERNAL_FLAG_NAMES[index]);
+    }
+
+    if (const auto conditional = std::get_if<ConditionalNode>(&*node)) {
+        if (const auto amend_index = conditional->GetAmendIndex()) {
+            Visit(ir.GetAmendNode(*amend_index));
+        }
+        AddLine("MOVC.U RC.x, {};", Visit(conditional->GetCondition()));
+        AddLine("IF NE.x;");
+        VisitBlock(conditional->GetCode());
+        AddLine("ENDIF;");
+        return {};
+    }
+
+    if (const auto cmt = std::get_if<CommentNode>(&*node)) {
+        // Uncommenting this will generate invalid code. GLASM lacks comments.
+        // AddLine("// {}", cmt->GetText());
+        return {};
+    }
+
+    UNIMPLEMENTED();
+    return {};
+}
+
+std::pair<std::string, std::size_t> ARBDecompiler::BuildCoords(Operation operation) {
+    const auto& meta = std::get<MetaTexture>(operation.GetMeta());
+    UNIMPLEMENTED_IF(meta.sampler.is_indexed);
+    UNIMPLEMENTED_IF(meta.sampler.is_shadow && meta.sampler.is_array &&
+                     meta.sampler.type == Tegra::Shader::TextureType::TextureCube);
+
+    const std::size_t count = operation.GetOperandsCount();
+    std::string temporary = AllocVectorTemporary();
+    std::size_t i = 0;
+    for (; i < count; ++i) {
+        AddLine("MOV.F {}.{}, {};", temporary, Swizzle(i), Visit(operation[i]));
+    }
+    if (meta.sampler.is_array) {
+        AddLine("I2F.S {}.{}, {};", temporary, Swizzle(i++), Visit(meta.array));
+    }
+    if (meta.sampler.is_shadow) {
+        AddLine("MOV.F {}.{}, {};", temporary, Swizzle(i++), Visit(meta.depth_compare));
+    }
+    return {std::move(temporary), i};
+}
+
+std::string ARBDecompiler::BuildAoffi(Operation operation) {
+    const auto& meta = std::get<MetaTexture>(operation.GetMeta());
+    if (meta.aoffi.empty()) {
+        return {};
+    }
+    const std::string temporary = AllocVectorTemporary();
+    std::size_t i = 0;
+    for (auto& node : meta.aoffi) {
+        AddLine("MOV.S {}.{}, {};", temporary, Swizzle(i++), Visit(node));
+    }
+    return fmt::format(", offset({})", temporary);
+}
+
+void ARBDecompiler::Exit() {
+    if (stage != ShaderType::Fragment) {
+        AddLine("RET;");
+        return;
+    }
+
+    const auto safe_get_register = [this](u32 reg) -> std::string {
+        // TODO(Rodrigo): Replace with contains once C++20 releases
+        const auto& used_registers = ir.GetRegisters();
+        if (used_registers.find(reg) != used_registers.end()) {
+            return fmt::format("R{}.x", reg);
+        }
+        return "{0, 0, 0, 0}.x";
+    };
+
+    const auto& header = ir.GetHeader();
+    u32 current_reg = 0;
+    for (u32 rt = 0; rt < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets; ++rt) {
+        for (u32 component = 0; component < 4; ++component) {
+            if (!header.ps.IsColorComponentOutputEnabled(rt, component)) {
+                continue;
+            }
+            AddLine("MOV.F result_color{}.{}, {};", rt, Swizzle(component),
+                    safe_get_register(current_reg));
+            ++current_reg;
+        }
+    }
+    if (header.ps.omap.depth) {
+        AddLine("MOV.F result.depth.z, {};", safe_get_register(current_reg + 1));
+    }
+
+    AddLine("RET;");
+}
+
+std::string ARBDecompiler::Assign(Operation operation) {
+    const Node& dest = operation[0];
+    const Node& src = operation[1];
+
+    std::string dest_name;
+    if (const auto gpr = std::get_if<GprNode>(&*dest)) {
+        if (gpr->GetIndex() == Register::ZeroIndex) {
+            // Writing to Register::ZeroIndex is a no op
+            return {};
+        }
+        dest_name = fmt::format("R{}.x", gpr->GetIndex());
+    } else if (const auto abuf = std::get_if<AbufNode>(&*dest)) {
+        const u32 element = abuf->GetElement();
+        const char swizzle = Swizzle(element);
+        switch (const Attribute::Index index = abuf->GetIndex()) {
+        case Attribute::Index::Position:
+            dest_name = fmt::format("result.position.{}", swizzle);
+            break;
+        case Attribute::Index::LayerViewportPointSize:
+            switch (element) {
+            case 0:
+                UNIMPLEMENTED();
+                return {};
+            case 1:
+            case 2:
+                if (!device.HasNvViewportArray2()) {
+                    LOG_ERROR(
+                        Render_OpenGL,
+                        "NV_viewport_array2 is missing. Maxwell gen 2 or better is required.");
+                    return {};
+                }
+                dest_name = element == 1 ? "result.layer.x" : "result.viewport.x";
+                break;
+            case 3:
+                dest_name = "result.pointsize.x";
+                break;
+            }
+            break;
+        case Attribute::Index::ClipDistances0123:
+            dest_name = fmt::format("result.clip[{}].x", element);
+            break;
+        case Attribute::Index::ClipDistances4567:
+            dest_name = fmt::format("result.clip[{}].x", element + 4);
+            break;
+        default:
+            if (!IsGenericAttribute(index)) {
+                UNREACHABLE();
+                return {};
+            }
+            dest_name =
+                fmt::format("result.attrib[{}].{}", GetGenericAttributeIndex(index), swizzle);
+            break;
+        }
+    } else if (const auto lmem = std::get_if<LmemNode>(&*dest)) {
+        const std::string address = Visit(lmem->GetAddress());
+        AddLine("SHR.U {}, {}, 2;", address, address);
+        dest_name = fmt::format("lmem[{}].x", address);
+    } else if (const auto smem = std::get_if<SmemNode>(&*dest)) {
+        AddLine("STS.U32 {}, shared_mem[{}];", Visit(src), Visit(smem->GetAddress()));
+        ResetTemporaries();
+        return {};
+    } else if (const auto gmem = std::get_if<GmemNode>(&*dest)) {
+        const std::string temporary = AllocTemporary();
+        AddLine("SUB.U {}, {}, {};", temporary, Visit(gmem->GetRealAddress()),
+                Visit(gmem->GetBaseAddress()));
+        AddLine("STB.U32 {}, {}[{}];", Visit(src), GlobalMemoryName(gmem->GetDescriptor()),
+                temporary);
+        ResetTemporaries();
+        return {};
+    } else {
+        UNREACHABLE();
+        ResetTemporaries();
+        return {};
+    }
+
+    AddLine("MOV.U {}, {};", dest_name, Visit(src));
+    ResetTemporaries();
+    return {};
+}
+
+std::string ARBDecompiler::Select(Operation operation) {
+    const std::string temporary = AllocTemporary();
+    AddLine("CMP.S {}, {}, {}, {};", temporary, Visit(operation[0]), Visit(operation[1]),
+            Visit(operation[2]));
+    return temporary;
+}
+
+std::string ARBDecompiler::FClamp(Operation operation) {
+    // 1.0f in hex, replace with std::bit_cast on C++20
+    static constexpr u32 POSITIVE_ONE = 0x3f800000;
+
+    const std::string temporary = AllocTemporary();
+    const Node& value = operation[0];
+    const Node& low = operation[1];
+    const Node& high = operation[2];
+    const auto imm_low = std::get_if<ImmediateNode>(&*low);
+    const auto imm_high = std::get_if<ImmediateNode>(&*high);
+    if (imm_low && imm_high && imm_low->GetValue() == 0 && imm_high->GetValue() == POSITIVE_ONE) {
+        AddLine("MOV.F32.SAT {}, {};", temporary, Visit(value));
+    } else {
+        AddLine("MIN.F {}, {}, {};", temporary, Visit(value), Visit(high));
+        AddLine("MAX.F {}, {}, {};", temporary, temporary, Visit(low));
+    }
+    return temporary;
+}
+
+std::string ARBDecompiler::FCastHalf0(Operation operation) {
+    const std::string temporary = AllocVectorTemporary();
+    AddLine("UP2H.F {}.x, {};", temporary, Visit(operation[0]));
+    return fmt::format("{}.x", temporary);
+}
+
+std::string ARBDecompiler::FCastHalf1(Operation operation) {
+    const std::string temporary = AllocVectorTemporary();
+    AddLine("UP2H.F {}.y, {};", temporary, Visit(operation[0]));
+    AddLine("MOV {}.x, {}.y;", temporary, temporary);
+    return fmt::format("{}.x", temporary);
+}
+
+std::string ARBDecompiler::FSqrt(Operation operation) {
+    const std::string temporary = AllocTemporary();
+    AddLine("RSQ.F32 {}, {};", temporary, Visit(operation[0]));
+    AddLine("RCP.F32 {}, {};", temporary, temporary);
+    return temporary;
+}
+
+std::string ARBDecompiler::FSwizzleAdd(Operation operation) {
+    const std::string temporary = AllocVectorTemporary();
+    if (!device.HasWarpIntrinsics()) {
+        LOG_ERROR(Render_OpenGL,
+                  "NV_shader_thread_shuffle is missing. Kepler or better is required.");
+        AddLine("ADD.F {}.x, {}, {};", temporary, Visit(operation[0]), Visit(operation[1]));
+        return fmt::format("{}.x", temporary);
+    }
+    const std::string lut = AllocVectorTemporary();
+    AddLine("AND.U {}.z, {}.threadid, 3;", temporary, StageInputName(stage));
+    AddLine("SHL.U {}.z, {}.z, 1;", temporary, temporary);
+    AddLine("SHR.U {}.z, {}, {}.z;", temporary, Visit(operation[2]), temporary);
+    AddLine("AND.U {}.z, {}.z, 3;", temporary, temporary);
+    AddLine("MUL.F32 {}.x, {}, FSWZA[{}.z];", temporary, Visit(operation[0]), temporary);
+    AddLine("MUL.F32 {}.y, {}, FSWZB[{}.z];", temporary, Visit(operation[1]), temporary);
+    AddLine("ADD.F32 {}.x, {}.x, {}.y;", temporary, temporary, temporary);
+    return fmt::format("{}.x", temporary);
+}
+
+std::string ARBDecompiler::HAdd2(Operation operation) {
+    const std::string tmp1 = AllocVectorTemporary();
+    const std::string tmp2 = AllocVectorTemporary();
+    AddLine("UP2H.F {}.xy, {};", tmp1, Visit(operation[0]));
+    AddLine("UP2H.F {}.xy, {};", tmp2, Visit(operation[1]));
+    AddLine("ADD.F16 {}, {}, {};", tmp1, tmp1, tmp2);
+    AddLine("PK2H.F {}.x, {};", tmp1, tmp1);
+    return fmt::format("{}.x", tmp1);
+}
+
+std::string ARBDecompiler::HMul2(Operation operation) {
+    const std::string tmp1 = AllocVectorTemporary();
+    const std::string tmp2 = AllocVectorTemporary();
+    AddLine("UP2H.F {}.xy, {};", tmp1, Visit(operation[0]));
+    AddLine("UP2H.F {}.xy, {};", tmp2, Visit(operation[1]));
+    AddLine("MUL.F16 {}, {}, {};", tmp1, tmp1, tmp2);
+    AddLine("PK2H.F {}.x, {};", tmp1, tmp1);
+    return fmt::format("{}.x", tmp1);
+}
+
+std::string ARBDecompiler::HFma2(Operation operation) {
+    const std::string tmp1 = AllocVectorTemporary();
+    const std::string tmp2 = AllocVectorTemporary();
+    const std::string tmp3 = AllocVectorTemporary();
+    AddLine("UP2H.F {}.xy, {};", tmp1, Visit(operation[0]));
+    AddLine("UP2H.F {}.xy, {};", tmp2, Visit(operation[1]));
+    AddLine("UP2H.F {}.xy, {};", tmp3, Visit(operation[2]));
+    AddLine("MAD.F16 {}, {}, {}, {};", tmp1, tmp1, tmp2, tmp3);
+    AddLine("PK2H.F {}.x, {};", tmp1, tmp1);
+    return fmt::format("{}.x", tmp1);
+}
+
+std::string ARBDecompiler::HAbsolute(Operation operation) {
+    const std::string temporary = AllocVectorTemporary();
+    AddLine("UP2H.F {}.xy, {};", temporary, Visit(operation[0]));
+    AddLine("PK2H.F {}.x, |{}|;", temporary, temporary);
+    return fmt::format("{}.x", temporary);
+}
+
+std::string ARBDecompiler::HNegate(Operation operation) {
+    const std::string temporary = AllocVectorTemporary();
+    AddLine("UP2H.F {}.xy, {};", temporary, Visit(operation[0]));
+    AddLine("MOVC.S RC.x, {};", Visit(operation[1]));
+    AddLine("MOV.F {}.x (NE.x), -{}.x;", temporary, temporary);
+    AddLine("MOVC.S RC.x, {};", Visit(operation[2]));
+    AddLine("MOV.F {}.y (NE.x), -{}.y;", temporary, temporary);
+    AddLine("PK2H.F {}.x, {};", temporary, temporary);
+    return fmt::format("{}.x", temporary);
+}
+
+std::string ARBDecompiler::HClamp(Operation operation) {
+    const std::string tmp1 = AllocVectorTemporary();
+    const std::string tmp2 = AllocVectorTemporary();
+    AddLine("UP2H.F {}.xy, {};", tmp1, Visit(operation[0]));
+    AddLine("MOV.U {}.x, {};", tmp2, Visit(operation[1]));
+    AddLine("MOV.U {}.y, {}.x;", tmp2, tmp2);
+    AddLine("MAX.F {}, {}, {};", tmp1, tmp1, tmp2);
+    AddLine("MOV.U {}.x, {};", tmp2, Visit(operation[2]));
+    AddLine("MOV.U {}.y, {}.x;", tmp2, tmp2);
+    AddLine("MIN.F {}, {}, {};", tmp1, tmp1, tmp2);
+    AddLine("PK2H.F {}.x, {};", tmp1, tmp1);
+    return fmt::format("{}.x", tmp1);
+}
+
+std::string ARBDecompiler::HCastFloat(Operation operation) {
+    const std::string temporary = AllocVectorTemporary();
+    AddLine("MOV.F {}.y, {{0, 0, 0, 0}};", temporary);
+    AddLine("MOV.F {}.x, {};", temporary, Visit(operation[0]));
+    AddLine("PK2H.F {}.x, {};", temporary, temporary);
+    return fmt::format("{}.x", temporary);
+}
+
+std::string ARBDecompiler::HUnpack(Operation operation) {
+    const std::string operand = Visit(operation[0]);
+    switch (std::get<Tegra::Shader::HalfType>(operation.GetMeta())) {
+    case Tegra::Shader::HalfType::H0_H1:
+        return operand;
+    case Tegra::Shader::HalfType::F32: {
+        const std::string temporary = AllocVectorTemporary();
+        AddLine("MOV.U {}.x, {};", temporary, operand);
+        AddLine("MOV.U {}.y, {}.x;", temporary, temporary);
+        AddLine("PK2H.F {}.x, {};", temporary, temporary);
+        return fmt::format("{}.x", temporary);
+    }
+    case Tegra::Shader::HalfType::H0_H0: {
+        const std::string temporary = AllocVectorTemporary();
+        AddLine("UP2H.F {}.xy, {};", temporary, operand);
+        AddLine("MOV.U {}.y, {}.x;", temporary, temporary);
+        AddLine("PK2H.F {}.x, {};", temporary, temporary);
+        return fmt::format("{}.x", temporary);
+    }
+    case Tegra::Shader::HalfType::H1_H1: {
+        const std::string temporary = AllocVectorTemporary();
+        AddLine("UP2H.F {}.xy, {};", temporary, operand);
+        AddLine("MOV.U {}.x, {}.y;", temporary, temporary);
+        AddLine("PK2H.F {}.x, {};", temporary, temporary);
+        return fmt::format("{}.x", temporary);
+    }
+    }
+    UNREACHABLE();
+    return "{0, 0, 0, 0}.x";
+}
+
+std::string ARBDecompiler::HMergeF32(Operation operation) {
+    const std::string temporary = AllocVectorTemporary();
+    AddLine("UP2H.F {}.xy, {};", temporary, Visit(operation[0]));
+    return fmt::format("{}.x", temporary);
+}
+
+std::string ARBDecompiler::HMergeH0(Operation operation) {
+    const std::string temporary = AllocVectorTemporary();
+    AddLine("UP2H.F {}.xy, {};", temporary, Visit(operation[0]));
+    AddLine("UP2H.F {}.zw, {};", temporary, Visit(operation[1]));
+    AddLine("MOV.U {}.x, {}.z;", temporary, temporary);
+    AddLine("PK2H.F {}.x, {};", temporary, temporary);
+    return fmt::format("{}.x", temporary);
+}
+
+std::string ARBDecompiler::HMergeH1(Operation operation) {
+    const std::string temporary = AllocVectorTemporary();
+    AddLine("UP2H.F {}.xy, {};", temporary, Visit(operation[0]));
+    AddLine("UP2H.F {}.zw, {};", temporary, Visit(operation[1]));
+    AddLine("MOV.U {}.y, {}.w;", temporary, temporary);
+    AddLine("PK2H.F {}.x, {};", temporary, temporary);
+    return fmt::format("{}.x", temporary);
+}
+
+std::string ARBDecompiler::HPack2(Operation operation) {
+    const std::string temporary = AllocVectorTemporary();
+    AddLine("MOV.U {}.x, {};", temporary, Visit(operation[0]));
+    AddLine("MOV.U {}.y, {};", temporary, Visit(operation[1]));
+    AddLine("PK2H.F {}.x, {};", temporary, temporary);
+    return fmt::format("{}.x", temporary);
+}
+
+std::string ARBDecompiler::LogicalAssign(Operation operation) {
+    const Node& dest = operation[0];
+    const Node& src = operation[1];
+
+    std::string target;
+
+    if (const auto pred = std::get_if<PredicateNode>(&*dest)) {
+        ASSERT_MSG(!pred->IsNegated(), "Negating logical assignment");
+
+        const Tegra::Shader::Pred index = pred->GetIndex();
+        switch (index) {
+        case Tegra::Shader::Pred::NeverExecute:
+        case Tegra::Shader::Pred::UnusedIndex:
+            // Writing to these predicates is a no-op
+            return {};
+        }
+        target = fmt::format("P{}.x", static_cast<u64>(index));
+    } else if (const auto internal_flag = std::get_if<InternalFlagNode>(&*dest)) {
+        const std::size_t index = static_cast<std::size_t>(internal_flag->GetFlag());
+        target = fmt::format("{}.x", INTERNAL_FLAG_NAMES[index]);
+    } else {
+        UNREACHABLE();
+        ResetTemporaries();
+        return {};
+    }
+
+    AddLine("MOV.U {}, {};", target, Visit(src));
+    ResetTemporaries();
+    return {};
+}
+
+std::string ARBDecompiler::LogicalPick2(Operation operation) {
+    const std::string temporary = AllocTemporary();
+    const u32 index = std::get<ImmediateNode>(*operation[1]).GetValue();
+    AddLine("MOV.U {}, {}.{};", temporary, Visit(operation[0]), Swizzle(index));
+    return temporary;
+}
+
+std::string ARBDecompiler::LogicalAnd2(Operation operation) {
+    const std::string temporary = AllocTemporary();
+    const std::string op = Visit(operation[0]);
+    AddLine("AND.U {}, {}.x, {}.y;", temporary, op, op);
+    return temporary;
+}
+
+std::string ARBDecompiler::FloatOrdered(Operation operation) {
+    const std::string temporary = AllocTemporary();
+    AddLine("MOVC.F32 RC.x, {};", Visit(operation[0]));
+    AddLine("MOVC.F32 RC.y, {};", Visit(operation[1]));
+    AddLine("MOV.S {}, -1;", temporary);
+    AddLine("MOV.S {} (NAN.x), 0;", temporary);
+    AddLine("MOV.S {} (NAN.y), 0;", temporary);
+    return temporary;
+}
+
+std::string ARBDecompiler::FloatUnordered(Operation operation) {
+    const std::string temporary = AllocTemporary();
+    AddLine("MOVC.F32 RC.x, {};", Visit(operation[0]));
+    AddLine("MOVC.F32 RC.y, {};", Visit(operation[1]));
+    AddLine("MOV.S {}, 0;", temporary);
+    AddLine("MOV.S {} (NAN.x), -1;", temporary);
+    AddLine("MOV.S {} (NAN.y), -1;", temporary);
+    return temporary;
+}
+
+std::string ARBDecompiler::LogicalAddCarry(Operation operation) {
+    const std::string temporary = AllocTemporary();
+    AddLine("ADDC.U RC, {}, {};", Visit(operation[0]), Visit(operation[1]));
+    AddLine("MOV.S {}, 0;", temporary);
+    AddLine("IF CF.x;");
+    AddLine("MOV.S {}, -1;", temporary);
+    AddLine("ENDIF;");
+    return temporary;
+}
+
+std::string ARBDecompiler::Texture(Operation operation) {
+    const auto& meta = std::get<MetaTexture>(operation.GetMeta());
+    const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index;
+    const auto [temporary, swizzle] = BuildCoords(operation);
+
+    std::string_view opcode = "TEX";
+    std::string extra;
+    if (meta.bias) {
+        ASSERT(!meta.lod);
+        opcode = "TXB";
+
+        if (swizzle < 4) {
+            AddLine("MOV.F {}.w, {};", temporary, Visit(meta.bias));
+        } else {
+            const std::string bias = AllocTemporary();
+            AddLine("MOV.F {}, {};", bias, Visit(meta.bias));
+            extra = fmt::format(" {},", bias);
+        }
+    }
+    if (meta.lod) {
+        ASSERT(!meta.bias);
+        opcode = "TXL";
+
+        if (swizzle < 4) {
+            AddLine("MOV.F {}.w, {};", temporary, Visit(meta.lod));
+        } else {
+            const std::string lod = AllocTemporary();
+            AddLine("MOV.F {}, {};", lod, Visit(meta.lod));
+            extra = fmt::format(" {},", lod);
+        }
+    }
+
+    AddLine("{}.F {}, {},{} texture[{}], {}{};", opcode, temporary, temporary, extra, sampler_id,
+            TextureType(meta), BuildAoffi(operation));
+    AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element));
+    return fmt::format("{}.x", temporary);
+}
+
+std::string ARBDecompiler::TextureGather(Operation operation) {
+    const auto& meta = std::get<MetaTexture>(operation.GetMeta());
+    const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index;
+    const auto [temporary, swizzle] = BuildCoords(operation);
+
+    std::string comp;
+    if (!meta.sampler.is_shadow) {
+        const auto& immediate = std::get<ImmediateNode>(*meta.component);
+        comp = fmt::format(".{}", Swizzle(immediate.GetValue()));
+    }
+
+    AddLine("TXG.F {}, {}, texture[{}]{}, {}{};", temporary, temporary, sampler_id, comp,
+            TextureType(meta), BuildAoffi(operation));
+    AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element));
+    return fmt::format("{}.x", temporary);
+}
+
+std::string ARBDecompiler::TextureQueryDimensions(Operation operation) {
+    const auto& meta = std::get<MetaTexture>(operation.GetMeta());
+    const std::string temporary = AllocVectorTemporary();
+    const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index;
+
+    ASSERT(!meta.sampler.is_array);
+
+    const std::string lod = operation.GetOperandsCount() > 0 ? Visit(operation[0]) : "0";
+    AddLine("TXQ {}, {}, texture[{}], {};", temporary, lod, sampler_id, TextureType(meta));
+    AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element));
+    return fmt::format("{}.x", temporary);
+}
+
+std::string ARBDecompiler::TextureQueryLod(Operation operation) {
+    const auto& meta = std::get<MetaTexture>(operation.GetMeta());
+    const std::string temporary = AllocVectorTemporary();
+    const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index;
+
+    ASSERT(!meta.sampler.is_array);
+
+    const std::size_t count = operation.GetOperandsCount();
+    for (std::size_t i = 0; i < count; ++i) {
+        AddLine("MOV.F {}.{}, {};", temporary, Swizzle(i), Visit(operation[i]));
+    }
+    AddLine("LOD.F {}, {}, texture[{}], {};", temporary, temporary, sampler_id, TextureType(meta));
+    AddLine("MUL.F32 {}, {}, {{256, 256, 0, 0}};", temporary, temporary);
+    AddLine("TRUNC.S {}, {};", temporary, temporary);
+    AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element));
+    return fmt::format("{}.x", temporary);
+}
+
+std::string ARBDecompiler::TexelFetch(Operation operation) {
+    const auto& meta = std::get<MetaTexture>(operation.GetMeta());
+    const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index;
+    const auto [temporary, swizzle] = BuildCoords(operation);
+
+    if (!meta.sampler.is_buffer) {
+        ASSERT(swizzle < 4);
+        AddLine("MOV.F {}.w, {};", temporary, Visit(meta.lod));
+    }
+    AddLine("TXF.F {}, {}, texture[{}], {}{};", temporary, temporary, sampler_id, TextureType(meta),
+            BuildAoffi(operation));
+    AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element));
+    return fmt::format("{}.x", temporary);
+}
+
+std::string ARBDecompiler::TextureGradient(Operation operation) {
+    const auto& meta = std::get<MetaTexture>(operation.GetMeta());
+    const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index;
+    const std::string ddx = AllocVectorTemporary();
+    const std::string ddy = AllocVectorTemporary();
+    const std::string coord = BuildCoords(operation).first;
+
+    const std::size_t num_components = meta.derivates.size() / 2;
+    for (std::size_t index = 0; index < num_components; ++index) {
+        const char swizzle = Swizzle(index);
+        AddLine("MOV.F {}.{}, {};", ddx, swizzle, Visit(meta.derivates[index * 2]));
+        AddLine("MOV.F {}.{}, {};", ddy, swizzle, Visit(meta.derivates[index * 2 + 1]));
+    }
+
+    const std::string_view result = coord;
+    AddLine("TXD.F {}, {}, {}, {}, texture[{}], {}{};", result, coord, ddx, ddy, sampler_id,
+            TextureType(meta), BuildAoffi(operation));
+    AddLine("MOV.F {}.x, {}.{};", result, result, Swizzle(meta.element));
+    return fmt::format("{}.x", result);
+}
+
+std::string ARBDecompiler::ImageLoad(Operation operation) {
+    const auto& meta = std::get<MetaImage>(operation.GetMeta());
+    const u32 image_id = device.GetBaseBindings(stage).image + meta.image.index;
+    const std::size_t count = operation.GetOperandsCount();
+    const std::string_view type = ImageType(meta.image.type);
+
+    const std::string temporary = AllocVectorTemporary();
+    for (std::size_t i = 0; i < count; ++i) {
+        AddLine("MOV.S {}.{}, {};", temporary, Swizzle(i), Visit(operation[i]));
+    }
+    AddLine("LOADIM.F {}, {}, image[{}], {};", temporary, temporary, image_id, type);
+    AddLine("MOV.F {}.x, {}.{};", temporary, temporary, Swizzle(meta.element));
+    return fmt::format("{}.x", temporary);
+}
+
+std::string ARBDecompiler::ImageStore(Operation operation) {
+    const auto& meta = std::get<MetaImage>(operation.GetMeta());
+    const u32 image_id = device.GetBaseBindings(stage).image + meta.image.index;
+    const std::size_t num_coords = operation.GetOperandsCount();
+    const std::size_t num_values = meta.values.size();
+    const std::string_view type = ImageType(meta.image.type);
+
+    const std::string coord = AllocVectorTemporary();
+    const std::string value = AllocVectorTemporary();
+    for (std::size_t i = 0; i < num_coords; ++i) {
+        AddLine("MOV.S {}.{}, {};", coord, Swizzle(i), Visit(operation[i]));
+    }
+    for (std::size_t i = 0; i < num_values; ++i) {
+        AddLine("MOV.F {}.{}, {};", value, Swizzle(i), Visit(meta.values[i]));
+    }
+    AddLine("STOREIM.F image[{}], {}, {}, {};", image_id, value, coord, type);
+    return {};
+}
+
+std::string ARBDecompiler::Branch(Operation operation) {
+    const auto target = std::get<ImmediateNode>(*operation[0]);
+    AddLine("MOV.U PC.x, {};", target.GetValue());
+    AddLine("CONT;");
+    return {};
+}
+
+std::string ARBDecompiler::BranchIndirect(Operation operation) {
+    AddLine("MOV.U PC.x, {};", Visit(operation[0]));
+    AddLine("CONT;");
+    return {};
+}
+
+std::string ARBDecompiler::PushFlowStack(Operation operation) {
+    const auto stack = std::get<MetaStackClass>(operation.GetMeta());
+    const u32 target = std::get<ImmediateNode>(*operation[0]).GetValue();
+    const std::string_view stack_name = StackName(stack);
+    AddLine("MOV.U {}[{}_TOP.x].x, {};", stack_name, stack_name, target);
+    AddLine("ADD.S {}_TOP.x, {}_TOP.x, 1;", stack_name, stack_name);
+    return {};
+}
+
+std::string ARBDecompiler::PopFlowStack(Operation operation) {
+    const auto stack = std::get<MetaStackClass>(operation.GetMeta());
+    const std::string_view stack_name = StackName(stack);
+    AddLine("SUB.S {}_TOP.x, {}_TOP.x, 1;", stack_name, stack_name);
+    AddLine("MOV.U PC.x, {}[{}_TOP.x].x;", stack_name, stack_name);
+    AddLine("CONT;");
+    return {};
+}
+
+std::string ARBDecompiler::Exit(Operation) {
+    Exit();
+    return {};
+}
+
+std::string ARBDecompiler::Discard(Operation) {
+    AddLine("KIL TR;");
+    return {};
+}
+
+std::string ARBDecompiler::EmitVertex(Operation) {
+    AddLine("EMIT;");
+    return {};
+}
+
+std::string ARBDecompiler::EndPrimitive(Operation) {
+    AddLine("ENDPRIM;");
+    return {};
+}
+
+std::string ARBDecompiler::InvocationId(Operation) {
+    return "primitive.invocation";
+}
+
+std::string ARBDecompiler::YNegate(Operation) {
+    LOG_WARNING(Render_OpenGL, "(STUBBED)");
+    const std::string temporary = AllocTemporary();
+    AddLine("MOV.F {}, 1;", temporary);
+    return temporary;
+}
+
+std::string ARBDecompiler::ThreadId(Operation) {
+    return fmt::format("{}.threadid", StageInputName(stage));
+}
+
+std::string ARBDecompiler::ShuffleIndexed(Operation operation) {
+    if (!device.HasWarpIntrinsics()) {
+        LOG_ERROR(Render_OpenGL,
+                  "NV_shader_thread_shuffle is missing. Kepler or better is required.");
+        return Visit(operation[0]);
+    }
+    const std::string temporary = AllocVectorTemporary();
+    AddLine("SHFIDX.U {}, {}, {}, {{31, 0, 0, 0}};", temporary, Visit(operation[0]),
+            Visit(operation[1]));
+    AddLine("MOV.U {}.x, {}.y;", temporary, temporary);
+    return fmt::format("{}.x", temporary);
+}
+
+std::string ARBDecompiler::Barrier(Operation) {
+    if (!ir.IsDecompiled()) {
+        LOG_ERROR(Render_OpenGL, "BAR used but shader is not decompiled");
+        return {};
+    }
+    AddLine("BAR;");
+    return {};
+}
+
+std::string ARBDecompiler::MemoryBarrierGroup(Operation) {
+    AddLine("MEMBAR.CTA;");
+    return {};
+}
+
+std::string ARBDecompiler::MemoryBarrierGlobal(Operation) {
+    AddLine("MEMBAR;");
+    return {};
+}
+
+} // Anonymous namespace
+
+std::string DecompileAssemblyShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
+                                    const VideoCommon::Shader::Registry& registry,
+                                    Tegra::Engines::ShaderType stage, std::string_view identifier) {
+    return ARBDecompiler(device, ir, registry, stage, identifier).Code();
+}
+
+} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_arb_decompiler.h b/src/video_core/renderer_opengl/gl_arb_decompiler.h
new file mode 100644
index 000000000..6afc87220
--- /dev/null
+++ b/src/video_core/renderer_opengl/gl_arb_decompiler.h
@@ -0,0 +1,29 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <string>
+#include <string_view>
+
+#include "common/common_types.h"
+
+namespace Tegra::Engines {
+enum class ShaderType : u32;
+}
+
+namespace VideoCommon::Shader {
+class ShaderIR;
+class Registry;
+} // namespace VideoCommon::Shader
+
+namespace OpenGL {
+
+class Device;
+
+std::string DecompileAssemblyShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
+                                    const VideoCommon::Shader::Registry& registry,
+                                    Tegra::Engines::ShaderType stage, std::string_view identifier);
+
+} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
index 9964ea894..ad0577a4f 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
@@ -22,13 +22,12 @@ using Maxwell = Tegra::Engines::Maxwell3D::Regs;
 
 MICROPROFILE_DEFINE(OpenGL_Buffer_Download, "OpenGL", "Buffer Download", MP_RGB(192, 192, 128));
 
-CachedBufferBlock::CachedBufferBlock(VAddr cpu_addr, const std::size_t size)
-    : VideoCommon::BufferBlock{cpu_addr, size} {
+Buffer::Buffer(VAddr cpu_addr, const std::size_t size) : VideoCommon::BufferBlock{cpu_addr, size} {
     gl_buffer.Create();
     glNamedBufferData(gl_buffer.handle, static_cast<GLsizeiptr>(size), nullptr, GL_DYNAMIC_DRAW);
 }
 
-CachedBufferBlock::~CachedBufferBlock() = default;
+Buffer::~Buffer() = default;
 
 OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,
                                const Device& device, std::size_t stream_size)
@@ -48,12 +47,8 @@ OGLBufferCache::~OGLBufferCache() {
     glDeleteBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs));
 }
 
-Buffer OGLBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
-    return std::make_shared<CachedBufferBlock>(cpu_addr, size);
-}
-
-GLuint OGLBufferCache::ToHandle(const Buffer& buffer) {
-    return buffer->GetHandle();
+std::shared_ptr<Buffer> OGLBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
+    return std::make_shared<Buffer>(cpu_addr, size);
 }
 
 GLuint OGLBufferCache::GetEmptyBuffer(std::size_t) {
@@ -62,7 +57,7 @@ GLuint OGLBufferCache::GetEmptyBuffer(std::size_t) {
 
 void OGLBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
                                      const u8* data) {
-    glNamedBufferSubData(buffer->GetHandle(), static_cast<GLintptr>(offset),
+    glNamedBufferSubData(buffer.Handle(), static_cast<GLintptr>(offset),
                          static_cast<GLsizeiptr>(size), data);
 }
 
@@ -70,20 +65,20 @@ void OGLBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset,
                                        u8* data) {
     MICROPROFILE_SCOPE(OpenGL_Buffer_Download);
     glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
-    glGetNamedBufferSubData(buffer->GetHandle(), static_cast<GLintptr>(offset),
+    glGetNamedBufferSubData(buffer.Handle(), static_cast<GLintptr>(offset),
                             static_cast<GLsizeiptr>(size), data);
 }
 
 void OGLBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
                                std::size_t dst_offset, std::size_t size) {
-    glCopyNamedBufferSubData(src->GetHandle(), dst->GetHandle(), static_cast<GLintptr>(src_offset),
+    glCopyNamedBufferSubData(src.Handle(), dst.Handle(), static_cast<GLintptr>(src_offset),
                              static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(size));
 }
 
 OGLBufferCache::BufferInfo OGLBufferCache::ConstBufferUpload(const void* raw_pointer,
                                                              std::size_t size) {
     DEBUG_ASSERT(cbuf_cursor < std::size(cbufs));
-    const GLuint& cbuf = cbufs[cbuf_cursor++];
+    const GLuint cbuf = cbufs[cbuf_cursor++];
     glNamedBufferSubData(cbuf, 0, static_cast<GLsizeiptr>(size), raw_pointer);
     return {cbuf, 0};
 }
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h
index 679b9b1d7..a49aaf9c4 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.h
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.h
@@ -23,17 +23,12 @@ class Device;
 class OGLStreamBuffer;
 class RasterizerOpenGL;
 
-class CachedBufferBlock;
-
-using Buffer = std::shared_ptr<CachedBufferBlock>;
-using GenericBufferCache = VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>;
-
-class CachedBufferBlock : public VideoCommon::BufferBlock {
+class Buffer : public VideoCommon::BufferBlock {
 public:
-    explicit CachedBufferBlock(VAddr cpu_addr, const std::size_t size);
-    ~CachedBufferBlock();
+    explicit Buffer(VAddr cpu_addr, const std::size_t size);
+    ~Buffer();
 
-    GLuint GetHandle() const {
+    GLuint Handle() const {
         return gl_buffer.handle;
     }
 
@@ -41,6 +36,7 @@ private:
     OGLBuffer gl_buffer;
 };
 
+using GenericBufferCache = VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>;
 class OGLBufferCache final : public GenericBufferCache {
 public:
     explicit OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,
@@ -54,9 +50,7 @@ public:
     }
 
 protected:
-    Buffer CreateBlock(VAddr cpu_addr, std::size_t size) override;
-
-    GLuint ToHandle(const Buffer& buffer) override;
+    std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) override;
 
     void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
                          const u8* data) override;
diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp
index 890fc6c63..b31d604e4 100644
--- a/src/video_core/renderer_opengl/gl_device.cpp
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@@ -123,16 +123,24 @@ std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindin
     u32 num_images = GetInteger<u32>(GL_MAX_IMAGE_UNITS);
     u32 base_images = 0;
 
-    // Reserve more image bindings on fragment and vertex stages.
+    // GL_MAX_IMAGE_UNITS is guaranteed by the spec to have a minimum value of 8.
+    // Due to the limitation of GL_MAX_IMAGE_UNITS, reserve at least 4 image bindings on the
+    // fragment stage, and at least 1 for the rest of the stages.
+    // So far games are observed to use 1 image binding on vertex and 4 on fragment stages.
+
+    // Reserve at least 4 image bindings on the fragment stage.
     bindings[4].image =
-        Extract(base_images, num_images, num_images / NumStages + 2, LimitImages[4]);
-    bindings[0].image =
-        Extract(base_images, num_images, num_images / NumStages + 1, LimitImages[0]);
+        Extract(base_images, num_images, std::max(4U, num_images / NumStages), LimitImages[4]);
+
+    // This is guaranteed to be at least 1.
+    const u32 total_extracted_images = num_images / (NumStages - 1);
 
     // Reserve the other image bindings.
-    const u32 total_extracted_images = num_images / (NumStages - 2);
-    for (std::size_t i = 2; i < NumStages; ++i) {
+    for (std::size_t i = 0; i < NumStages; ++i) {
         const std::size_t stage = stage_swizzle[i];
+        if (stage == 4) {
+            continue;
+        }
         bindings[stage].image =
             Extract(base_images, num_images, total_extracted_images, LimitImages[stage]);
     }
@@ -213,6 +221,7 @@ Device::Device()
     has_component_indexing_bug = is_amd;
     has_precise_bug = TestPreciseBug();
     has_fast_buffer_sub_data = is_nvidia && !disable_fast_buffer_sub_data;
+    has_nv_viewport_array2 = GLAD_GL_NV_viewport_array2;
     use_assembly_shaders = Settings::values.use_assembly_shaders && GLAD_GL_NV_gpu_program5 &&
                            GLAD_GL_NV_compute_program5 && GLAD_GL_NV_transform_feedback &&
                            GLAD_GL_NV_transform_feedback2;
diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h
index 98cca0254..145347943 100644
--- a/src/video_core/renderer_opengl/gl_device.h
+++ b/src/video_core/renderer_opengl/gl_device.h
@@ -88,6 +88,10 @@ public:
         return has_fast_buffer_sub_data;
     }
 
+    bool HasNvViewportArray2() const {
+        return has_nv_viewport_array2;
+    }
+
     bool UseAssemblyShaders() const {
         return use_assembly_shaders;
     }
@@ -111,6 +115,7 @@ private:
     bool has_component_indexing_bug{};
     bool has_precise_bug{};
     bool has_fast_buffer_sub_data{};
+    bool has_nv_viewport_array2{};
     bool use_assembly_shaders{};
 };
 
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index a48cee1e5..2d6c11320 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -66,10 +66,22 @@ constexpr std::size_t NumSupportedVertexAttributes = 16;
 template <typename Engine, typename Entry>
 Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry,
                                                ShaderType shader_type, std::size_t index = 0) {
+    if constexpr (std::is_same_v<Entry, SamplerEntry>) {
+        if (entry.is_separated) {
+            const u32 buffer_1 = entry.buffer;
+            const u32 buffer_2 = entry.secondary_buffer;
+            const u32 offset_1 = entry.offset;
+            const u32 offset_2 = entry.secondary_offset;
+            const u32 handle_1 = engine.AccessConstBuffer32(shader_type, buffer_1, offset_1);
+            const u32 handle_2 = engine.AccessConstBuffer32(shader_type, buffer_2, offset_2);
+            return engine.GetTextureInfo(handle_1 | handle_2);
+        }
+    }
     if (entry.is_bindless) {
-        const auto tex_handle = engine.AccessConstBuffer32(shader_type, entry.buffer, entry.offset);
-        return engine.GetTextureInfo(tex_handle);
+        const u32 handle = engine.AccessConstBuffer32(shader_type, entry.buffer, entry.offset);
+        return engine.GetTextureInfo(handle);
     }
+
     const auto& gpu_profile = engine.AccessGuestDriverProfile();
     const u32 offset = entry.offset + static_cast<u32>(index * gpu_profile.GetTextureHandlerSize());
     if constexpr (std::is_same_v<Engine, Tegra::Engines::Maxwell3D>) {
@@ -605,7 +617,16 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
                    (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment());
 
     // Prepare the vertex array.
-    buffer_cache.Map(buffer_size);
+    const bool invalidated = buffer_cache.Map(buffer_size);
+
+    if (invalidated) {
+        // When the stream buffer has been invalidated, we have to consider vertex buffers as dirty
+        auto& dirty = gpu.dirty.flags;
+        dirty[Dirty::VertexBuffers] = true;
+        for (int index = Dirty::VertexBuffer0; index <= Dirty::VertexBuffer31; ++index) {
+            dirty[index] = true;
+        }
+    }
 
     // Prepare vertex array format.
     SetupVertexFormat();
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index c28486b1d..46e780a06 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -20,6 +20,7 @@
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/engines/shader_type.h"
 #include "video_core/memory_manager.h"
+#include "video_core/renderer_opengl/gl_arb_decompiler.h"
 #include "video_core/renderer_opengl/gl_rasterizer.h"
 #include "video_core/renderer_opengl/gl_shader_cache.h"
 #include "video_core/renderer_opengl/gl_shader_decompiler.h"
@@ -148,7 +149,8 @@ ProgramSharedPtr BuildShader(const Device& device, ShaderType shader_type, u64 u
     auto program = std::make_shared<ProgramHandle>();
 
     if (device.UseAssemblyShaders()) {
-        const std::string arb = "Not implemented";
+        const std::string arb =
+            DecompileAssemblyShader(device, ir, registry, shader_type, shader_id);
 
         GLuint& arb_prog = program->assembly_program.handle;
 
diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
index 9e95a122b..653c3f2f9 100644
--- a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
@@ -29,6 +29,8 @@ using VideoCommon::Shader::KeyMap;
 
 namespace {
 
+using VideoCommon::Shader::SeparateSamplerKey;
+
 using ShaderCacheVersionHash = std::array<u8, 64>;
 
 struct ConstBufferKey {
@@ -37,18 +39,26 @@ struct ConstBufferKey {
     u32 value = 0;
 };
 
-struct BoundSamplerKey {
+struct BoundSamplerEntry {
     u32 offset = 0;
     Tegra::Engines::SamplerDescriptor sampler;
 };
 
-struct BindlessSamplerKey {
+struct SeparateSamplerEntry {
+    u32 cbuf1 = 0;
+    u32 cbuf2 = 0;
+    u32 offset1 = 0;
+    u32 offset2 = 0;
+    Tegra::Engines::SamplerDescriptor sampler;
+};
+
+struct BindlessSamplerEntry {
     u32 cbuf = 0;
     u32 offset = 0;
     Tegra::Engines::SamplerDescriptor sampler;
 };
 
-constexpr u32 NativeVersion = 20;
+constexpr u32 NativeVersion = 21;
 
 ShaderCacheVersionHash GetShaderCacheVersionHash() {
     ShaderCacheVersionHash hash{};
@@ -87,12 +97,14 @@ bool ShaderDiskCacheEntry::Load(FileUtil::IOFile& file) {
     u32 texture_handler_size_value;
     u32 num_keys;
     u32 num_bound_samplers;
+    u32 num_separate_samplers;
     u32 num_bindless_samplers;
     if (file.ReadArray(&unique_identifier, 1) != 1 || file.ReadArray(&bound_buffer, 1) != 1 ||
         file.ReadArray(&is_texture_handler_size_known, 1) != 1 ||
         file.ReadArray(&texture_handler_size_value, 1) != 1 ||
         file.ReadArray(&graphics_info, 1) != 1 || file.ReadArray(&compute_info, 1) != 1 ||
         file.ReadArray(&num_keys, 1) != 1 || file.ReadArray(&num_bound_samplers, 1) != 1 ||
+        file.ReadArray(&num_separate_samplers, 1) != 1 ||
         file.ReadArray(&num_bindless_samplers, 1) != 1) {
         return false;
     }
@@ -101,23 +113,32 @@ bool ShaderDiskCacheEntry::Load(FileUtil::IOFile& file) {
     }
 
     std::vector<ConstBufferKey> flat_keys(num_keys);
-    std::vector<BoundSamplerKey> flat_bound_samplers(num_bound_samplers);
-    std::vector<BindlessSamplerKey> flat_bindless_samplers(num_bindless_samplers);
+    std::vector<BoundSamplerEntry> flat_bound_samplers(num_bound_samplers);
+    std::vector<SeparateSamplerEntry> flat_separate_samplers(num_separate_samplers);
+    std::vector<BindlessSamplerEntry> flat_bindless_samplers(num_bindless_samplers);
     if (file.ReadArray(flat_keys.data(), flat_keys.size()) != flat_keys.size() ||
         file.ReadArray(flat_bound_samplers.data(), flat_bound_samplers.size()) !=
             flat_bound_samplers.size() ||
+        file.ReadArray(flat_separate_samplers.data(), flat_separate_samplers.size()) !=
+            flat_separate_samplers.size() ||
         file.ReadArray(flat_bindless_samplers.data(), flat_bindless_samplers.size()) !=
             flat_bindless_samplers.size()) {
         return false;
     }
-    for (const auto& key : flat_keys) {
-        keys.insert({{key.cbuf, key.offset}, key.value});
+    for (const auto& entry : flat_keys) {
+        keys.insert({{entry.cbuf, entry.offset}, entry.value});
     }
-    for (const auto& key : flat_bound_samplers) {
-        bound_samplers.emplace(key.offset, key.sampler);
+    for (const auto& entry : flat_bound_samplers) {
+        bound_samplers.emplace(entry.offset, entry.sampler);
     }
-    for (const auto& key : flat_bindless_samplers) {
-        bindless_samplers.insert({{key.cbuf, key.offset}, key.sampler});
+    for (const auto& entry : flat_separate_samplers) {
+        SeparateSamplerKey key;
+        key.buffers = {entry.cbuf1, entry.cbuf2};
+        key.offsets = {entry.offset1, entry.offset2};
+        separate_samplers.emplace(key, entry.sampler);
+    }
+    for (const auto& entry : flat_bindless_samplers) {
+        bindless_samplers.insert({{entry.cbuf, entry.offset}, entry.sampler});
     }
 
     return true;
@@ -142,6 +163,7 @@ bool ShaderDiskCacheEntry::Save(FileUtil::IOFile& file) const {
         file.WriteObject(graphics_info) != 1 || file.WriteObject(compute_info) != 1 ||
         file.WriteObject(static_cast<u32>(keys.size())) != 1 ||
         file.WriteObject(static_cast<u32>(bound_samplers.size())) != 1 ||
+        file.WriteObject(static_cast<u32>(separate_samplers.size())) != 1 ||
         file.WriteObject(static_cast<u32>(bindless_samplers.size())) != 1) {
         return false;
     }
@@ -152,22 +174,34 @@ bool ShaderDiskCacheEntry::Save(FileUtil::IOFile& file) const {
         flat_keys.push_back(ConstBufferKey{address.first, address.second, value});
     }
 
-    std::vector<BoundSamplerKey> flat_bound_samplers;
+    std::vector<BoundSamplerEntry> flat_bound_samplers;
     flat_bound_samplers.reserve(bound_samplers.size());
     for (const auto& [address, sampler] : bound_samplers) {
-        flat_bound_samplers.push_back(BoundSamplerKey{address, sampler});
+        flat_bound_samplers.push_back(BoundSamplerEntry{address, sampler});
+    }
+
+    std::vector<SeparateSamplerEntry> flat_separate_samplers;
+    flat_separate_samplers.reserve(separate_samplers.size());
+    for (const auto& [key, sampler] : separate_samplers) {
+        SeparateSamplerEntry entry;
+        std::tie(entry.cbuf1, entry.cbuf2) = key.buffers;
+        std::tie(entry.offset1, entry.offset2) = key.offsets;
+        entry.sampler = sampler;
+        flat_separate_samplers.push_back(entry);
     }
 
-    std::vector<BindlessSamplerKey> flat_bindless_samplers;
+    std::vector<BindlessSamplerEntry> flat_bindless_samplers;
     flat_bindless_samplers.reserve(bindless_samplers.size());
     for (const auto& [address, sampler] : bindless_samplers) {
         flat_bindless_samplers.push_back(
-            BindlessSamplerKey{address.first, address.second, sampler});
+            BindlessSamplerEntry{address.first, address.second, sampler});
     }
 
     return file.WriteArray(flat_keys.data(), flat_keys.size()) == flat_keys.size() &&
            file.WriteArray(flat_bound_samplers.data(), flat_bound_samplers.size()) ==
                flat_bound_samplers.size() &&
+           file.WriteArray(flat_separate_samplers.data(), flat_separate_samplers.size()) ==
+               flat_separate_samplers.size() &&
            file.WriteArray(flat_bindless_samplers.data(), flat_bindless_samplers.size()) ==
                flat_bindless_samplers.size();
 }
diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.h b/src/video_core/renderer_opengl/gl_shader_disk_cache.h
index d5be52e40..a79cef0e9 100644
--- a/src/video_core/renderer_opengl/gl_shader_disk_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.h
@@ -57,6 +57,7 @@ struct ShaderDiskCacheEntry {
     VideoCommon::Shader::ComputeInfo compute_info;
     VideoCommon::Shader::KeyMap keys;
     VideoCommon::Shader::BoundSamplerMap bound_samplers;
+    VideoCommon::Shader::SeparateSamplerMap separate_samplers;
     VideoCommon::Shader::BindlessSamplerMap bindless_samplers;
 };
 
diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.cpp b/src/video_core/renderer_opengl/gl_stream_buffer.cpp
index 6ec328c53..932a2f69e 100644
--- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp
@@ -49,14 +49,6 @@ OGLStreamBuffer::~OGLStreamBuffer() {
     gl_buffer.Release();
 }
 
-GLuint OGLStreamBuffer::GetHandle() const {
-    return gl_buffer.handle;
-}
-
-GLsizeiptr OGLStreamBuffer::GetSize() const {
-    return buffer_size;
-}
-
 std::tuple<u8*, GLintptr, bool> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr alignment) {
     ASSERT(size <= buffer_size);
     ASSERT(alignment <= buffer_size);
diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.h b/src/video_core/renderer_opengl/gl_stream_buffer.h
index f8383cbd4..866da3594 100644
--- a/src/video_core/renderer_opengl/gl_stream_buffer.h
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.h
@@ -17,9 +17,6 @@ public:
                              bool use_persistent = true);
     ~OGLStreamBuffer();
 
-    GLuint GetHandle() const;
-    GLsizeiptr GetSize() const;
-
     /*
      * Allocates a linear chunk of memory in the GPU buffer with at least "size" bytes
      * and the optional alignment requirement.
@@ -32,6 +29,14 @@ public:
 
     void Unmap(GLsizeiptr size);
 
+    GLuint Handle() const {
+        return gl_buffer.handle;
+    }
+
+    GLsizeiptr Size() const {
+        return buffer_size;
+    }
+
 private:
     OGLBuffer gl_buffer;
 
diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h
index 994ae98eb..35e329240 100644
--- a/src/video_core/renderer_opengl/maxwell_to_gl.h
+++ b/src/video_core/renderer_opengl/maxwell_to_gl.h
@@ -46,10 +46,8 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
             return GL_UNSIGNED_INT;
         case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
             return GL_UNSIGNED_INT_2_10_10_10_REV;
-        default:
-            LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
-            return {};
         }
+        break;
     case Maxwell::VertexAttribute::Type::SignedInt:
     case Maxwell::VertexAttribute::Type::SignedNorm:
         switch (attrib.size) {
@@ -70,10 +68,8 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
             return GL_INT;
         case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
             return GL_INT_2_10_10_10_REV;
-        default:
-            LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
-            return {};
         }
+        break;
     case Maxwell::VertexAttribute::Type::Float:
         switch (attrib.size) {
         case Maxwell::VertexAttribute::Size::Size_16:
@@ -86,10 +82,8 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
         case Maxwell::VertexAttribute::Size::Size_32_32_32:
         case Maxwell::VertexAttribute::Size::Size_32_32_32_32:
             return GL_FLOAT;
-        default:
-            LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
-            return {};
         }
+        break;
     case Maxwell::VertexAttribute::Type::UnsignedScaled:
         switch (attrib.size) {
         case Maxwell::VertexAttribute::Size::Size_8:
@@ -102,10 +96,8 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
         case Maxwell::VertexAttribute::Size::Size_16_16_16:
         case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
             return GL_UNSIGNED_SHORT;
-        default:
-            LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
-            return {};
         }
+        break;
     case Maxwell::VertexAttribute::Type::SignedScaled:
         switch (attrib.size) {
         case Maxwell::VertexAttribute::Size::Size_8:
@@ -118,14 +110,12 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
         case Maxwell::VertexAttribute::Size::Size_16_16_16:
         case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
             return GL_SHORT;
-        default:
-            LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
-            return {};
         }
-    default:
-        LOG_ERROR(Render_OpenGL, "Unimplemented vertex type={}", attrib.TypeString());
-        return {};
+        break;
     }
+    UNIMPLEMENTED_MSG("Unimplemented vertex type={} and size={}", attrib.TypeString(),
+                      attrib.SizeString());
+    return {};
 }
 
 inline GLenum IndexFormat(Maxwell::IndexFormat index_format) {
@@ -137,8 +127,7 @@ inline GLenum IndexFormat(Maxwell::IndexFormat index_format) {
     case Maxwell::IndexFormat::UnsignedInt:
         return GL_UNSIGNED_INT;
     }
-    LOG_CRITICAL(Render_OpenGL, "Unimplemented index_format={}", static_cast<u32>(index_format));
-    UNREACHABLE();
+    UNREACHABLE_MSG("Invalid index_format={}", static_cast<u32>(index_format));
     return {};
 }
 
@@ -180,33 +169,32 @@ inline GLenum PrimitiveTopology(Maxwell::PrimitiveTopology topology) {
 }
 
 inline GLenum TextureFilterMode(Tegra::Texture::TextureFilter filter_mode,
-                                Tegra::Texture::TextureMipmapFilter mip_filter_mode) {
+                                Tegra::Texture::TextureMipmapFilter mipmap_filter_mode) {
     switch (filter_mode) {
-    case Tegra::Texture::TextureFilter::Linear: {
-        switch (mip_filter_mode) {
+    case Tegra::Texture::TextureFilter::Nearest:
+        switch (mipmap_filter_mode) {
         case Tegra::Texture::TextureMipmapFilter::None:
-            return GL_LINEAR;
+            return GL_NEAREST;
         case Tegra::Texture::TextureMipmapFilter::Nearest:
-            return GL_LINEAR_MIPMAP_NEAREST;
+            return GL_NEAREST_MIPMAP_NEAREST;
         case Tegra::Texture::TextureMipmapFilter::Linear:
-            return GL_LINEAR_MIPMAP_LINEAR;
+            return GL_NEAREST_MIPMAP_LINEAR;
         }
         break;
-    }
-    case Tegra::Texture::TextureFilter::Nearest: {
-        switch (mip_filter_mode) {
+    case Tegra::Texture::TextureFilter::Linear:
+        switch (mipmap_filter_mode) {
         case Tegra::Texture::TextureMipmapFilter::None:
-            return GL_NEAREST;
+            return GL_LINEAR;
         case Tegra::Texture::TextureMipmapFilter::Nearest:
-            return GL_NEAREST_MIPMAP_NEAREST;
+            return GL_LINEAR_MIPMAP_NEAREST;
         case Tegra::Texture::TextureMipmapFilter::Linear:
-            return GL_NEAREST_MIPMAP_LINEAR;
+            return GL_LINEAR_MIPMAP_LINEAR;
         }
         break;
     }
-    }
-    LOG_ERROR(Render_OpenGL, "Unimplemented texture filter mode={}", static_cast<u32>(filter_mode));
-    return GL_LINEAR;
+    UNREACHABLE_MSG("Invalid texture filter mode={} and mipmap filter mode={}",
+                    static_cast<u32>(filter_mode), static_cast<u32>(mipmap_filter_mode));
+    return GL_NEAREST;
 }
 
 inline GLenum WrapMode(Tegra::Texture::WrapMode wrap_mode) {
@@ -229,10 +217,9 @@ inline GLenum WrapMode(Tegra::Texture::WrapMode wrap_mode) {
         } else {
             return GL_MIRROR_CLAMP_TO_EDGE;
         }
-    default:
-        LOG_ERROR(Render_OpenGL, "Unimplemented texture wrap mode={}", static_cast<u32>(wrap_mode));
-        return GL_REPEAT;
     }
+    UNIMPLEMENTED_MSG("Unimplemented texture wrap mode={}", static_cast<u32>(wrap_mode));
+    return GL_REPEAT;
 }
 
 inline GLenum DepthCompareFunc(Tegra::Texture::DepthCompareFunc func) {
@@ -254,8 +241,7 @@ inline GLenum DepthCompareFunc(Tegra::Texture::DepthCompareFunc func) {
     case Tegra::Texture::DepthCompareFunc::Always:
         return GL_ALWAYS;
     }
-    LOG_ERROR(Render_OpenGL, "Unimplemented texture depth compare function ={}",
-              static_cast<u32>(func));
+    UNIMPLEMENTED_MSG("Unimplemented texture depth compare function={}", static_cast<u32>(func));
     return GL_GREATER;
 }
 
@@ -277,7 +263,7 @@ inline GLenum BlendEquation(Maxwell::Blend::Equation equation) {
     case Maxwell::Blend::Equation::MaxGL:
         return GL_MAX;
     }
-    LOG_ERROR(Render_OpenGL, "Unimplemented blend equation={}", static_cast<u32>(equation));
+    UNIMPLEMENTED_MSG("Unimplemented blend equation={}", static_cast<u32>(equation));
     return GL_FUNC_ADD;
 }
 
@@ -341,7 +327,7 @@ inline GLenum BlendFunc(Maxwell::Blend::Factor factor) {
     case Maxwell::Blend::Factor::OneMinusConstantAlphaGL:
         return GL_ONE_MINUS_CONSTANT_ALPHA;
     }
-    LOG_ERROR(Render_OpenGL, "Unimplemented blend factor={}", static_cast<u32>(factor));
+    UNIMPLEMENTED_MSG("Unimplemented blend factor={}", static_cast<u32>(factor));
     return GL_ZERO;
 }
 
@@ -361,7 +347,7 @@ inline GLenum SwizzleSource(Tegra::Texture::SwizzleSource source) {
     case Tegra::Texture::SwizzleSource::OneFloat:
         return GL_ONE;
     }
-    LOG_ERROR(Render_OpenGL, "Unimplemented swizzle source={}", static_cast<u32>(source));
+    UNIMPLEMENTED_MSG("Unimplemented swizzle source={}", static_cast<u32>(source));
     return GL_ZERO;
 }
 
@@ -392,7 +378,7 @@ inline GLenum ComparisonOp(Maxwell::ComparisonOp comparison) {
     case Maxwell::ComparisonOp::AlwaysOld:
         return GL_ALWAYS;
     }
-    LOG_ERROR(Render_OpenGL, "Unimplemented comparison op={}", static_cast<u32>(comparison));
+    UNIMPLEMENTED_MSG("Unimplemented comparison op={}", static_cast<u32>(comparison));
     return GL_ALWAYS;
 }
 
@@ -423,7 +409,7 @@ inline GLenum StencilOp(Maxwell::StencilOp stencil) {
     case Maxwell::StencilOp::DecrWrapOGL:
         return GL_DECR_WRAP;
     }
-    LOG_ERROR(Render_OpenGL, "Unimplemented stencil op={}", static_cast<u32>(stencil));
+    UNIMPLEMENTED_MSG("Unimplemented stencil op={}", static_cast<u32>(stencil));
     return GL_KEEP;
 }
 
@@ -434,7 +420,7 @@ inline GLenum FrontFace(Maxwell::FrontFace front_face) {
     case Maxwell::FrontFace::CounterClockWise:
         return GL_CCW;
     }
-    LOG_ERROR(Render_OpenGL, "Unimplemented front face cull={}", static_cast<u32>(front_face));
+    UNIMPLEMENTED_MSG("Unimplemented front face cull={}", static_cast<u32>(front_face));
     return GL_CCW;
 }
 
@@ -447,7 +433,7 @@ inline GLenum CullFace(Maxwell::CullFace cull_face) {
     case Maxwell::CullFace::FrontAndBack:
         return GL_FRONT_AND_BACK;
     }
-    LOG_ERROR(Render_OpenGL, "Unimplemented cull face={}", static_cast<u32>(cull_face));
+    UNIMPLEMENTED_MSG("Unimplemented cull face={}", static_cast<u32>(cull_face));
     return GL_BACK;
 }
 
@@ -486,7 +472,7 @@ inline GLenum LogicOp(Maxwell::LogicOperation operation) {
     case Maxwell::LogicOperation::Set:
         return GL_SET;
     }
-    LOG_ERROR(Render_OpenGL, "Unimplemented logic operation={}", static_cast<u32>(operation));
+    UNIMPLEMENTED_MSG("Unimplemented logic operation={}", static_cast<u32>(operation));
     return GL_COPY;
 }
 
diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
index 62e950d31..1f2b6734b 100644
--- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
+++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
@@ -21,29 +21,29 @@ namespace Sampler {
 
 VkFilter Filter(Tegra::Texture::TextureFilter filter) {
     switch (filter) {
-    case Tegra::Texture::TextureFilter::Linear:
-        return VK_FILTER_LINEAR;
     case Tegra::Texture::TextureFilter::Nearest:
         return VK_FILTER_NEAREST;
+    case Tegra::Texture::TextureFilter::Linear:
+        return VK_FILTER_LINEAR;
     }
-    UNIMPLEMENTED_MSG("Unimplemented sampler filter={}", static_cast<u32>(filter));
+    UNREACHABLE_MSG("Invalid sampler filter={}", static_cast<u32>(filter));
     return {};
 }
 
 VkSamplerMipmapMode MipmapMode(Tegra::Texture::TextureMipmapFilter mipmap_filter) {
     switch (mipmap_filter) {
     case Tegra::Texture::TextureMipmapFilter::None:
-        // TODO(Rodrigo): None seems to be mapped to OpenGL's mag and min filters without mipmapping
-        // (e.g. GL_NEAREST and GL_LINEAR). Vulkan doesn't have such a thing, find out if we have to
-        // use an image view with a single mipmap level to emulate this.
-        return VK_SAMPLER_MIPMAP_MODE_LINEAR;
-        ;
-    case Tegra::Texture::TextureMipmapFilter::Linear:
-        return VK_SAMPLER_MIPMAP_MODE_LINEAR;
+        // There are no Vulkan filter modes that directly correspond to OpenGL minification filters
+        // of GL_LINEAR or GL_NEAREST, but they can be emulated using
+        // VK_SAMPLER_MIPMAP_MODE_NEAREST, minLod = 0, and maxLod = 0.25, and using minFilter =
+        // VK_FILTER_LINEAR or minFilter = VK_FILTER_NEAREST, respectively.
+        return VK_SAMPLER_MIPMAP_MODE_NEAREST;
     case Tegra::Texture::TextureMipmapFilter::Nearest:
         return VK_SAMPLER_MIPMAP_MODE_NEAREST;
+    case Tegra::Texture::TextureMipmapFilter::Linear:
+        return VK_SAMPLER_MIPMAP_MODE_LINEAR;
     }
-    UNIMPLEMENTED_MSG("Unimplemented sampler mipmap mode={}", static_cast<u32>(mipmap_filter));
+    UNREACHABLE_MSG("Invalid sampler mipmap mode={}", static_cast<u32>(mipmap_filter));
     return {};
 }
 
@@ -78,10 +78,9 @@ VkSamplerAddressMode WrapMode(const VKDevice& device, Tegra::Texture::WrapMode w
     case Tegra::Texture::WrapMode::MirrorOnceBorder:
         UNIMPLEMENTED();
         return VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE;
-    default:
-        UNIMPLEMENTED_MSG("Unimplemented wrap mode={}", static_cast<u32>(wrap_mode));
-        return {};
     }
+    UNIMPLEMENTED_MSG("Unimplemented wrap mode={}", static_cast<u32>(wrap_mode));
+    return {};
 }
 
 VkCompareOp DepthCompareFunction(Tegra::Texture::DepthCompareFunc depth_compare_func) {
@@ -288,10 +287,9 @@ VkPrimitiveTopology PrimitiveTopology([[maybe_unused]] const VKDevice& device,
         return VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST;
     case Maxwell::PrimitiveTopology::Patches:
         return VK_PRIMITIVE_TOPOLOGY_PATCH_LIST;
-    default:
-        UNIMPLEMENTED_MSG("Unimplemented topology={}", static_cast<u32>(topology));
-        return {};
     }
+    UNIMPLEMENTED_MSG("Unimplemented topology={}", static_cast<u32>(topology));
+    return {};
 }
 
 VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttribute::Size size) {
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
index 5f33d9e40..1fde38328 100644
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
@@ -37,8 +37,8 @@ std::unique_ptr<VKStreamBuffer> CreateStreamBuffer(const VKDevice& device, VKSch
 
 } // Anonymous namespace
 
-CachedBufferBlock::CachedBufferBlock(const VKDevice& device, VKMemoryManager& memory_manager,
-                                     VAddr cpu_addr, std::size_t size)
+Buffer::Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VAddr cpu_addr,
+               std::size_t size)
     : VideoCommon::BufferBlock{cpu_addr, size} {
     VkBufferCreateInfo ci;
     ci.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
@@ -54,7 +54,7 @@ CachedBufferBlock::CachedBufferBlock(const VKDevice& device, VKMemoryManager& me
     buffer.commit = memory_manager.Commit(buffer.handle, false);
 }
 
-CachedBufferBlock::~CachedBufferBlock() = default;
+Buffer::~Buffer() = default;
 
 VKBufferCache::VKBufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system,
                              const VKDevice& device, VKMemoryManager& memory_manager,
@@ -67,12 +67,8 @@ VKBufferCache::VKBufferCache(VideoCore::RasterizerInterface& rasterizer, Core::S
 
 VKBufferCache::~VKBufferCache() = default;
 
-Buffer VKBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
-    return std::make_shared<CachedBufferBlock>(device, memory_manager, cpu_addr, size);
-}
-
-VkBuffer VKBufferCache::ToHandle(const Buffer& buffer) {
-    return buffer->GetHandle();
+std::shared_ptr<Buffer> VKBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
+    return std::make_shared<Buffer>(device, memory_manager, cpu_addr, size);
 }
 
 VkBuffer VKBufferCache::GetEmptyBuffer(std::size_t size) {
@@ -91,7 +87,7 @@ void VKBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, st
     std::memcpy(staging.commit->Map(size), data, size);
 
     scheduler.RequestOutsideRenderPassOperationContext();
-    scheduler.Record([staging = *staging.handle, buffer = buffer->GetHandle(), offset,
+    scheduler.Record([staging = *staging.handle, buffer = buffer.Handle(), offset,
                       size](vk::CommandBuffer cmdbuf) {
         cmdbuf.CopyBuffer(staging, buffer, VkBufferCopy{0, offset, size});
 
@@ -114,7 +110,7 @@ void VKBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset,
                                       u8* data) {
     const auto& staging = staging_pool.GetUnusedBuffer(size, true);
     scheduler.RequestOutsideRenderPassOperationContext();
-    scheduler.Record([staging = *staging.handle, buffer = buffer->GetHandle(), offset,
+    scheduler.Record([staging = *staging.handle, buffer = buffer.Handle(), offset,
                       size](vk::CommandBuffer cmdbuf) {
         VkBufferMemoryBarrier barrier;
         barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
@@ -141,8 +137,8 @@ void VKBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset,
 void VKBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
                               std::size_t dst_offset, std::size_t size) {
     scheduler.RequestOutsideRenderPassOperationContext();
-    scheduler.Record([src_buffer = src->GetHandle(), dst_buffer = dst->GetHandle(), src_offset,
-                      dst_offset, size](vk::CommandBuffer cmdbuf) {
+    scheduler.Record([src_buffer = src.Handle(), dst_buffer = dst.Handle(), src_offset, dst_offset,
+                      size](vk::CommandBuffer cmdbuf) {
         cmdbuf.CopyBuffer(src_buffer, dst_buffer, VkBufferCopy{src_offset, dst_offset, size});
 
         std::array<VkBufferMemoryBarrier, 2> barriers;
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h
index 65cb3c8ad..9ebbef835 100644
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.h
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h
@@ -23,13 +23,13 @@ class VKDevice;
 class VKMemoryManager;
 class VKScheduler;
 
-class CachedBufferBlock final : public VideoCommon::BufferBlock {
+class Buffer final : public VideoCommon::BufferBlock {
 public:
-    explicit CachedBufferBlock(const VKDevice& device, VKMemoryManager& memory_manager,
-                               VAddr cpu_addr, std::size_t size);
-    ~CachedBufferBlock();
+    explicit Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VAddr cpu_addr,
+                    std::size_t size);
+    ~Buffer();
 
-    VkBuffer GetHandle() const {
+    VkBuffer Handle() const {
         return *buffer.handle;
     }
 
@@ -37,8 +37,6 @@ private:
     VKBuffer buffer;
 };
 
-using Buffer = std::shared_ptr<CachedBufferBlock>;
-
 class VKBufferCache final : public VideoCommon::BufferCache<Buffer, VkBuffer, VKStreamBuffer> {
 public:
     explicit VKBufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system,
@@ -49,9 +47,7 @@ public:
     VkBuffer GetEmptyBuffer(std::size_t size) override;
 
 protected:
-    VkBuffer ToHandle(const Buffer& buffer) override;
-
-    Buffer CreateBlock(VAddr cpu_addr, std::size_t size) override;
+    std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) override;
 
     void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
                          const u8* data) override;
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index 3170c41f8..184b2238a 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -118,6 +118,17 @@ template <typename Engine, typename Entry>
 Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry,
                                                std::size_t stage, std::size_t index = 0) {
     const auto stage_type = static_cast<Tegra::Engines::ShaderType>(stage);
+    if constexpr (std::is_same_v<Entry, SamplerEntry>) {
+        if (entry.is_separated) {
+            const u32 buffer_1 = entry.buffer;
+            const u32 buffer_2 = entry.secondary_buffer;
+            const u32 offset_1 = entry.offset;
+            const u32 offset_2 = entry.secondary_offset;
+            const u32 handle_1 = engine.AccessConstBuffer32(stage_type, buffer_1, offset_1);
+            const u32 handle_2 = engine.AccessConstBuffer32(stage_type, buffer_2, offset_2);
+            return engine.GetTextureInfo(handle_1 | handle_2);
+        }
+    }
     if (entry.is_bindless) {
         const auto tex_handle = engine.AccessConstBuffer32(stage_type, entry.buffer, entry.offset);
         return engine.GetTextureInfo(tex_handle);
diff --git a/src/video_core/renderer_vulkan/vk_sampler_cache.cpp b/src/video_core/renderer_vulkan/vk_sampler_cache.cpp
index e6f2fa553..616eacc36 100644
--- a/src/video_core/renderer_vulkan/vk_sampler_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_sampler_cache.cpp
@@ -9,6 +9,8 @@
 #include "video_core/renderer_vulkan/wrapper.h"
 #include "video_core/textures/texture.h"
 
+using Tegra::Texture::TextureMipmapFilter;
+
 namespace Vulkan {
 
 namespace {
@@ -63,8 +65,8 @@ vk::Sampler VKSamplerCache::CreateSampler(const Tegra::Texture::TSCEntry& tsc) c
     ci.maxAnisotropy = tsc.GetMaxAnisotropy();
     ci.compareEnable = tsc.depth_compare_enabled;
     ci.compareOp = MaxwellToVK::Sampler::DepthCompareFunction(tsc.depth_compare_func);
-    ci.minLod = tsc.GetMinLod();
-    ci.maxLod = tsc.GetMaxLod();
+    ci.minLod = tsc.mipmap_filter == TextureMipmapFilter::None ? 0.0f : tsc.GetMinLod();
+    ci.maxLod = tsc.mipmap_filter == TextureMipmapFilter::None ? 0.25f : tsc.GetMaxLod();
     ci.borderColor = arbitrary_borders ? VK_BORDER_COLOR_INT_CUSTOM_EXT : ConvertBorderColor(color);
     ci.unnormalizedCoordinates = VK_FALSE;
     return device.GetLogical().CreateSampler(ci);
diff --git a/src/video_core/renderer_vulkan/vk_stream_buffer.h b/src/video_core/renderer_vulkan/vk_stream_buffer.h
index dfddf7ad6..c765c60a0 100644
--- a/src/video_core/renderer_vulkan/vk_stream_buffer.h
+++ b/src/video_core/renderer_vulkan/vk_stream_buffer.h
@@ -35,7 +35,7 @@ public:
     /// Ensures that "size" bytes of memory are available to the GPU, potentially recording a copy.
     void Unmap(u64 size);
 
-    VkBuffer GetHandle() const {
+    VkBuffer Handle() const {
         return *buffer;
     }
 
diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp
index 8f0bb996e..29ebf65ba 100644
--- a/src/video_core/shader/decode/texture.cpp
+++ b/src/video_core/shader/decode/texture.cpp
@@ -357,13 +357,11 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
     return pc;
 }
 
-ShaderIR::SamplerInfo ShaderIR::GetSamplerInfo(SamplerInfo info, u32 offset,
-                                               std::optional<u32> buffer) {
+ShaderIR::SamplerInfo ShaderIR::GetSamplerInfo(
+    SamplerInfo info, std::optional<Tegra::Engines::SamplerDescriptor> sampler) {
     if (info.IsComplete()) {
         return info;
     }
-    const auto sampler = buffer ? registry.ObtainBindlessSampler(*buffer, offset)
-                                : registry.ObtainBoundSampler(offset);
     if (!sampler) {
         LOG_WARNING(HW_GPU, "Unknown sampler info");
         info.type = info.type.value_or(Tegra::Shader::TextureType::Texture2D);
@@ -381,8 +379,8 @@ ShaderIR::SamplerInfo ShaderIR::GetSamplerInfo(SamplerInfo info, u32 offset,
 
 std::optional<Sampler> ShaderIR::GetSampler(Tegra::Shader::Sampler sampler,
                                             SamplerInfo sampler_info) {
-    const auto offset = static_cast<u32>(sampler.index.Value());
-    const auto info = GetSamplerInfo(sampler_info, offset);
+    const u32 offset = static_cast<u32>(sampler.index.Value());
+    const auto info = GetSamplerInfo(sampler_info, registry.ObtainBoundSampler(offset));
 
     // If this sampler has already been used, return the existing mapping.
     const auto it = std::find_if(used_samplers.begin(), used_samplers.end(),
@@ -404,20 +402,19 @@ std::optional<Sampler> ShaderIR::GetBindlessSampler(Tegra::Shader::Register reg,
     const Node sampler_register = GetRegister(reg);
     const auto [base_node, tracked_sampler_info] =
         TrackBindlessSampler(sampler_register, global_code, static_cast<s64>(global_code.size()));
-    ASSERT(base_node != nullptr);
-    if (base_node == nullptr) {
+    if (!base_node) {
+        UNREACHABLE();
         return std::nullopt;
     }
 
-    if (const auto bindless_sampler_info =
-            std::get_if<BindlessSamplerNode>(&*tracked_sampler_info)) {
-        const u32 buffer = bindless_sampler_info->GetIndex();
-        const u32 offset = bindless_sampler_info->GetOffset();
-        info = GetSamplerInfo(info, offset, buffer);
+    if (const auto sampler_info = std::get_if<BindlessSamplerNode>(&*tracked_sampler_info)) {
+        const u32 buffer = sampler_info->index;
+        const u32 offset = sampler_info->offset;
+        info = GetSamplerInfo(info, registry.ObtainBindlessSampler(buffer, offset));
 
         // If this sampler has already been used, return the existing mapping.
         const auto it = std::find_if(used_samplers.begin(), used_samplers.end(),
-                                     [buffer = buffer, offset = offset](const Sampler& entry) {
+                                     [buffer, offset](const Sampler& entry) {
                                          return entry.buffer == buffer && entry.offset == offset;
                                      });
         if (it != used_samplers.end()) {
@@ -431,10 +428,32 @@ std::optional<Sampler> ShaderIR::GetBindlessSampler(Tegra::Shader::Register reg,
         return used_samplers.emplace_back(next_index, offset, buffer, *info.type, *info.is_array,
                                           *info.is_shadow, *info.is_buffer, false);
     }
-    if (const auto array_sampler_info = std::get_if<ArraySamplerNode>(&*tracked_sampler_info)) {
-        const u32 base_offset = array_sampler_info->GetBaseOffset() / 4;
-        index_var = GetCustomVariable(array_sampler_info->GetIndexVar());
-        info = GetSamplerInfo(info, base_offset);
+    if (const auto sampler_info = std::get_if<SeparateSamplerNode>(&*tracked_sampler_info)) {
+        const std::pair indices = sampler_info->indices;
+        const std::pair offsets = sampler_info->offsets;
+        info = GetSamplerInfo(info, registry.ObtainSeparateSampler(indices, offsets));
+
+        // Try to use an already created sampler if it exists
+        const auto it = std::find_if(
+            used_samplers.begin(), used_samplers.end(), [indices, offsets](const Sampler& entry) {
+                return offsets == std::pair{entry.offset, entry.secondary_offset} &&
+                       indices == std::pair{entry.buffer, entry.secondary_buffer};
+            });
+        if (it != used_samplers.end()) {
+            ASSERT(it->is_separated && it->type == info.type && it->is_array == info.is_array &&
+                   it->is_shadow == info.is_shadow && it->is_buffer == info.is_buffer);
+            return *it;
+        }
+
+        // Otherwise create a new mapping for this sampler
+        const u32 next_index = static_cast<u32>(used_samplers.size());
+        return used_samplers.emplace_back(next_index, offsets, indices, *info.type, *info.is_array,
+                                          *info.is_shadow, *info.is_buffer);
+    }
+    if (const auto sampler_info = std::get_if<ArraySamplerNode>(&*tracked_sampler_info)) {
+        const u32 base_offset = sampler_info->base_offset / 4;
+        index_var = GetCustomVariable(sampler_info->bindless_var);
+        info = GetSamplerInfo(info, registry.ObtainBoundSampler(base_offset));
 
         // If this sampler has already been used, return the existing mapping.
         const auto it = std::find_if(
diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h
index c5e5165ff..8f230d57a 100644
--- a/src/video_core/shader/node.h
+++ b/src/video_core/shader/node.h
@@ -275,10 +275,11 @@ using Node = std::shared_ptr<NodeData>;
 using Node4 = std::array<Node, 4>;
 using NodeBlock = std::vector<Node>;
 
-class BindlessSamplerNode;
-class ArraySamplerNode;
+struct ArraySamplerNode;
+struct BindlessSamplerNode;
+struct SeparateSamplerNode;
 
-using TrackSamplerData = std::variant<BindlessSamplerNode, ArraySamplerNode>;
+using TrackSamplerData = std::variant<BindlessSamplerNode, SeparateSamplerNode, ArraySamplerNode>;
 using TrackSampler = std::shared_ptr<TrackSamplerData>;
 
 struct Sampler {
@@ -288,63 +289,51 @@ struct Sampler {
         : index{index}, offset{offset}, type{type}, is_array{is_array}, is_shadow{is_shadow},
           is_buffer{is_buffer}, is_indexed{is_indexed} {}
 
+    /// Separate sampler constructor
+    constexpr explicit Sampler(u32 index, std::pair<u32, u32> offsets, std::pair<u32, u32> buffers,
+                               Tegra::Shader::TextureType type, bool is_array, bool is_shadow,
+                               bool is_buffer)
+        : index{index}, offset{offsets.first}, secondary_offset{offsets.second},
+          buffer{buffers.first}, secondary_buffer{buffers.second}, type{type}, is_array{is_array},
+          is_shadow{is_shadow}, is_buffer{is_buffer}, is_separated{true} {}
+
     /// Bindless samplers constructor
     constexpr explicit Sampler(u32 index, u32 offset, u32 buffer, Tegra::Shader::TextureType type,
                                bool is_array, bool is_shadow, bool is_buffer, bool is_indexed)
         : index{index}, offset{offset}, buffer{buffer}, type{type}, is_array{is_array},
           is_shadow{is_shadow}, is_buffer{is_buffer}, is_bindless{true}, is_indexed{is_indexed} {}
 
-    u32 index = 0;  ///< Emulated index given for the this sampler.
-    u32 offset = 0; ///< Offset in the const buffer from where the sampler is being read.
-    u32 buffer = 0; ///< Buffer where the bindless sampler is being read (unused on bound samplers).
-    u32 size = 1;   ///< Size of the sampler.
+    u32 index = 0;            ///< Emulated index given for the this sampler.
+    u32 offset = 0;           ///< Offset in the const buffer from where the sampler is being read.
+    u32 secondary_offset = 0; ///< Secondary offset in the const buffer.
+    u32 buffer = 0;           ///< Buffer where the bindless sampler is read.
+    u32 secondary_buffer = 0; ///< Secondary buffer where the bindless sampler is read.
+    u32 size = 1;             ///< Size of the sampler.
 
     Tegra::Shader::TextureType type{}; ///< The type used to sample this texture (Texture2D, etc)
-    bool is_array = false;    ///< Whether the texture is being sampled as an array texture or not.
-    bool is_shadow = false;   ///< Whether the texture is being sampled as a depth texture or not.
-    bool is_buffer = false;   ///< Whether the texture is a texture buffer without sampler.
-    bool is_bindless = false; ///< Whether this sampler belongs to a bindless texture or not.
-    bool is_indexed = false;  ///< Whether this sampler is an indexed array of textures.
+    bool is_array = false;     ///< Whether the texture is being sampled as an array texture or not.
+    bool is_shadow = false;    ///< Whether the texture is being sampled as a depth texture or not.
+    bool is_buffer = false;    ///< Whether the texture is a texture buffer without sampler.
+    bool is_bindless = false;  ///< Whether this sampler belongs to a bindless texture or not.
+    bool is_indexed = false;   ///< Whether this sampler is an indexed array of textures.
+    bool is_separated = false; ///< Whether the image and sampler is separated or not.
 };
 
 /// Represents a tracked bindless sampler into a direct const buffer
-class ArraySamplerNode final {
-public:
-    explicit ArraySamplerNode(u32 index, u32 base_offset, u32 bindless_var)
-        : index{index}, base_offset{base_offset}, bindless_var{bindless_var} {}
-
-    constexpr u32 GetIndex() const {
-        return index;
-    }
-
-    constexpr u32 GetBaseOffset() const {
-        return base_offset;
-    }
-
-    constexpr u32 GetIndexVar() const {
-        return bindless_var;
-    }
-
-private:
+struct ArraySamplerNode {
     u32 index;
     u32 base_offset;
     u32 bindless_var;
 };
 
-/// Represents a tracked bindless sampler into a direct const buffer
-class BindlessSamplerNode final {
-public:
-    explicit BindlessSamplerNode(u32 index, u32 offset) : index{index}, offset{offset} {}
-
-    constexpr u32 GetIndex() const {
-        return index;
-    }
-
-    constexpr u32 GetOffset() const {
-        return offset;
-    }
+/// Represents a tracked separate sampler image pair that was folded statically
+struct SeparateSamplerNode {
+    std::pair<u32, u32> indices;
+    std::pair<u32, u32> offsets;
+};
 
-private:
+/// Represents a tracked bindless sampler into a direct const buffer
+struct BindlessSamplerNode {
     u32 index;
     u32 offset;
 };
diff --git a/src/video_core/shader/node_helper.h b/src/video_core/shader/node_helper.h
index 11231bbea..1e0886185 100644
--- a/src/video_core/shader/node_helper.h
+++ b/src/video_core/shader/node_helper.h
@@ -48,7 +48,7 @@ Node MakeNode(Args&&... args) {
 template <typename T, typename... Args>
 TrackSampler MakeTrackSampler(Args&&... args) {
     static_assert(std::is_convertible_v<T, TrackSamplerData>);
-    return std::make_shared<TrackSamplerData>(T(std::forward<Args>(args)...));
+    return std::make_shared<TrackSamplerData>(T{std::forward<Args>(args)...});
 }
 
 template <typename... Args>
diff --git a/src/video_core/shader/registry.cpp b/src/video_core/shader/registry.cpp
index af70b3f35..cdf274e54 100644
--- a/src/video_core/shader/registry.cpp
+++ b/src/video_core/shader/registry.cpp
@@ -93,6 +93,26 @@ std::optional<SamplerDescriptor> Registry::ObtainBoundSampler(u32 offset) {
     return value;
 }
 
+std::optional<Tegra::Engines::SamplerDescriptor> Registry::ObtainSeparateSampler(
+    std::pair<u32, u32> buffers, std::pair<u32, u32> offsets) {
+    SeparateSamplerKey key;
+    key.buffers = buffers;
+    key.offsets = offsets;
+    const auto iter = separate_samplers.find(key);
+    if (iter != separate_samplers.end()) {
+        return iter->second;
+    }
+    if (!engine) {
+        return std::nullopt;
+    }
+
+    const u32 handle_1 = engine->AccessConstBuffer32(stage, key.buffers.first, key.offsets.first);
+    const u32 handle_2 = engine->AccessConstBuffer32(stage, key.buffers.second, key.offsets.second);
+    const SamplerDescriptor value = engine->AccessSampler(handle_1 | handle_2);
+    separate_samplers.emplace(key, value);
+    return value;
+}
+
 std::optional<Tegra::Engines::SamplerDescriptor> Registry::ObtainBindlessSampler(u32 buffer,
                                                                                  u32 offset) {
     const std::pair key = {buffer, offset};
diff --git a/src/video_core/shader/registry.h b/src/video_core/shader/registry.h
index 0c80d35fd..231206765 100644
--- a/src/video_core/shader/registry.h
+++ b/src/video_core/shader/registry.h
@@ -19,8 +19,39 @@
 
 namespace VideoCommon::Shader {
 
+struct SeparateSamplerKey {
+    std::pair<u32, u32> buffers;
+    std::pair<u32, u32> offsets;
+};
+
+} // namespace VideoCommon::Shader
+
+namespace std {
+
+template <>
+struct hash<VideoCommon::Shader::SeparateSamplerKey> {
+    std::size_t operator()(const VideoCommon::Shader::SeparateSamplerKey& key) const noexcept {
+        return std::hash<u32>{}(key.buffers.first ^ key.buffers.second ^ key.offsets.first ^
+                                key.offsets.second);
+    }
+};
+
+template <>
+struct equal_to<VideoCommon::Shader::SeparateSamplerKey> {
+    bool operator()(const VideoCommon::Shader::SeparateSamplerKey& lhs,
+                    const VideoCommon::Shader::SeparateSamplerKey& rhs) const noexcept {
+        return lhs.buffers == rhs.buffers && lhs.offsets == rhs.offsets;
+    }
+};
+
+} // namespace std
+
+namespace VideoCommon::Shader {
+
 using KeyMap = std::unordered_map<std::pair<u32, u32>, u32, Common::PairHash>;
 using BoundSamplerMap = std::unordered_map<u32, Tegra::Engines::SamplerDescriptor>;
+using SeparateSamplerMap =
+    std::unordered_map<SeparateSamplerKey, Tegra::Engines::SamplerDescriptor>;
 using BindlessSamplerMap =
     std::unordered_map<std::pair<u32, u32>, Tegra::Engines::SamplerDescriptor, Common::PairHash>;
 
@@ -73,6 +104,9 @@ public:
 
     std::optional<Tegra::Engines::SamplerDescriptor> ObtainBoundSampler(u32 offset);
 
+    std::optional<Tegra::Engines::SamplerDescriptor> ObtainSeparateSampler(
+        std::pair<u32, u32> buffers, std::pair<u32, u32> offsets);
+
     std::optional<Tegra::Engines::SamplerDescriptor> ObtainBindlessSampler(u32 buffer, u32 offset);
 
     /// Inserts a key.
@@ -128,6 +162,7 @@ private:
     Tegra::Engines::ConstBufferEngineInterface* engine = nullptr;
     KeyMap keys;
     BoundSamplerMap bound_samplers;
+    SeparateSamplerMap separate_samplers;
     BindlessSamplerMap bindless_samplers;
     u32 bound_buffer;
     GraphicsInfo graphics_info;
diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h
index 15ae152f2..3a98b2104 100644
--- a/src/video_core/shader/shader_ir.h
+++ b/src/video_core/shader/shader_ir.h
@@ -330,8 +330,8 @@ private:
     OperationCode GetPredicateCombiner(Tegra::Shader::PredOperation operation);
 
     /// Queries the missing sampler info from the execution context.
-    SamplerInfo GetSamplerInfo(SamplerInfo info, u32 offset,
-                               std::optional<u32> buffer = std::nullopt);
+    SamplerInfo GetSamplerInfo(SamplerInfo info,
+                               std::optional<Tegra::Engines::SamplerDescriptor> sampler);
 
     /// Accesses a texture sampler.
     std::optional<Sampler> GetSampler(Tegra::Shader::Sampler sampler, SamplerInfo info);
@@ -409,8 +409,14 @@ private:
 
     std::tuple<Node, u32, u32> TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) const;
 
-    std::tuple<Node, TrackSampler> TrackBindlessSampler(Node tracked, const NodeBlock& code,
-                                                        s64 cursor);
+    std::pair<Node, TrackSampler> TrackBindlessSampler(Node tracked, const NodeBlock& code,
+                                                       s64 cursor);
+
+    std::pair<Node, TrackSampler> HandleBindlessIndirectRead(const CbufNode& cbuf,
+                                                             const OperationNode& operation,
+                                                             Node gpr, Node base_offset,
+                                                             Node tracked, const NodeBlock& code,
+                                                             s64 cursor);
 
     std::optional<u32> TrackImmediate(Node tracked, const NodeBlock& code, s64 cursor) const;
 
diff --git a/src/video_core/shader/track.cpp b/src/video_core/shader/track.cpp
index eb97bfd41..d5ed81442 100644
--- a/src/video_core/shader/track.cpp
+++ b/src/video_core/shader/track.cpp
@@ -14,6 +14,7 @@
 namespace VideoCommon::Shader {
 
 namespace {
+
 std::pair<Node, s64> FindOperation(const NodeBlock& code, s64 cursor,
                                    OperationCode operation_code) {
     for (; cursor >= 0; --cursor) {
@@ -63,7 +64,8 @@ bool AmendNodeCv(std::size_t amend_index, Node node) {
     if (const auto operation = std::get_if<OperationNode>(&*node)) {
         operation->SetAmendIndex(amend_index);
         return true;
-    } else if (const auto conditional = std::get_if<ConditionalNode>(&*node)) {
+    }
+    if (const auto conditional = std::get_if<ConditionalNode>(&*node)) {
         conditional->SetAmendIndex(amend_index);
         return true;
     }
@@ -72,40 +74,27 @@ bool AmendNodeCv(std::size_t amend_index, Node node) {
 
 } // Anonymous namespace
 
-std::tuple<Node, TrackSampler> ShaderIR::TrackBindlessSampler(Node tracked, const NodeBlock& code,
-                                                              s64 cursor) {
+std::pair<Node, TrackSampler> ShaderIR::TrackBindlessSampler(Node tracked, const NodeBlock& code,
+                                                             s64 cursor) {
     if (const auto cbuf = std::get_if<CbufNode>(&*tracked)) {
+        const u32 cbuf_index = cbuf->GetIndex();
+
         // Constant buffer found, test if it's an immediate
         const auto& offset = cbuf->GetOffset();
         if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) {
-            auto track =
-                MakeTrackSampler<BindlessSamplerNode>(cbuf->GetIndex(), immediate->GetValue());
+            auto track = MakeTrackSampler<BindlessSamplerNode>(cbuf_index, immediate->GetValue());
             return {tracked, track};
         }
         if (const auto operation = std::get_if<OperationNode>(&*offset)) {
             const u32 bound_buffer = registry.GetBoundBuffer();
-            if (bound_buffer != cbuf->GetIndex()) {
+            if (bound_buffer != cbuf_index) {
                 return {};
             }
-            const auto pair = DecoupleIndirectRead(*operation);
-            if (!pair) {
-                return {};
+            if (const std::optional pair = DecoupleIndirectRead(*operation)) {
+                auto [gpr, base_offset] = *pair;
+                return HandleBindlessIndirectRead(*cbuf, *operation, gpr, base_offset, tracked,
+                                                  code, cursor);
             }
-            auto [gpr, base_offset] = *pair;
-            const auto offset_inm = std::get_if<ImmediateNode>(&*base_offset);
-            const auto& gpu_driver = registry.AccessGuestDriverProfile();
-            const u32 bindless_cv = NewCustomVariable();
-            Node op =
-                Operation(OperationCode::UDiv, gpr, Immediate(gpu_driver.GetTextureHandlerSize()));
-
-            const Node cv_node = GetCustomVariable(bindless_cv);
-            Node amend_op = Operation(OperationCode::Assign, cv_node, std::move(op));
-            const std::size_t amend_index = DeclareAmend(std::move(amend_op));
-            AmendNodeCv(amend_index, code[cursor]);
-            // TODO Implement Bindless Index custom variable
-            auto track = MakeTrackSampler<ArraySamplerNode>(cbuf->GetIndex(),
-                                                            offset_inm->GetValue(), bindless_cv);
-            return {tracked, track};
         }
         return {};
     }
@@ -122,10 +111,23 @@ std::tuple<Node, TrackSampler> ShaderIR::TrackBindlessSampler(Node tracked, cons
         return TrackBindlessSampler(source, code, new_cursor);
     }
     if (const auto operation = std::get_if<OperationNode>(&*tracked)) {
-        for (std::size_t i = operation->GetOperandsCount(); i > 0; --i) {
-            if (auto found = TrackBindlessSampler((*operation)[i - 1], code, cursor);
-                std::get<0>(found)) {
-                // Cbuf found in operand.
+        const OperationNode& op = *operation;
+
+        const OperationCode opcode = operation->GetCode();
+        if (opcode == OperationCode::IBitwiseOr || opcode == OperationCode::UBitwiseOr) {
+            ASSERT(op.GetOperandsCount() == 2);
+            auto [node_a, index_a, offset_a] = TrackCbuf(op[0], code, cursor);
+            auto [node_b, index_b, offset_b] = TrackCbuf(op[1], code, cursor);
+            if (node_a && node_b) {
+                auto track = MakeTrackSampler<SeparateSamplerNode>(std::pair{index_a, index_b},
+                                                                   std::pair{offset_a, offset_b});
+                return {tracked, std::move(track)};
+            }
+        }
+        std::size_t i = op.GetOperandsCount();
+        while (i--) {
+            if (auto found = TrackBindlessSampler(op[i - 1], code, cursor); std::get<0>(found)) {
+                // Constant buffer found in operand.
                 return found;
             }
         }
@@ -139,6 +141,26 @@ std::tuple<Node, TrackSampler> ShaderIR::TrackBindlessSampler(Node tracked, cons
     return {};
 }
 
+std::pair<Node, TrackSampler> ShaderIR::HandleBindlessIndirectRead(
+    const CbufNode& cbuf, const OperationNode& operation, Node gpr, Node base_offset, Node tracked,
+    const NodeBlock& code, s64 cursor) {
+    const auto offset_imm = std::get<ImmediateNode>(*base_offset);
+    const auto& gpu_driver = registry.AccessGuestDriverProfile();
+    const u32 bindless_cv = NewCustomVariable();
+    const u32 texture_handler_size = gpu_driver.GetTextureHandlerSize();
+    Node op = Operation(OperationCode::UDiv, gpr, Immediate(texture_handler_size));
+
+    Node cv_node = GetCustomVariable(bindless_cv);
+    Node amend_op = Operation(OperationCode::Assign, std::move(cv_node), std::move(op));
+    const std::size_t amend_index = DeclareAmend(std::move(amend_op));
+    AmendNodeCv(amend_index, code[cursor]);
+
+    // TODO: Implement bindless index custom variable
+    auto track =
+        MakeTrackSampler<ArraySamplerNode>(cbuf.GetIndex(), offset_imm.GetValue(), bindless_cv);
+    return {tracked, track};
+}
+
 std::tuple<Node, u32, u32> ShaderIR::TrackCbuf(Node tracked, const NodeBlock& code,
                                                s64 cursor) const {
     if (const auto cbuf = std::get_if<CbufNode>(&*tracked)) {
diff --git a/src/yuzu/configuration/config.cpp b/src/yuzu/configuration/config.cpp
index 7e9073cc3..32c81dc70 100644
--- a/src/yuzu/configuration/config.cpp
+++ b/src/yuzu/configuration/config.cpp
@@ -631,13 +631,11 @@ void Config::ReadRendererValues() {
         static_cast<Settings::RendererBackend>(ReadSetting(QStringLiteral("backend"), 0).toInt());
     Settings::values.renderer_debug = ReadSetting(QStringLiteral("debug"), false).toBool();
     Settings::values.vulkan_device = ReadSetting(QStringLiteral("vulkan_device"), 0).toInt();
-    Settings::values.resolution_factor =
-        ReadSetting(QStringLiteral("resolution_factor"), 1.0).toFloat();
     Settings::values.aspect_ratio = ReadSetting(QStringLiteral("aspect_ratio"), 0).toInt();
     Settings::values.max_anisotropy = ReadSetting(QStringLiteral("max_anisotropy"), 0).toInt();
     Settings::values.use_frame_limit =
         ReadSetting(QStringLiteral("use_frame_limit"), true).toBool();
-    Settings::values.frame_limit = ReadSetting(QStringLiteral("frame_limit"), 100).toInt();
+    Settings::values.frame_limit = ReadSetting(QStringLiteral("frame_limit"), 100).toUInt();
     Settings::values.use_disk_shader_cache =
         ReadSetting(QStringLiteral("use_disk_shader_cache"), true).toBool();
     const int gpu_accuracy_level = ReadSetting(QStringLiteral("gpu_accuracy"), 0).toInt();
@@ -722,8 +720,6 @@ void Config::ReadUIValues() {
             .toString();
     UISettings::values.enable_discord_presence =
         ReadSetting(QStringLiteral("enable_discord_presence"), true).toBool();
-    UISettings::values.screenshot_resolution_factor =
-        static_cast<u16>(ReadSetting(QStringLiteral("screenshot_resolution_factor"), 0).toUInt());
     UISettings::values.select_user_on_boot =
         ReadSetting(QStringLiteral("select_user_on_boot"), false).toBool();
 
@@ -1082,8 +1078,6 @@ void Config::SaveRendererValues() {
     WriteSetting(QStringLiteral("backend"), static_cast<int>(Settings::values.renderer_backend), 0);
     WriteSetting(QStringLiteral("debug"), Settings::values.renderer_debug, false);
     WriteSetting(QStringLiteral("vulkan_device"), Settings::values.vulkan_device, 0);
-    WriteSetting(QStringLiteral("resolution_factor"),
-                 static_cast<double>(Settings::values.resolution_factor), 1.0);
     WriteSetting(QStringLiteral("aspect_ratio"), Settings::values.aspect_ratio, 0);
     WriteSetting(QStringLiteral("max_anisotropy"), Settings::values.max_anisotropy, 0);
     WriteSetting(QStringLiteral("use_frame_limit"), Settings::values.use_frame_limit, true);
@@ -1159,8 +1153,6 @@ void Config::SaveUIValues() {
                  QString::fromUtf8(UISettings::themes[0].second));
     WriteSetting(QStringLiteral("enable_discord_presence"),
                  UISettings::values.enable_discord_presence, true);
-    WriteSetting(QStringLiteral("screenshot_resolution_factor"),
-                 UISettings::values.screenshot_resolution_factor, 0);
     WriteSetting(QStringLiteral("select_user_on_boot"), UISettings::values.select_user_on_boot,
                  false);
 
diff --git a/src/yuzu/configuration/configure_graphics.cpp b/src/yuzu/configuration/configure_graphics.cpp
index ea667caef..304625cd7 100644
--- a/src/yuzu/configuration/configure_graphics.cpp
+++ b/src/yuzu/configuration/configure_graphics.cpp
@@ -19,47 +19,6 @@
 #include "video_core/renderer_vulkan/renderer_vulkan.h"
 #endif
 
-namespace {
-enum class Resolution : int {
-    Auto,
-    Scale1x,
-    Scale2x,
-    Scale3x,
-    Scale4x,
-};
-
-float ToResolutionFactor(Resolution option) {
-    switch (option) {
-    case Resolution::Auto:
-        return 0.f;
-    case Resolution::Scale1x:
-        return 1.f;
-    case Resolution::Scale2x:
-        return 2.f;
-    case Resolution::Scale3x:
-        return 3.f;
-    case Resolution::Scale4x:
-        return 4.f;
-    }
-    return 0.f;
-}
-
-Resolution FromResolutionFactor(float factor) {
-    if (factor == 0.f) {
-        return Resolution::Auto;
-    } else if (factor == 1.f) {
-        return Resolution::Scale1x;
-    } else if (factor == 2.f) {
-        return Resolution::Scale2x;
-    } else if (factor == 3.f) {
-        return Resolution::Scale3x;
-    } else if (factor == 4.f) {
-        return Resolution::Scale4x;
-    }
-    return Resolution::Auto;
-}
-} // Anonymous namespace
-
 ConfigureGraphics::ConfigureGraphics(QWidget* parent)
     : QWidget(parent), ui(new Ui::ConfigureGraphics) {
     vulkan_device = Settings::values.vulkan_device;
@@ -99,8 +58,6 @@ void ConfigureGraphics::SetConfiguration() {
 
     ui->api->setEnabled(runtime_lock);
     ui->api->setCurrentIndex(static_cast<int>(Settings::values.renderer_backend));
-    ui->resolution_factor_combobox->setCurrentIndex(
-        static_cast<int>(FromResolutionFactor(Settings::values.resolution_factor)));
     ui->aspect_ratio_combobox->setCurrentIndex(Settings::values.aspect_ratio);
     ui->use_disk_shader_cache->setEnabled(runtime_lock);
     ui->use_disk_shader_cache->setChecked(Settings::values.use_disk_shader_cache);
@@ -114,8 +71,6 @@ void ConfigureGraphics::SetConfiguration() {
 void ConfigureGraphics::ApplyConfiguration() {
     Settings::values.renderer_backend = GetCurrentGraphicsBackend();
     Settings::values.vulkan_device = vulkan_device;
-    Settings::values.resolution_factor =
-        ToResolutionFactor(static_cast<Resolution>(ui->resolution_factor_combobox->currentIndex()));
     Settings::values.aspect_ratio = ui->aspect_ratio_combobox->currentIndex();
     Settings::values.use_disk_shader_cache = ui->use_disk_shader_cache->isChecked();
     Settings::values.use_asynchronous_gpu_emulation =
diff --git a/src/yuzu/configuration/configure_graphics.ui b/src/yuzu/configuration/configure_graphics.ui
index c816d6108..6e75447a5 100644
--- a/src/yuzu/configuration/configure_graphics.ui
+++ b/src/yuzu/configuration/configure_graphics.ui
@@ -85,46 +85,6 @@
          </widget>
         </item>
         <item>
-         <layout class="QHBoxLayout" name="horizontalLayout_2">
-          <item>
-           <widget class="QLabel" name="label">
-            <property name="text">
-             <string>Internal Resolution:</string>
-            </property>
-           </widget>
-          </item>
-          <item>
-           <widget class="QComboBox" name="resolution_factor_combobox">
-            <item>
-             <property name="text">
-              <string>Auto (Window Size)</string>
-             </property>
-            </item>
-            <item>
-             <property name="text">
-              <string>Native (1280x720)</string>
-             </property>
-            </item>
-            <item>
-             <property name="text">
-              <string>2x Native (2560x1440)</string>
-             </property>
-            </item>
-            <item>
-             <property name="text">
-              <string>3x Native (3840x2160)</string>
-             </property>
-            </item>
-            <item>
-             <property name="text">
-              <string>4x Native (5120x2880)</string>
-             </property>
-            </item>
-           </widget>
-          </item>
-         </layout>
-        </item>
-        <item>
          <layout class="QHBoxLayout" name="horizontalLayout_6">
           <item>
            <widget class="QLabel" name="ar_label">
diff --git a/src/yuzu/configuration/configure_graphics_advanced.cpp b/src/yuzu/configuration/configure_graphics_advanced.cpp
index 37aadf7f8..be5006ad3 100644
--- a/src/yuzu/configuration/configure_graphics_advanced.cpp
+++ b/src/yuzu/configuration/configure_graphics_advanced.cpp
@@ -12,9 +12,6 @@ ConfigureGraphicsAdvanced::ConfigureGraphicsAdvanced(QWidget* parent)
 
     ui->setupUi(this);
 
-    // TODO: Remove this after assembly shaders are fully integrated
-    ui->use_assembly_shaders->setVisible(false);
-
     SetConfiguration();
 }
 
diff --git a/src/yuzu/main.cpp b/src/yuzu/main.cpp
index 270cccc77..4119d7907 100644
--- a/src/yuzu/main.cpp
+++ b/src/yuzu/main.cpp
@@ -689,10 +689,7 @@ void GMainWindow::InitializeHotkeys() {
                 Settings::values.use_frame_limit = !Settings::values.use_frame_limit;
                 UpdateStatusBar();
             });
-    // TODO: Remove this comment/static whenever the next major release of
-    // MSVC occurs and we make it a requirement (see:
-    // https://developercommunity.visualstudio.com/content/problem/93922/constexprs-are-trying-to-be-captured-in-lambda-fun.html)
-    static constexpr u16 SPEED_LIMIT_STEP = 5;
+    constexpr u16 SPEED_LIMIT_STEP = 5;
     connect(hotkey_registry.GetHotkey(main_window, QStringLiteral("Increase Speed Limit"), this),
             &QShortcut::activated, this, [&] {
                 if (Settings::values.frame_limit < 9999 - SPEED_LIMIT_STEP) {
diff --git a/src/yuzu_cmd/config.cpp b/src/yuzu_cmd/config.cpp
index 7240270f5..659b9f701 100644
--- a/src/yuzu_cmd/config.cpp
+++ b/src/yuzu_cmd/config.cpp
@@ -380,8 +380,6 @@ void Config::ReadValues() {
     Settings::values.renderer_debug = sdl2_config->GetBoolean("Renderer", "debug", false);
     Settings::values.vulkan_device = sdl2_config->GetInteger("Renderer", "vulkan_device", 0);
 
-    Settings::values.resolution_factor =
-        static_cast<float>(sdl2_config->GetReal("Renderer", "resolution_factor", 1.0));
     Settings::values.aspect_ratio =
         static_cast<int>(sdl2_config->GetInteger("Renderer", "aspect_ratio", 0));
     Settings::values.max_anisotropy =
diff --git a/src/yuzu_cmd/default_ini.h b/src/yuzu_cmd/default_ini.h
index 6f53e9659..45c07ed5d 100644
--- a/src/yuzu_cmd/default_ini.h
+++ b/src/yuzu_cmd/default_ini.h
@@ -117,11 +117,6 @@ use_hw_renderer =
 # 0: Interpreter (slow), 1 (default): JIT (fast)
 use_shader_jit =
 
-# Resolution scale factor
-# 0: Auto (scales resolution to window size), 1: Native Switch screen resolution, Otherwise a scale
-# factor for the Switch resolution
-resolution_factor =
-
 # Aspect ratio
 # 0: Default (16:9), 1: Force 4:3, 2: Force 21:9, 3: Stretch to Window
 aspect_ratio =
diff --git a/src/yuzu_tester/config.cpp b/src/yuzu_tester/config.cpp
index 3be58b15d..1566c2e3f 100644
--- a/src/yuzu_tester/config.cpp
+++ b/src/yuzu_tester/config.cpp
@@ -116,8 +116,6 @@ void Config::ReadValues() {
     Settings::values.use_multi_core = sdl2_config->GetBoolean("Core", "use_multi_core", false);
 
     // Renderer
-    Settings::values.resolution_factor =
-        static_cast<float>(sdl2_config->GetReal("Renderer", "resolution_factor", 1.0));
     Settings::values.aspect_ratio =
         static_cast<int>(sdl2_config->GetInteger("Renderer", "aspect_ratio", 0));
     Settings::values.max_anisotropy =
diff --git a/src/yuzu_tester/default_ini.h b/src/yuzu_tester/default_ini.h
index ca203b64d..41bbbbf60 100644
--- a/src/yuzu_tester/default_ini.h
+++ b/src/yuzu_tester/default_ini.h
@@ -21,11 +21,6 @@ use_hw_renderer =
 # 0: Interpreter (slow), 1 (default): JIT (fast)
 use_shader_jit =
 
-# Resolution scale factor
-# 0: Auto (scales resolution to window size), 1: Native Switch screen resolution, Otherwise a scale
-# factor for the Switch resolution
-resolution_factor =
-
 # Aspect ratio
 # 0: Default (16:9), 1: Force 4:3, 2: Force 21:9, 3: Stretch to Window
 aspect_ratio =