35 files changed, 2045 insertions, 221 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index d7f7d336c..b03a30992 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -28,6 +28,10 @@ add_library(video_core STATIC
     dirty_flags.h
     dma_pusher.cpp
     dma_pusher.h
+    engines/sw_blitter/blitter.cpp
+    engines/sw_blitter/blitter.h
+    engines/sw_blitter/converter.cpp
+    engines/sw_blitter/converter.h
     engines/const_buffer_info.h
     engines/engine_interface.h
     engines/engine_upload.cpp
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index 599551013..5d3a8293b 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -1742,12 +1742,12 @@ bool BufferCache<P>::InlineMemory(VAddr dest_address, size_t copy_size,
     SynchronizeBuffer(buffer, dest_address, static_cast<u32>(copy_size));
 
     if constexpr (USE_MEMORY_MAPS) {
+        auto upload_staging = runtime.UploadStagingBuffer(copy_size);
         std::array copies{BufferCopy{
-            .src_offset = 0,
+            .src_offset = upload_staging.offset,
             .dst_offset = buffer.Offset(dest_address),
             .size = copy_size,
         }};
-        auto upload_staging = runtime.UploadStagingBuffer(copy_size);
         u8* const src_pointer = upload_staging.mapped_span.data();
         std::memcpy(src_pointer, inlined_buffer.data(), copy_size);
         runtime.CopyBuffer(buffer, upload_staging.buffer, copies);
diff --git a/src/video_core/control/channel_state.cpp b/src/video_core/control/channel_state.cpp
index cdecc3a91..832025d75 100644
--- a/src/video_core/control/channel_state.cpp
+++ b/src/video_core/control/channel_state.cpp
@@ -20,7 +20,7 @@ void ChannelState::Init(Core::System& system, GPU& gpu) {
     ASSERT(memory_manager);
     dma_pusher = std::make_unique<Tegra::DmaPusher>(system, gpu, *memory_manager, *this);
     maxwell_3d = std::make_unique<Engines::Maxwell3D>(system, *memory_manager);
-    fermi_2d = std::make_unique<Engines::Fermi2D>();
+    fermi_2d = std::make_unique<Engines::Fermi2D>(*memory_manager);
     kepler_compute = std::make_unique<Engines::KeplerCompute>(system, *memory_manager);
     maxwell_dma = std::make_unique<Engines::MaxwellDMA>(system, *memory_manager);
     kepler_memory = std::make_unique<Engines::KeplerMemory>(system, *memory_manager);
diff --git a/src/video_core/engines/engine_upload.cpp b/src/video_core/engines/engine_upload.cpp
index a34819234..e4f8331ab 100644
--- a/src/video_core/engines/engine_upload.cpp
+++ b/src/video_core/engines/engine_upload.cpp
@@ -49,13 +49,12 @@ void State::ProcessData(std::span<const u8> read_buffer) {
         if (regs.line_count == 1) {
             rasterizer->AccelerateInlineToMemory(address, copy_size, read_buffer);
         } else {
-            for (u32 line = 0; line < regs.line_count; ++line) {
-                const GPUVAddr dest_line = address + static_cast<size_t>(line) * regs.dest.pitch;
-                memory_manager.WriteBlockUnsafe(
-                    dest_line, read_buffer.data() + static_cast<size_t>(line) * regs.line_length_in,
-                    regs.line_length_in);
+            for (size_t line = 0; line < regs.line_count; ++line) {
+                const GPUVAddr dest_line = address + line * regs.dest.pitch;
+                std::span<const u8> buffer(read_buffer.data() + line * regs.line_length_in,
+                                           regs.line_length_in);
+                rasterizer->AccelerateInlineToMemory(dest_line, regs.line_length_in, buffer);
             }
-            memory_manager.InvalidateRegion(address, regs.dest.pitch * regs.line_count);
         }
     } else {
         u32 width = regs.dest.width;
diff --git a/src/video_core/engines/engine_upload.h b/src/video_core/engines/engine_upload.h
index f08f6e36a..94fafd9dc 100644
--- a/src/video_core/engines/engine_upload.h
+++ b/src/video_core/engines/engine_upload.h
@@ -39,7 +39,7 @@ struct Registers {
         u32 y;
 
         GPUVAddr Address() const {
-            return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | address_low);
+            return (GPUVAddr{address_high} << 32) | GPUVAddr{address_low};
         }
 
         u32 BlockWidth() const {
diff --git a/src/video_core/engines/fermi_2d.cpp b/src/video_core/engines/fermi_2d.cpp
index 453e0fb01..c6478ae85 100644
--- a/src/video_core/engines/fermi_2d.cpp
+++ b/src/video_core/engines/fermi_2d.cpp
@@ -3,17 +3,25 @@
 
 #include "common/assert.h"
 #include "common/logging/log.h"
+#include "common/microprofile.h"
 #include "video_core/engines/fermi_2d.h"
-#include "video_core/memory_manager.h"
+#include "video_core/engines/sw_blitter/blitter.h"
 #include "video_core/rasterizer_interface.h"
 #include "video_core/surface.h"
+#include "video_core/textures/decoders.h"
+
+MICROPROFILE_DECLARE(GPU_BlitEngine);
+MICROPROFILE_DEFINE(GPU_BlitEngine, "GPU", "Blit Engine", MP_RGB(224, 224, 128));
 
 using VideoCore::Surface::BytesPerBlock;
 using VideoCore::Surface::PixelFormatFromRenderTargetFormat;
 
 namespace Tegra::Engines {
 
-Fermi2D::Fermi2D() {
+using namespace Texture;
+
+Fermi2D::Fermi2D(MemoryManager& memory_manager_) {
+    sw_blitter = std::make_unique<Blitter::SoftwareBlitEngine>(memory_manager_);
     // Nvidia's OpenGL driver seems to assume these values
     regs.src.depth = 1;
     regs.dst.depth = 1;
@@ -42,6 +50,7 @@ void Fermi2D::CallMultiMethod(u32 method, const u32* base_start, u32 amount, u32
 }
 
 void Fermi2D::Blit() {
+    MICROPROFILE_SCOPE(GPU_BlitEngine);
     LOG_DEBUG(HW_GPU, "called. source address=0x{:x}, destination address=0x{:x}",
               regs.src.Address(), regs.dst.Address());
 
@@ -52,9 +61,16 @@ void Fermi2D::Blit() {
     UNIMPLEMENTED_IF_MSG(regs.clip_enable != 0, "Clipped blit enabled");
 
     const auto& args = regs.pixels_from_memory;
+    constexpr s64 null_derivate = 1ULL << 32;
+    Surface src = regs.src;
+    const auto bytes_per_pixel = BytesPerBlock(PixelFormatFromRenderTargetFormat(src.format));
+    const bool delegate_to_gpu = src.width > 512 && src.height > 512 && bytes_per_pixel <= 8 &&
+                                 src.format != regs.dst.format;
     Config config{
         .operation = regs.operation,
         .filter = args.sample_mode.filter,
+        .must_accelerate =
+            args.du_dx != null_derivate || args.dv_dy != null_derivate || delegate_to_gpu,
         .dst_x0 = args.dst_x0,
         .dst_y0 = args.dst_y0,
         .dst_x1 = args.dst_x0 + args.dst_width,
@@ -64,8 +80,7 @@ void Fermi2D::Blit() {
         .src_x1 = static_cast<s32>((args.du_dx * args.dst_width + args.src_x0) >> 32),
         .src_y1 = static_cast<s32>((args.dv_dy * args.dst_height + args.src_y0) >> 32),
     };
-    Surface src = regs.src;
-    const auto bytes_per_pixel = BytesPerBlock(PixelFormatFromRenderTargetFormat(src.format));
+
     const auto need_align_to_pitch =
         src.linear == Tegra::Engines::Fermi2D::MemoryLayout::Pitch &&
         static_cast<s32>(src.width) == config.src_x1 &&
@@ -78,8 +93,9 @@ void Fermi2D::Blit() {
         config.src_x1 -= config.src_x0;
         config.src_x0 = 0;
     }
+
     if (!rasterizer->AccelerateSurfaceCopy(src, regs.dst, config)) {
-        UNIMPLEMENTED();
+        sw_blitter->Blit(src, regs.dst, config);
     }
 }
 
diff --git a/src/video_core/engines/fermi_2d.h b/src/video_core/engines/fermi_2d.h
index 1229aa35b..100b21bac 100644
--- a/src/video_core/engines/fermi_2d.h
+++ b/src/video_core/engines/fermi_2d.h
@@ -5,6 +5,7 @@
 
 #include <array>
 #include <cstddef>
+#include <memory>
 #include "common/bit_field.h"
 #include "common/common_funcs.h"
 #include "common/common_types.h"
@@ -21,6 +22,10 @@ class RasterizerInterface;
 
 namespace Tegra::Engines {
 
+namespace Blitter {
+class SoftwareBlitEngine;
+}
+
 /**
  * This Engine is known as G80_2D. Documentation can be found in:
  * https://github.com/envytools/envytools/blob/master/rnndb/graph/g80_2d.xml
@@ -32,7 +37,7 @@ namespace Tegra::Engines {
 
 class Fermi2D final : public EngineInterface {
 public:
-    explicit Fermi2D();
+    explicit Fermi2D(MemoryManager& memory_manager_);
     ~Fermi2D() override;
 
     /// Binds a rasterizer to this engine.
@@ -92,7 +97,7 @@ public:
         u32 addr_lower;
 
         [[nodiscard]] constexpr GPUVAddr Address() const noexcept {
-            return (static_cast<GPUVAddr>(addr_upper) << 32) | static_cast<GPUVAddr>(addr_lower);
+            return (GPUVAddr{addr_upper} << 32) | GPUVAddr{addr_lower};
         }
     };
     static_assert(sizeof(Surface) == 0x28, "Surface has incorrect size");
@@ -286,6 +291,7 @@ public:
     struct Config {
         Operation operation;
         Filter filter;
+        bool must_accelerate;
         s32 dst_x0;
         s32 dst_y0;
         s32 dst_x1;
@@ -298,6 +304,7 @@ public:
 
 private:
     VideoCore::RasterizerInterface* rasterizer = nullptr;
+    std::unique_ptr<Blitter::SoftwareBlitEngine> sw_blitter;
 
     /// Performs the copy from the source surface to the destination surface as configured in the
     /// registers.
diff --git a/src/video_core/engines/kepler_compute.cpp b/src/video_core/engines/kepler_compute.cpp
index 7c50bdbe0..e5c622155 100644
--- a/src/video_core/engines/kepler_compute.cpp
+++ b/src/video_core/engines/kepler_compute.cpp
@@ -50,11 +50,11 @@ void KeplerCompute::CallMultiMethod(u32 method, const u32* base_start, u32 amoun
                                     u32 methods_pending) {
     switch (method) {
     case KEPLER_COMPUTE_REG_INDEX(data_upload):
-        upload_state.ProcessData(base_start, static_cast<size_t>(amount));
+        upload_state.ProcessData(base_start, amount);
         return;
     default:
-        for (std::size_t i = 0; i < amount; i++) {
-            CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1);
+        for (u32 i = 0; i < amount; i++) {
+            CallMethod(method, base_start[i], methods_pending - i <= 1);
         }
         break;
     }
diff --git a/src/video_core/engines/kepler_compute.h b/src/video_core/engines/kepler_compute.h
index aab309ecc..e154e3f06 100644
--- a/src/video_core/engines/kepler_compute.h
+++ b/src/video_core/engines/kepler_compute.h
@@ -68,7 +68,7 @@ public:
                 struct {
                     u32 address;
                     GPUVAddr Address() const {
-                        return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address) << 8));
+                        return GPUVAddr{address} << 8;
                     }
                 } launch_desc_loc;
 
@@ -83,8 +83,7 @@ public:
                     u32 address_low;
                     u32 limit;
                     GPUVAddr Address() const {
-                        return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
-                                                     address_low);
+                        return (GPUVAddr{address_high} << 32) | GPUVAddr{address_low};
                     }
                 } tsc;
 
@@ -95,8 +94,7 @@ public:
                     u32 address_low;
                     u32 limit;
                     GPUVAddr Address() const {
-                        return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
-                                                     address_low);
+                        return (GPUVAddr{address_high} << 32) | GPUVAddr{address_low};
                     }
                 } tic;
 
@@ -106,8 +104,7 @@ public:
                     u32 address_high;
                     u32 address_low;
                     GPUVAddr Address() const {
-                        return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
-                                                     address_low);
+                        return (GPUVAddr{address_high} << 32) | GPUVAddr{address_low};
                     }
                 } code_loc;
 
@@ -162,8 +159,7 @@ public:
                 BitField<15, 17, u32> size;
             };
             GPUVAddr Address() const {
-                return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high.Value()) << 32) |
-                                             address_low);
+                return (GPUVAddr{address_high.Value()} << 32) | GPUVAddr{address_low};
             }
         };
         std::array<ConstBufferConfig, NumConstBuffers> const_buffer_config;
diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp
index a3fbab1e5..08045d1cf 100644
--- a/src/video_core/engines/kepler_memory.cpp
+++ b/src/video_core/engines/kepler_memory.cpp
@@ -42,11 +42,11 @@ void KeplerMemory::CallMultiMethod(u32 method, const u32* base_start, u32 amount
                                    u32 methods_pending) {
     switch (method) {
     case KEPLERMEMORY_REG_INDEX(data):
-        upload_state.ProcessData(base_start, static_cast<size_t>(amount));
+        upload_state.ProcessData(base_start, amount);
         return;
     default:
-        for (std::size_t i = 0; i < amount; i++) {
-            CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1);
+        for (u32 i = 0; i < amount; i++) {
+            CallMethod(method, base_start[i], methods_pending - i <= 1);
         }
         break;
     }
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index 5bb1427c1..55462752c 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -126,6 +126,7 @@ void Maxwell3D::InitializeRegisterDefaults() {
     draw_command[MAXWELL3D_REG_INDEX(draw_inline_index)] = true;
     draw_command[MAXWELL3D_REG_INDEX(inline_index_2x16.even)] = true;
     draw_command[MAXWELL3D_REG_INDEX(inline_index_4x8.index0)] = true;
+    draw_command[MAXWELL3D_REG_INDEX(draw.instance_id)] = true;
 }
 
 void Maxwell3D::ProcessMacro(u32 method, const u32* base_start, u32 amount, bool is_last_call) {
@@ -249,9 +250,6 @@ void Maxwell3D::ProcessMethodCall(u32 method, u32 argument, u32 nonshadow_argume
         return;
     case MAXWELL3D_REG_INDEX(fragment_barrier):
         return rasterizer->FragmentBarrier();
-    case MAXWELL3D_REG_INDEX(invalidate_texture_data_cache):
-        rasterizer->InvalidateGPUCache();
-        return rasterizer->WaitForIdle();
     case MAXWELL3D_REG_INDEX(tiled_cache_barrier):
         return rasterizer->TiledCacheBarrier();
     }
@@ -288,31 +286,58 @@ void Maxwell3D::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
     ASSERT_MSG(method < Regs::NUM_REGS,
                "Invalid Maxwell3D register, increase the size of the Regs structure");
 
+    const u32 argument = ProcessShadowRam(method, method_argument);
+    ProcessDirtyRegisters(method, argument);
+
     if (draw_command[method]) {
         regs.reg_array[method] = method_argument;
         deferred_draw_method.push_back(method);
-        auto u32_to_u8 = [&](const u32 argument) {
-            inline_index_draw_indexes.push_back(static_cast<u8>(argument & 0x000000ff));
-            inline_index_draw_indexes.push_back(static_cast<u8>((argument & 0x0000ff00) >> 8));
-            inline_index_draw_indexes.push_back(static_cast<u8>((argument & 0x00ff0000) >> 16));
-            inline_index_draw_indexes.push_back(static_cast<u8>((argument & 0xff000000) >> 24));
+        auto update_inline_index = [&](const u32 index) {
+            inline_index_draw_indexes.push_back(static_cast<u8>(index & 0x000000ff));
+            inline_index_draw_indexes.push_back(static_cast<u8>((index & 0x0000ff00) >> 8));
+            inline_index_draw_indexes.push_back(static_cast<u8>((index & 0x00ff0000) >> 16));
+            inline_index_draw_indexes.push_back(static_cast<u8>((index & 0xff000000) >> 24));
+            draw_mode = DrawMode::InlineIndex;
         };
-        if (MAXWELL3D_REG_INDEX(draw_inline_index) == method) {
-            u32_to_u8(method_argument);
-        } else if (MAXWELL3D_REG_INDEX(inline_index_2x16.even) == method) {
-            u32_to_u8(regs.inline_index_2x16.even);
-            u32_to_u8(regs.inline_index_2x16.odd);
-        } else if (MAXWELL3D_REG_INDEX(inline_index_4x8.index0) == method) {
-            u32_to_u8(regs.inline_index_4x8.index0);
-            u32_to_u8(regs.inline_index_4x8.index1);
-            u32_to_u8(regs.inline_index_4x8.index2);
-            u32_to_u8(regs.inline_index_4x8.index3);
+        switch (method) {
+        case MAXWELL3D_REG_INDEX(draw.end):
+            switch (draw_mode) {
+            case DrawMode::General:
+                ProcessDraw(1);
+                break;
+            case DrawMode::InlineIndex:
+                regs.index_buffer.count = static_cast<u32>(inline_index_draw_indexes.size() / 4);
+                regs.index_buffer.format = Regs::IndexFormat::UnsignedInt;
+                ProcessDraw(1);
+                inline_index_draw_indexes.clear();
+                break;
+            case DrawMode::Instance:
+                break;
+            }
+            break;
+        case MAXWELL3D_REG_INDEX(draw_inline_index):
+            update_inline_index(method_argument);
+            break;
+        case MAXWELL3D_REG_INDEX(inline_index_2x16.even):
+            update_inline_index(regs.inline_index_2x16.even);
+            update_inline_index(regs.inline_index_2x16.odd);
+            break;
+        case MAXWELL3D_REG_INDEX(inline_index_4x8.index0):
+            update_inline_index(regs.inline_index_4x8.index0);
+            update_inline_index(regs.inline_index_4x8.index1);
+            update_inline_index(regs.inline_index_4x8.index2);
+            update_inline_index(regs.inline_index_4x8.index3);
+            break;
+        case MAXWELL3D_REG_INDEX(draw.instance_id):
+            draw_mode =
+                (regs.draw.instance_id == Maxwell3D::Regs::Draw::InstanceId::Subsequent) ||
+                        (regs.draw.instance_id == Maxwell3D::Regs::Draw::InstanceId::Unchanged)
+                    ? DrawMode::Instance
+                    : DrawMode::General;
+            break;
         }
     } else {
         ProcessDeferredDraw();
-
-        const u32 argument = ProcessShadowRam(method, method_argument);
-        ProcessDirtyRegisters(method, argument);
         ProcessMethodCall(method, argument, method_argument, is_last_call);
     }
 }
@@ -345,11 +370,11 @@ void Maxwell3D::CallMultiMethod(u32 method, const u32* base_start, u32 amount,
         ProcessCBMultiData(base_start, amount);
         break;
     case MAXWELL3D_REG_INDEX(inline_data):
-        upload_state.ProcessData(base_start, static_cast<size_t>(amount));
+        upload_state.ProcessData(base_start, amount);
         return;
     default:
-        for (std::size_t i = 0; i < amount; i++) {
-            CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1);
+        for (u32 i = 0; i < amount; i++) {
+            CallMethod(method, base_start[i], methods_pending - i <= 1);
         }
         break;
     }
@@ -511,10 +536,7 @@ void Maxwell3D::ProcessCounterReset() {
 
 void Maxwell3D::ProcessSyncPoint() {
     const u32 sync_point = regs.sync_info.sync_point.Value();
-    const u32 cache_flush = regs.sync_info.clean_l2.Value();
-    if (cache_flush != 0) {
-        rasterizer->InvalidateGPUCache();
-    }
+    [[maybe_unused]] const u32 cache_flush = regs.sync_info.clean_l2.Value();
     rasterizer->SignalSyncPoint(sync_point);
 }
 
@@ -626,57 +648,27 @@ void Maxwell3D::ProcessDraw(u32 instance_count) {
 }
 
 void Maxwell3D::ProcessDeferredDraw() {
-    if (deferred_draw_method.empty()) {
+    if (draw_mode != DrawMode::Instance || deferred_draw_method.empty()) {
         return;
     }
 
-    enum class DrawMode {
-        Undefined,
-        General,
-        Instance,
-    };
-    DrawMode draw_mode{DrawMode::Undefined};
-    u32 method_count = static_cast<u32>(deferred_draw_method.size());
-    u32 method = deferred_draw_method[method_count - 1];
-    if (MAXWELL3D_REG_INDEX(draw.end) != method) {
-        return;
-    }
-    draw_mode = (regs.draw.instance_id == Maxwell3D::Regs::Draw::InstanceId::Subsequent) ||
-                        (regs.draw.instance_id == Maxwell3D::Regs::Draw::InstanceId::Unchanged)
-                    ? DrawMode::Instance
-                    : DrawMode::General;
-    u32 instance_count = 0;
-    if (draw_mode == DrawMode::Instance) {
-        u32 vertex_buffer_count = 0;
-        u32 index_buffer_count = 0;
-        for (u32 index = 0; index < method_count; ++index) {
-            method = deferred_draw_method[index];
-            if (method == MAXWELL3D_REG_INDEX(vertex_buffer.count)) {
-                instance_count = ++vertex_buffer_count;
-            } else if (method == MAXWELL3D_REG_INDEX(index_buffer.count)) {
-                instance_count = ++index_buffer_count;
-            }
-        }
-        ASSERT_MSG(!(vertex_buffer_count && index_buffer_count),
-                   "Instance both indexed and direct?");
-    } else {
-        instance_count = 1;
-        for (u32 index = 0; index < method_count; ++index) {
-            method = deferred_draw_method[index];
-            if (MAXWELL3D_REG_INDEX(draw_inline_index) == method ||
-                MAXWELL3D_REG_INDEX(inline_index_2x16.even) == method ||
-                MAXWELL3D_REG_INDEX(inline_index_4x8.index0) == method) {
-                regs.index_buffer.count = static_cast<u32>(inline_index_draw_indexes.size() / 4);
-                regs.index_buffer.format = Regs::IndexFormat::UnsignedInt;
-                break;
-            }
+    const auto method_count = deferred_draw_method.size();
+    u32 instance_count = 1;
+    u32 vertex_buffer_count = 0;
+    u32 index_buffer_count = 0;
+    for (size_t index = 0; index < method_count; ++index) {
+        const u32 method = deferred_draw_method[index];
+        if (method == MAXWELL3D_REG_INDEX(vertex_buffer.count)) {
+            instance_count = ++vertex_buffer_count;
+        } else if (method == MAXWELL3D_REG_INDEX(index_buffer.count)) {
+            instance_count = ++index_buffer_count;
         }
     }
+    ASSERT_MSG(!(vertex_buffer_count && index_buffer_count), "Instance both indexed and direct?");
 
     ProcessDraw(instance_count);
 
     deferred_draw_method.clear();
-    inline_index_draw_indexes.clear();
 }
 
 } // namespace Tegra::Engines
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index c3099f9a6..deba292a5 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -96,8 +96,7 @@ public:
             u32 type;
 
             GPUVAddr Address() const {
-                return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
-                                             address_low);
+                return (GPUVAddr{address_high} << 32) | GPUVAddr{address_low};
             }
         };
 
@@ -106,8 +105,7 @@ public:
             u32 address_low;
 
             GPUVAddr Address() const {
-                return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
-                                             address_low);
+                return (GPUVAddr{address_high} << 32) | GPUVAddr{address_low};
             }
         };
 
@@ -124,8 +122,7 @@ public:
             Mode mode;
 
             GPUVAddr Address() const {
-                return static_cast<GPUVAddr>((static_cast<GPUVAddr>(offset_high) << 32) |
-                                             offset_low);
+                return (GPUVAddr{offset_high} << 32) | GPUVAddr{offset_low};
             }
         };
 
@@ -187,7 +184,7 @@ public:
                 default:
                     // Thresholds begin at 0x10 (1 << 4)
                     // Threshold is in the range 0x1 to 0x13
-                    return 1 << (4 + threshold.Value() - 1);
+                    return 1U << (4 + threshold.Value() - 1);
                 }
             }
         };
@@ -468,8 +465,7 @@ public:
                 INSERT_PADDING_BYTES_NOINIT(0xC);
 
                 GPUVAddr Address() const {
-                    return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
-                                                 address_low);
+                    return (GPUVAddr{address_high} << 32) | GPUVAddr{address_low};
                 }
             };
             static_assert(sizeof(Buffer) == 0x20);
@@ -511,12 +507,11 @@ public:
             u32 default_size_per_warp;
 
             GPUVAddr Address() const {
-                return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
-                                             address_low);
+                return (GPUVAddr{address_high} << 32) | GPUVAddr{address_low};
             }
 
             u64 Size() const {
-                return (static_cast<u64>(size_high) << 32) | size_low;
+                return (u64{size_high} << 32) | u64{size_low};
             }
         };
 
@@ -538,13 +533,11 @@ public:
             u32 storage_limit_address_low;
 
             GPUVAddr StorageAddress() const {
-                return static_cast<GPUVAddr>((static_cast<GPUVAddr>(storage_address_high) << 32) |
-                                             storage_address_low);
+                return (GPUVAddr{storage_address_high} << 32) | GPUVAddr{storage_address_low};
             }
             GPUVAddr StorageLimitAddress() const {
-                return static_cast<GPUVAddr>(
-                    (static_cast<GPUVAddr>(storage_limit_address_high) << 32) |
-                    storage_limit_address_low);
+                return (GPUVAddr{storage_limit_address_high} << 32) |
+                       GPUVAddr{storage_limit_address_low};
             }
         };
 
@@ -829,11 +822,11 @@ public:
         struct CompressionThresholdSamples {
             u32 samples;
 
-            u32 Samples() {
+            u32 Samples() const {
                 if (samples == 0) {
                     return 0;
                 }
-                return 1 << (samples - 1);
+                return 1U << (samples - 1);
             }
         };
 
@@ -1138,8 +1131,7 @@ public:
             INSERT_PADDING_BYTES_NOINIT(0x18);
 
             GPUVAddr Address() const {
-                return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
-                                             address_low);
+                return (GPUVAddr{address_high} << 32) | GPUVAddr{address_low};
             }
         };
         static_assert(sizeof(RenderTargetConfig) == 0x40);
@@ -1482,8 +1474,7 @@ public:
             u32 address_low;
 
             GPUVAddr Address() const {
-                return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
-                                             address_low);
+                return (GPUVAddr{address_high} << 32) | GPUVAddr{address_low};
             }
         };
 
@@ -1533,8 +1524,7 @@ public:
             u32 address_low;
 
             GPUVAddr Address() const {
-                return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
-                                             address_low);
+                return (GPUVAddr{address_high} << 32) | GPUVAddr{address_low};
             }
         };
 
@@ -1561,8 +1551,7 @@ public:
             u32 array_pitch;
 
             GPUVAddr Address() const {
-                return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
-                                             address_low);
+                return (GPUVAddr{address_high} << 32) | GPUVAddr{address_low};
             }
         };
 
@@ -1910,8 +1899,7 @@ public:
             Mode mode;
 
             GPUVAddr Address() const {
-                return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
-                                             address_low);
+                return (GPUVAddr{address_high} << 32) | GPUVAddr{address_low};
             }
         };
 
@@ -1921,8 +1909,7 @@ public:
             u32 limit;
 
             GPUVAddr Address() const {
-                return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
-                                             address_low);
+                return (GPUVAddr{address_high} << 32) | GPUVAddr{address_low};
             }
         };
 
@@ -1932,8 +1919,7 @@ public:
             u32 limit;
 
             GPUVAddr Address() const {
-                return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
-                                             address_low);
+                return (GPUVAddr{address_high} << 32) | GPUVAddr{address_low};
             }
         };
 
@@ -1981,8 +1967,7 @@ public:
             u32 address_low;
 
             GPUVAddr Address() const {
-                return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
-                                             address_low);
+                return (GPUVAddr{address_high} << 32) | GPUVAddr{address_low};
             }
         };
 
@@ -2027,8 +2012,7 @@ public:
             u32 address_low;
 
             GPUVAddr Address() const {
-                return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
-                                             address_low);
+                return (GPUVAddr{address_high} << 32) | GPUVAddr{address_low};
             }
         };
 
@@ -2224,19 +2208,16 @@ public:
             }
 
             GPUVAddr StartAddress() const {
-                return static_cast<GPUVAddr>((static_cast<GPUVAddr>(start_addr_high) << 32) |
-                                             start_addr_low);
+                return (GPUVAddr{start_addr_high} << 32) | GPUVAddr{start_addr_low};
             }
 
             GPUVAddr EndAddress() const {
-                return static_cast<GPUVAddr>((static_cast<GPUVAddr>(limit_addr_high) << 32) |
-                                             limit_addr_low);
+                return (GPUVAddr{limit_addr_high} << 32) | GPUVAddr{limit_addr_low};
             }
 
             /// Adjust the index buffer offset so it points to the first desired index.
             GPUVAddr IndexStart() const {
-                return StartAddress() +
-                       static_cast<size_t>(first) * static_cast<size_t>(FormatSizeInBytes());
+                return StartAddress() + size_t{first} * size_t{FormatSizeInBytes()};
             }
         };
 
@@ -2464,8 +2445,7 @@ public:
             } query;
 
             GPUVAddr Address() const {
-                return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
-                                             address_low);
+                return (GPUVAddr{address_high} << 32) | GPUVAddr{address_low};
             }
         };
 
@@ -2479,8 +2459,7 @@ public:
             u32 frequency;
 
             GPUVAddr Address() const {
-                return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
-                                             address_low);
+                return (GPUVAddr{address_high} << 32) | GPUVAddr{address_low};
             }
 
             bool IsEnabled() const {
@@ -2494,8 +2473,7 @@ public:
             u32 address_low;
 
             GPUVAddr Address() const {
-                return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
-                                             address_low);
+                return (GPUVAddr{address_high} << 32) | GPUVAddr{address_low};
             }
         };
         static_assert(sizeof(VertexStreamLimit) == 0x8);
@@ -2543,8 +2521,7 @@ public:
             std::array<u32, NumCBData> buffer;
 
             GPUVAddr Address() const {
-                return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
-                                             address_low);
+                return (GPUVAddr{address_high} << 32) | GPUVAddr{address_low};
             }
         };
 
@@ -3148,10 +3125,12 @@ private:
     /// Handles use of topology overrides (e.g., to avoid using a topology assigned from a macro)
     void ProcessTopologyOverride();
 
-    void ProcessDraw(u32 instance_count = 1);
-
+    /// Handles a deferred draw (e.g., an instanced draw).
     void ProcessDeferredDraw();
 
+    /// Handles a draw.
+    void ProcessDraw(u32 instance_count = 1);
+
     /// Returns a query's value or an empty object if the value will be deferred through a cache.
     std::optional<u64> GetQueryResult();
 
@@ -3178,6 +3157,8 @@ private:
 
     std::array<bool, Regs::NUM_REGS> draw_command{};
     std::vector<u32> deferred_draw_method;
+    enum class DrawMode : u32 { General = 0, Instance, InlineIndex };
+    DrawMode draw_mode{DrawMode::General};
 };
 
 #define ASSERT_REG_POSITION(field_name, position)                                                  \
diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp
index 1bf6ca2dd..a189e60ae 100644
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -41,8 +41,8 @@ void MaxwellDMA::CallMethod(u32 method, u32 method_argument, bool is_last_call)
 
 void MaxwellDMA::CallMultiMethod(u32 method, const u32* base_start, u32 amount,
                                  u32 methods_pending) {
-    for (size_t i = 0; i < amount; ++i) {
-        CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1);
+    for (u32 i = 0; i < amount; ++i) {
+        CallMethod(method, base_start[i], methods_pending - i <= 1);
     }
 }
 
@@ -62,7 +62,8 @@ void MaxwellDMA::Launch() {
 
         if (!is_src_pitch && !is_dst_pitch) {
             // If both the source and the destination are in block layout, assert.
-            UNIMPLEMENTED_MSG("Tiled->Tiled DMA transfers are not yet implemented");
+            CopyBlockLinearToBlockLinear();
+            ReleaseSemaphore();
             return;
         }
 
@@ -93,14 +94,14 @@ void MaxwellDMA::Launch() {
                                             reinterpret_cast<u8*>(tmp_buffer.data()),
                                             regs.line_length_in * sizeof(u32));
         } else {
-            auto convert_linear_2_blocklinear_addr = [](u64 address) {
+            const auto convert_linear_2_blocklinear_addr = [](u64 address) {
                 return (address & ~0x1f0ULL) | ((address & 0x40) >> 2) | ((address & 0x10) << 1) |
                        ((address & 0x180) >> 1) | ((address & 0x20) << 3);
             };
-            auto src_kind = memory_manager.GetPageKind(regs.offset_in);
-            auto dst_kind = memory_manager.GetPageKind(regs.offset_out);
-            const bool is_src_pitch = IsPitchKind(static_cast<PTEKind>(src_kind));
-            const bool is_dst_pitch = IsPitchKind(static_cast<PTEKind>(dst_kind));
+            const auto src_kind = memory_manager.GetPageKind(regs.offset_in);
+            const auto dst_kind = memory_manager.GetPageKind(regs.offset_out);
+            const bool is_src_pitch = IsPitchKind(src_kind);
+            const bool is_dst_pitch = IsPitchKind(dst_kind);
             if (!is_src_pitch && is_dst_pitch) {
                 UNIMPLEMENTED_IF(regs.line_length_in % 16 != 0);
                 UNIMPLEMENTED_IF(regs.offset_in % 16 != 0);
@@ -291,6 +292,70 @@ void MaxwellDMA::FastCopyBlockLinearToPitch() {
     memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
 }
 
+void MaxwellDMA::CopyBlockLinearToBlockLinear() {
+    UNIMPLEMENTED_IF(regs.src_params.block_size.width != 0);
+
+    const bool is_remapping = regs.launch_dma.remap_enable != 0;
+
+    // Deswizzle the input and copy it over.
+    const Parameters& src = regs.src_params;
+    const Parameters& dst = regs.dst_params;
+
+    const u32 num_remap_components = regs.remap_const.num_dst_components_minus_one + 1;
+    const u32 remap_components_size = regs.remap_const.component_size_minus_one + 1;
+
+    const u32 base_bpp = !is_remapping ? 1U : num_remap_components * remap_components_size;
+
+    u32 src_width = src.width;
+    u32 dst_width = dst.width;
+    u32 x_elements = regs.line_length_in;
+    u32 src_x_offset = src.origin.x;
+    u32 dst_x_offset = dst.origin.x;
+    u32 bpp_shift = 0U;
+    if (!is_remapping) {
+        bpp_shift = Common::FoldRight(
+            4U, [](u32 x, u32 y) { return std::min(x, static_cast<u32>(std::countr_zero(y))); },
+            src_width, dst_width, x_elements, src_x_offset, dst_x_offset,
+            static_cast<u32>(regs.offset_in), static_cast<u32>(regs.offset_out));
+        src_width >>= bpp_shift;
+        dst_width >>= bpp_shift;
+        x_elements >>= bpp_shift;
+        src_x_offset >>= bpp_shift;
+        dst_x_offset >>= bpp_shift;
+    }
+
+    const u32 bytes_per_pixel = base_bpp << bpp_shift;
+    const size_t src_size = CalculateSize(true, bytes_per_pixel, src_width, src.height, src.depth,
+                                          src.block_size.height, src.block_size.depth);
+    const size_t dst_size = CalculateSize(true, bytes_per_pixel, dst_width, dst.height, dst.depth,
+                                          dst.block_size.height, dst.block_size.depth);
+
+    const u32 pitch = x_elements * bytes_per_pixel;
+    const size_t mid_buffer_size = pitch * regs.line_count;
+
+    if (read_buffer.size() < src_size) {
+        read_buffer.resize(src_size);
+    }
+    if (write_buffer.size() < dst_size) {
+        write_buffer.resize(dst_size);
+    }
+
+    intermediate_buffer.resize(mid_buffer_size);
+
+    memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size);
+    memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size);
+
+    UnswizzleSubrect(intermediate_buffer, read_buffer, bytes_per_pixel, src_width, src.height,
+                     src.depth, src_x_offset, src.origin.y, x_elements, regs.line_count,
+                     src.block_size.height, src.block_size.depth, pitch);
+
+    SwizzleSubrect(write_buffer, intermediate_buffer, bytes_per_pixel, dst_width, dst.height,
+                   dst.depth, dst_x_offset, dst.origin.y, x_elements, regs.line_count,
+                   dst.block_size.height, dst.block_size.depth, pitch);
+
+    memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
+}
+
 void MaxwellDMA::ReleaseSemaphore() {
     const auto type = regs.launch_dma.semaphore_type;
     const GPUVAddr address = regs.semaphore.address;
diff --git a/src/video_core/engines/maxwell_dma.h b/src/video_core/engines/maxwell_dma.h
index 953e34adc..d40d3d302 100644
--- a/src/video_core/engines/maxwell_dma.h
+++ b/src/video_core/engines/maxwell_dma.h
@@ -223,6 +223,8 @@ private:
 
     void CopyPitchToBlockLinear();
 
+    void CopyBlockLinearToBlockLinear();
+
     void FastCopyBlockLinearToPitch();
 
     void ReleaseSemaphore();
@@ -234,6 +236,7 @@ private:
 
     std::vector<u8> read_buffer;
     std::vector<u8> write_buffer;
+    std::vector<u8> intermediate_buffer;
 
     static constexpr std::size_t NUM_REGS = 0x800;
     struct Regs {
diff --git a/src/video_core/engines/puller.cpp b/src/video_core/engines/puller.cpp
index 4d2278811..7718a09b3 100644
--- a/src/video_core/engines/puller.cpp
+++ b/src/video_core/engines/puller.cpp
@@ -31,7 +31,7 @@ void Puller::ProcessBindMethod(const MethodCall& method_call) {
     LOG_DEBUG(HW_GPU, "Binding subchannel {} to engine {}", method_call.subchannel,
               method_call.argument);
     const auto engine_id = static_cast<EngineID>(method_call.argument);
-    bound_engines[method_call.subchannel] = static_cast<EngineID>(engine_id);
+    bound_engines[method_call.subchannel] = engine_id;
     switch (engine_id) {
     case EngineID::FERMI_TWOD_A:
         dma_pusher.BindSubchannel(channel_state.fermi_2d.get(), method_call.subchannel);
@@ -118,7 +118,7 @@ void Puller::ProcessSemaphoreRelease() {
     std::function<void()> operation([this, sequence_address, payload] {
         memory_manager.Write<u32>(sequence_address, payload);
     });
-    rasterizer->SyncOperation(std::move(operation));
+    rasterizer->SignalFence(std::move(operation));
 }
 
 void Puller::ProcessSemaphoreAcquire() {
@@ -151,8 +151,8 @@ void Puller::CallPullerMethod(const MethodCall& method_call) {
     case BufferMethods::SemaphoreAddressLow:
     case BufferMethods::SemaphoreSequencePayload:
     case BufferMethods::SyncpointPayload:
-        break;
     case BufferMethods::WrcacheFlush:
+        break;
     case BufferMethods::RefCnt:
         rasterizer->SignalReference();
         break;
@@ -285,12 +285,12 @@ void Puller::CallMultiMethod(u32 method, u32 subchannel, const u32* base_start,
     if (ExecuteMethodOnEngine(method)) {
         CallEngineMultiMethod(method, subchannel, base_start, amount, methods_pending);
     } else {
-        for (std::size_t i = 0; i < amount; i++) {
+        for (u32 i = 0; i < amount; i++) {
             CallPullerMethod(MethodCall{
                 method,
                 base_start[i],
                 subchannel,
-                methods_pending - static_cast<u32>(i),
+                methods_pending - i,
             });
         }
     }
diff --git a/src/video_core/engines/sw_blitter/blitter.cpp b/src/video_core/engines/sw_blitter/blitter.cpp
new file mode 100644
index 000000000..2f1ea4626
--- /dev/null
+++ b/src/video_core/engines/sw_blitter/blitter.cpp
@@ -0,0 +1,238 @@
+// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include <algorithm>
+#include <cmath>
+#include <vector>
+
+#include "video_core/engines/sw_blitter/blitter.h"
+#include "video_core/engines/sw_blitter/converter.h"
+#include "video_core/memory_manager.h"
+#include "video_core/surface.h"
+#include "video_core/textures/decoders.h"
+
+namespace Tegra {
+class MemoryManager;
+}
+
+using VideoCore::Surface::BytesPerBlock;
+using VideoCore::Surface::PixelFormatFromRenderTargetFormat;
+
+namespace Tegra::Engines::Blitter {
+
+using namespace Texture;
+
+namespace {
+
+constexpr size_t ir_components = 4;
+
+void NearestNeighbor(std::span<const u8> input, std::span<u8> output, u32 src_width, u32 src_height,
+                     u32 dst_width, u32 dst_height, size_t bpp) {
+    const size_t dx_du = std::llround((static_cast<f64>(src_width) / dst_width) * (1ULL << 32));
+    const size_t dy_dv = std::llround((static_cast<f64>(src_height) / dst_height) * (1ULL << 32));
+    size_t src_y = 0;
+    for (u32 y = 0; y < dst_height; y++) {
+        size_t src_x = 0;
+        for (u32 x = 0; x < dst_width; x++) {
+            // src_x/src_y are 32.32 fixed point; only their integer parts select the source texel.
+            const size_t read_from = ((src_y >> 32) * src_width + (src_x >> 32)) * bpp;
+            const size_t write_to = (y * dst_width + x) * bpp;
+            std::memcpy(&output[write_to], &input[read_from], bpp);
+            src_x += dx_du;
+        }
+        src_y += dy_dv;
+    }
+}
+
+void NearestNeighborFast(std::span<const f32> input, std::span<f32> output, u32 src_width,
+                         u32 src_height, u32 dst_width, u32 dst_height) {
+    const size_t dx_du = std::llround((static_cast<f64>(src_width) / dst_width) * (1ULL << 32));
+    const size_t dy_dv = std::llround((static_cast<f64>(src_height) / dst_height) * (1ULL << 32));
+    size_t src_y = 0;
+    for (u32 y = 0; y < dst_height; y++) {
+        size_t src_x = 0;
+        for (u32 x = 0; x < dst_width; x++) {
+            // src_x/src_y are 32.32 fixed point; only their integer parts select the source texel.
+            const size_t read_from = ((src_y >> 32) * src_width + (src_x >> 32)) * ir_components;
+            const size_t write_to = (y * dst_width + x) * ir_components;
+            std::memcpy(&output[write_to], &input[read_from], sizeof(f32) * ir_components);
+            src_x += dx_du;
+        }
+        src_y += dy_dv;
+    }
+}
+
+void Bilinear(std::span<const f32> input, std::span<f32> output, size_t src_width,
+              size_t src_height, size_t dst_width, size_t dst_height) {
+    const auto bilinear_sample = [](std::span<const f32> x0_y0, std::span<const f32> x1_y0,
+                                    std::span<const f32> x0_y1, std::span<const f32> x1_y1,
+                                    f32 weight_x, f32 weight_y) {
+        std::array<f32, ir_components> result{};
+        for (size_t i = 0; i < ir_components; i++) {
+            const f32 a = std::lerp(x0_y0[i], x1_y0[i], weight_x);
+            const f32 b = std::lerp(x0_y1[i], x1_y1[i], weight_x);
+            result[i] = std::lerp(a, b, weight_y);
+        }
+        return result;
+    };
+    const f32 dx_du =
+        dst_width > 1 ? static_cast<f32>(src_width - 1) / static_cast<f32>(dst_width - 1) : 0.f;
+    const f32 dy_dv =
+        dst_height > 1 ? static_cast<f32>(src_height - 1) / static_cast<f32>(dst_height - 1) : 0.f;
+    for (u32 y = 0; y < dst_height; y++) {
+        for (u32 x = 0; x < dst_width; x++) {
+            const f32 x_low = std::floor(static_cast<f32>(x) * dx_du);
+            const f32 y_low = std::floor(static_cast<f32>(y) * dy_dv);
+            const f32 x_high = std::ceil(static_cast<f32>(x) * dx_du);
+            const f32 y_high = std::ceil(static_cast<f32>(y) * dy_dv);
+            const f32 weight_x = (static_cast<f32>(x) * dx_du) - x_low;
+            const f32 weight_y = (static_cast<f32>(y) * dy_dv) - y_low;
+            // Fetches the source texel at (in_x, in_y), clamped to the last column/row.
+            const auto read_src = [&](f32 in_x, f32 in_y) {
+                const size_t px = std::min(static_cast<size_t>(in_x), src_width - 1);
+                const size_t py = std::min(static_cast<size_t>(in_y), src_height - 1);
+                const size_t read_from = (py * src_width + px) * ir_components;
+                return std::span<const f32>(&input[read_from], ir_components);
+            };
+
+            auto x0_y0 = read_src(x_low, y_low);
+            auto x1_y0 = read_src(x_high, y_low);
+            auto x0_y1 = read_src(x_low, y_high);
+            auto x1_y1 = read_src(x_high, y_high);
+
+            const auto result = bilinear_sample(x0_y0, x1_y0, x0_y1, x1_y1, weight_x, weight_y);
+
+            const size_t write_to = (y * dst_width + x) * ir_components;
+
+            std::memcpy(&output[write_to], &result, sizeof(f32) * ir_components);
+        }
+    }
+}
+
+} // namespace
+
+struct SoftwareBlitEngine::BlitEngineImpl {
+    std::vector<u8> tmp_buffer;
+    std::vector<u8> src_buffer;
+    std::vector<u8> dst_buffer;
+    std::vector<f32> intermediate_src;
+    std::vector<f32> intermediate_dst;
+    ConverterFactory converter_factory;
+};
+
+SoftwareBlitEngine::SoftwareBlitEngine(MemoryManager& memory_manager_)
+    : memory_manager{memory_manager_} {
+    impl = std::make_unique<BlitEngineImpl>();
+}
+
+SoftwareBlitEngine::~SoftwareBlitEngine() = default;
+
+bool SoftwareBlitEngine::Blit(Fermi2D::Surface& src, Fermi2D::Surface& dst,
+                              Fermi2D::Config& config) {
+    const auto get_surface_size = [](Fermi2D::Surface& surface, u32 bytes_per_pixel) {
+        if (surface.linear == Fermi2D::MemoryLayout::BlockLinear) {
+            return CalculateSize(true, bytes_per_pixel, surface.width, surface.height,
+                                 surface.depth, surface.block_height, surface.block_depth);
+        }
+        return static_cast<size_t>(surface.pitch * surface.height);
+    };
+    const auto process_pitch_linear = [](bool unpack, std::span<const u8> input,
+                                         std::span<u8> output, u32 extent_x, u32 extent_y,
+                                         u32 pitch, u32 x0, u32 y0, size_t bpp) {
+        const size_t base_offset = x0 * bpp;
+        const size_t copy_size = extent_x * bpp;
+        for (u32 y = 0; y < extent_y; y++) {
+            const size_t first_offset = (y + y0) * pitch + base_offset; // row in pitched surface
+            const size_t second_offset = y * extent_x * bpp; // row in tightly packed buffer
+            u8* write_to = unpack ? &output[first_offset] : &output[second_offset];
+            const u8* read_from = unpack ? &input[second_offset] : &input[first_offset];
+            std::memcpy(write_to, read_from, copy_size);
+        }
+    };
+
+    const u32 src_extent_x = config.src_x1 - config.src_x0;
+    const u32 src_extent_y = config.src_y1 - config.src_y0;
+
+    const u32 dst_extent_x = config.dst_x1 - config.dst_x0;
+    const u32 dst_extent_y = config.dst_y1 - config.dst_y0;
+    const auto src_bytes_per_pixel = BytesPerBlock(PixelFormatFromRenderTargetFormat(src.format));
+    const auto dst_bytes_per_pixel = BytesPerBlock(PixelFormatFromRenderTargetFormat(dst.format));
+    const size_t src_size = get_surface_size(src, src_bytes_per_pixel);
+    impl->tmp_buffer.resize(src_size);
+    memory_manager.ReadBlock(src.Address(), impl->tmp_buffer.data(), src_size);
+
+    const size_t src_copy_size = src_extent_x * src_extent_y * src_bytes_per_pixel;
+
+    const size_t dst_copy_size = dst_extent_x * dst_extent_y * dst_bytes_per_pixel;
+
+    impl->src_buffer.resize(src_copy_size);
+
+    const bool no_passthrough =
+        src.format != dst.format || src_extent_x != dst_extent_x || src_extent_y != dst_extent_y;
+
+    const auto conversion_phase_same_format = [&]() {
+        NearestNeighbor(impl->src_buffer, impl->dst_buffer, src_extent_x, src_extent_y,
+                        dst_extent_x, dst_extent_y, dst_bytes_per_pixel);
+    };
+
+    const auto conversion_phase_ir = [&]() {
+        auto* input_converter = impl->converter_factory.GetFormatConverter(src.format);
+        impl->intermediate_src.resize((src_copy_size / src_bytes_per_pixel) * ir_components);
+        impl->intermediate_dst.resize((dst_copy_size / dst_bytes_per_pixel) * ir_components);
+        input_converter->ConvertTo(impl->src_buffer, impl->intermediate_src);
+
+        if (config.filter != Fermi2D::Filter::Bilinear) {
+            NearestNeighborFast(impl->intermediate_src, impl->intermediate_dst, src_extent_x,
+                                src_extent_y, dst_extent_x, dst_extent_y);
+        } else {
+            Bilinear(impl->intermediate_src, impl->intermediate_dst, src_extent_x, src_extent_y,
+                     dst_extent_x, dst_extent_y);
+        }
+
+        auto* output_converter = impl->converter_factory.GetFormatConverter(dst.format);
+        output_converter->ConvertFrom(impl->intermediate_dst, impl->dst_buffer);
+    };
+
+    // Do the actual blit: extract the source rectangle into a tightly packed buffer
+
+    impl->dst_buffer.resize(dst_copy_size);
+    if (src.linear == Fermi2D::MemoryLayout::BlockLinear) {
+        UnswizzleSubrect(impl->src_buffer, impl->tmp_buffer, src_bytes_per_pixel, src.width,
+                         src.height, src.depth, config.src_x0, config.src_y0, src_extent_x,
+                         src_extent_y, src.block_height, src.block_depth,
+                         src_extent_x * src_bytes_per_pixel);
+    } else {
+        process_pitch_linear(false, impl->tmp_buffer, impl->src_buffer, src_extent_x, src_extent_y,
+                             src.pitch, config.src_x0, config.src_y0, src_bytes_per_pixel);
+    }
+
+    // Conversion Phase
+    if (no_passthrough) {
+        if (src.format != dst.format || config.filter == Fermi2D::Filter::Bilinear) {
+            conversion_phase_ir();
+        } else {
+            conversion_phase_same_format();
+        }
+    } else {
+        impl->dst_buffer.swap(impl->src_buffer);
+    }
+
+    const size_t dst_size = get_surface_size(dst, dst_bytes_per_pixel);
+    impl->tmp_buffer.resize(dst_size);
+    memory_manager.ReadBlock(dst.Address(), impl->tmp_buffer.data(), dst_size);
+
+    if (dst.linear == Fermi2D::MemoryLayout::BlockLinear) {
+        SwizzleSubrect(impl->tmp_buffer, impl->dst_buffer, dst_bytes_per_pixel, dst.width,
+                       dst.height, dst.depth, config.dst_x0, config.dst_y0, dst_extent_x,
+                       dst_extent_y, dst.block_height, dst.block_depth,
+                       dst_extent_x * dst_bytes_per_pixel);
+    } else {
+        process_pitch_linear(true, impl->dst_buffer, impl->tmp_buffer, dst_extent_x, dst_extent_y,
+                             dst.pitch, config.dst_x0, config.dst_y0,
+                             static_cast<size_t>(dst_bytes_per_pixel));
+    }
+    memory_manager.WriteBlock(dst.Address(), impl->tmp_buffer.data(), dst_size);
+    return true;
+}
+
+} // namespace Tegra::Engines::Blitter
diff --git a/src/video_core/engines/sw_blitter/blitter.h b/src/video_core/engines/sw_blitter/blitter.h
new file mode 100644
index 000000000..85b55c836
--- /dev/null
+++ b/src/video_core/engines/sw_blitter/blitter.h
@@ -0,0 +1,27 @@
+// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#pragma once
+
+#include "video_core/engines/fermi_2d.h"
+
+namespace Tegra {
+class MemoryManager;
+}
+
+namespace Tegra::Engines::Blitter {
+
+class SoftwareBlitEngine {
+public:
+    explicit SoftwareBlitEngine(MemoryManager& memory_manager_);
+    ~SoftwareBlitEngine();
+
+    bool Blit(Fermi2D::Surface& src, Fermi2D::Surface& dst, Fermi2D::Config& copy_config);
+
+private:
+    MemoryManager& memory_manager;
+    struct BlitEngineImpl;
+    std::unique_ptr<BlitEngineImpl> impl;
+};
+
+} // namespace Tegra::Engines::Blitter
diff --git a/src/video_core/engines/sw_blitter/converter.cpp b/src/video_core/engines/sw_blitter/converter.cpp
new file mode 100644
index 000000000..cd46dfd4f
--- /dev/null
+++ b/src/video_core/engines/sw_blitter/converter.cpp
@@ -0,0 +1,1234 @@
+// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include <array>
+#include <bit>
+#include <cmath>
+#include <span>
+#include <unordered_map>
+
+#include "common/assert.h"
+#include "video_core/engines/sw_blitter/converter.h"
+#include "video_core/surface.h"
+#include "video_core/textures/decoders.h"
+
+#ifdef _MSC_VER
+#define FORCE_INLINE __forceinline
+#else
+#define FORCE_INLINE inline __attribute__((always_inline))
+#endif
+
+namespace Tegra::Engines::Blitter {
+
+enum class Swizzle : size_t {
+    R = 0,
+    G = 1,
+    B = 2,
+    A = 3,
+    None,
+};
+
+enum class ComponentType : u32 {
+    SNORM = 1,
+    UNORM = 2,
+    SINT = 3,
+    UINT = 4,
+    SNORM_FORCE_FP16 = 5,
+    UNORM_FORCE_FP16 = 6,
+    FLOAT = 7,
+    SRGB = 8,
+};
+
+namespace {
+
+/*
+ * Note: The structs and searches for render target formats are produced by
+ * generate_converters.py. To add new formats, modify that script, run "python
+ * generate_converters.py", and copy the code from its output into this file.
+ */
+
+constexpr std::array<f32, 256> SRGB_TO_RGB_LUT = {
+    0.000000e+00f, 3.035270e-04f, 6.070540e-04f, 9.105810e-04f, 1.214108e-03f, 1.517635e-03f,
+    1.821162e-03f, 2.124689e-03f, 2.428216e-03f, 2.731743e-03f, 3.035270e-03f, 3.346536e-03f,
+    3.676507e-03f, 4.024717e-03f, 4.391442e-03f, 4.776953e-03f, 5.181517e-03f, 5.605392e-03f,
+    6.048833e-03f, 6.512091e-03f, 6.995410e-03f, 7.499032e-03f, 8.023193e-03f, 8.568126e-03f,
+    9.134059e-03f, 9.721218e-03f, 1.032982e-02f, 1.096009e-02f, 1.161224e-02f, 1.228649e-02f,
+    1.298303e-02f, 1.370208e-02f, 1.444384e-02f, 1.520851e-02f, 1.599629e-02f, 1.680738e-02f,
+    1.764195e-02f, 1.850022e-02f, 1.938236e-02f, 2.028856e-02f, 2.121901e-02f, 2.217389e-02f,
+    2.315337e-02f, 2.415763e-02f, 2.518686e-02f, 2.624122e-02f, 2.732089e-02f, 2.842604e-02f,
+    2.955684e-02f, 3.071344e-02f, 3.189603e-02f, 3.310477e-02f, 3.433981e-02f, 3.560131e-02f,
+    3.688945e-02f, 3.820437e-02f, 3.954624e-02f, 4.091520e-02f, 4.231141e-02f, 4.373503e-02f,
+    4.518620e-02f, 4.666509e-02f, 4.817183e-02f, 4.970657e-02f, 5.126946e-02f, 5.286065e-02f,
+    5.448028e-02f, 5.612849e-02f, 5.780543e-02f, 5.951124e-02f, 6.124605e-02f, 6.301001e-02f,
+    6.480327e-02f, 6.662594e-02f, 6.847817e-02f, 7.036009e-02f, 7.227185e-02f, 7.421357e-02f,
+    7.618538e-02f, 7.818742e-02f, 8.021982e-02f, 8.228271e-02f, 8.437621e-02f, 8.650046e-02f,
+    8.865558e-02f, 9.084171e-02f, 9.305897e-02f, 9.530747e-02f, 9.758735e-02f, 9.989873e-02f,
+    1.022417e-01f, 1.046165e-01f, 1.070231e-01f, 1.094617e-01f, 1.119324e-01f, 1.144354e-01f,
+    1.169707e-01f, 1.195384e-01f, 1.221388e-01f, 1.247718e-01f, 1.274377e-01f, 1.301365e-01f,
+    1.328683e-01f, 1.356333e-01f, 1.384316e-01f, 1.412633e-01f, 1.441285e-01f, 1.470273e-01f,
+    1.499598e-01f, 1.529261e-01f, 1.559265e-01f, 1.589608e-01f, 1.620294e-01f, 1.651322e-01f,
+    1.682694e-01f, 1.714411e-01f, 1.746474e-01f, 1.778884e-01f, 1.811642e-01f, 1.844750e-01f,
+    1.878208e-01f, 1.912017e-01f, 1.946178e-01f, 1.980693e-01f, 2.015563e-01f, 2.050787e-01f,
+    2.086369e-01f, 2.122308e-01f, 2.158605e-01f, 2.195262e-01f, 2.232280e-01f, 2.269659e-01f,
+    2.307401e-01f, 2.345506e-01f, 2.383976e-01f, 2.422811e-01f, 2.462013e-01f, 2.501583e-01f,
+    2.541521e-01f, 2.581829e-01f, 2.622507e-01f, 2.663556e-01f, 2.704978e-01f, 2.746773e-01f,
+    2.788943e-01f, 2.831487e-01f, 2.874408e-01f, 2.917706e-01f, 2.961383e-01f, 3.005438e-01f,
+    3.049873e-01f, 3.094689e-01f, 3.139887e-01f, 3.185468e-01f, 3.231432e-01f, 3.277781e-01f,
+    3.324515e-01f, 3.371636e-01f, 3.419144e-01f, 3.467041e-01f, 3.515326e-01f, 3.564001e-01f,
+    3.613068e-01f, 3.662526e-01f, 3.712377e-01f, 3.762621e-01f, 3.813260e-01f, 3.864294e-01f,
+    3.915725e-01f, 3.967552e-01f, 4.019778e-01f, 4.072402e-01f, 4.125426e-01f, 4.178851e-01f,
+    4.232677e-01f, 4.286905e-01f, 4.341536e-01f, 4.396572e-01f, 4.452012e-01f, 4.507858e-01f,
+    4.564110e-01f, 4.620770e-01f, 4.677838e-01f, 4.735315e-01f, 4.793202e-01f, 4.851499e-01f,
+    4.910209e-01f, 4.969330e-01f, 5.028865e-01f, 5.088813e-01f, 5.149177e-01f, 5.209956e-01f,
+    5.271151e-01f, 5.332764e-01f, 5.394795e-01f, 5.457245e-01f, 5.520114e-01f, 5.583404e-01f,
+    5.647115e-01f, 5.711249e-01f, 5.775805e-01f, 5.840784e-01f, 5.906188e-01f, 5.972018e-01f,
+    6.038274e-01f, 6.104956e-01f, 6.172066e-01f, 6.239604e-01f, 6.307572e-01f, 6.375968e-01f,
+    6.444797e-01f, 6.514056e-01f, 6.583748e-01f, 6.653873e-01f, 6.724432e-01f, 6.795425e-01f,
+    6.866853e-01f, 6.938717e-01f, 7.011019e-01f, 7.083758e-01f, 7.156935e-01f, 7.230551e-01f,
+    7.304608e-01f, 7.379104e-01f, 7.454042e-01f, 7.529422e-01f, 7.605245e-01f, 7.681512e-01f,
+    7.758222e-01f, 7.835378e-01f, 7.912979e-01f, 7.991027e-01f, 8.069522e-01f, 8.148466e-01f,
+    8.227857e-01f, 8.307699e-01f, 8.387990e-01f, 8.468732e-01f, 8.549926e-01f, 8.631572e-01f,
+    8.713671e-01f, 8.796224e-01f, 8.879231e-01f, 8.962694e-01f, 9.046612e-01f, 9.130986e-01f,
+    9.215819e-01f, 9.301109e-01f, 9.386857e-01f, 9.473065e-01f, 9.559733e-01f, 9.646863e-01f,
+    9.734453e-01f, 9.822506e-01f, 9.911021e-01f, 1.000000e+00f};
+
+constexpr std::array<f32, 256> RGB_TO_SRGB_LUT = {
+    0.000000e+00f, 4.984009e-02f, 8.494473e-02f, 1.107021e-01f, 1.318038e-01f, 1.500052e-01f,
+    1.661857e-01f, 1.808585e-01f, 1.943532e-01f, 2.068957e-01f, 2.186491e-01f, 2.297351e-01f,
+    2.402475e-01f, 2.502604e-01f, 2.598334e-01f, 2.690152e-01f, 2.778465e-01f, 2.863614e-01f,
+    2.945889e-01f, 3.025538e-01f, 3.102778e-01f, 3.177796e-01f, 3.250757e-01f, 3.321809e-01f,
+    3.391081e-01f, 3.458689e-01f, 3.524737e-01f, 3.589320e-01f, 3.652521e-01f, 3.714419e-01f,
+    3.775084e-01f, 3.834581e-01f, 3.892968e-01f, 3.950301e-01f, 4.006628e-01f, 4.061998e-01f,
+    4.116451e-01f, 4.170030e-01f, 4.222770e-01f, 4.274707e-01f, 4.325873e-01f, 4.376298e-01f,
+    4.426010e-01f, 4.475037e-01f, 4.523403e-01f, 4.571131e-01f, 4.618246e-01f, 4.664766e-01f,
+    4.710712e-01f, 4.756104e-01f, 4.800958e-01f, 4.845292e-01f, 4.889122e-01f, 4.932462e-01f,
+    4.975329e-01f, 5.017734e-01f, 5.059693e-01f, 5.101216e-01f, 5.142317e-01f, 5.183006e-01f,
+    5.223295e-01f, 5.263194e-01f, 5.302714e-01f, 5.341862e-01f, 5.380651e-01f, 5.419087e-01f,
+    5.457181e-01f, 5.494938e-01f, 5.532369e-01f, 5.569480e-01f, 5.606278e-01f, 5.642771e-01f,
+    5.678965e-01f, 5.714868e-01f, 5.750484e-01f, 5.785821e-01f, 5.820884e-01f, 5.855680e-01f,
+    5.890211e-01f, 5.924487e-01f, 5.958509e-01f, 5.992285e-01f, 6.025819e-01f, 6.059114e-01f,
+    6.092176e-01f, 6.125010e-01f, 6.157619e-01f, 6.190008e-01f, 6.222180e-01f, 6.254140e-01f,
+    6.285890e-01f, 6.317436e-01f, 6.348780e-01f, 6.379926e-01f, 6.410878e-01f, 6.441637e-01f,
+    6.472208e-01f, 6.502595e-01f, 6.532799e-01f, 6.562824e-01f, 6.592672e-01f, 6.622347e-01f,
+    6.651851e-01f, 6.681187e-01f, 6.710356e-01f, 6.739363e-01f, 6.768209e-01f, 6.796897e-01f,
+    6.825429e-01f, 6.853807e-01f, 6.882034e-01f, 6.910111e-01f, 6.938041e-01f, 6.965826e-01f,
+    6.993468e-01f, 7.020969e-01f, 7.048331e-01f, 7.075556e-01f, 7.102645e-01f, 7.129600e-01f,
+    7.156424e-01f, 7.183118e-01f, 7.209683e-01f, 7.236121e-01f, 7.262435e-01f, 7.288625e-01f,
+    7.314693e-01f, 7.340640e-01f, 7.366470e-01f, 7.392181e-01f, 7.417776e-01f, 7.443256e-01f,
+    7.468624e-01f, 7.493880e-01f, 7.519025e-01f, 7.544061e-01f, 7.568989e-01f, 7.593810e-01f,
+    7.618526e-01f, 7.643137e-01f, 7.667645e-01f, 7.692052e-01f, 7.716358e-01f, 7.740564e-01f,
+    7.764671e-01f, 7.788681e-01f, 7.812595e-01f, 7.836413e-01f, 7.860138e-01f, 7.883768e-01f,
+    7.907307e-01f, 7.930754e-01f, 7.954110e-01f, 7.977377e-01f, 8.000556e-01f, 8.023647e-01f,
+    8.046651e-01f, 8.069569e-01f, 8.092403e-01f, 8.115152e-01f, 8.137818e-01f, 8.160402e-01f,
+    8.182903e-01f, 8.205324e-01f, 8.227665e-01f, 8.249926e-01f, 8.272109e-01f, 8.294214e-01f,
+    8.316242e-01f, 8.338194e-01f, 8.360070e-01f, 8.381871e-01f, 8.403597e-01f, 8.425251e-01f,
+    8.446831e-01f, 8.468339e-01f, 8.489776e-01f, 8.511142e-01f, 8.532437e-01f, 8.553662e-01f,
+    8.574819e-01f, 8.595907e-01f, 8.616927e-01f, 8.637881e-01f, 8.658767e-01f, 8.679587e-01f,
+    8.700342e-01f, 8.721032e-01f, 8.741657e-01f, 8.762218e-01f, 8.782716e-01f, 8.803151e-01f,
+    8.823524e-01f, 8.843835e-01f, 8.864085e-01f, 8.884274e-01f, 8.904402e-01f, 8.924471e-01f,
+    8.944480e-01f, 8.964431e-01f, 8.984324e-01f, 9.004158e-01f, 9.023935e-01f, 9.043654e-01f,
+    9.063318e-01f, 9.082925e-01f, 9.102476e-01f, 9.121972e-01f, 9.141413e-01f, 9.160800e-01f,
+    9.180133e-01f, 9.199412e-01f, 9.218637e-01f, 9.237810e-01f, 9.256931e-01f, 9.276000e-01f,
+    9.295017e-01f, 9.313982e-01f, 9.332896e-01f, 9.351761e-01f, 9.370575e-01f, 9.389339e-01f,
+    9.408054e-01f, 9.426719e-01f, 9.445336e-01f, 9.463905e-01f, 9.482424e-01f, 9.500897e-01f,
+    9.519322e-01f, 9.537700e-01f, 9.556032e-01f, 9.574316e-01f, 9.592555e-01f, 9.610748e-01f,
+    9.628896e-01f, 9.646998e-01f, 9.665055e-01f, 9.683068e-01f, 9.701037e-01f, 9.718961e-01f,
+    9.736842e-01f, 9.754679e-01f, 9.772474e-01f, 9.790225e-01f, 9.807934e-01f, 9.825601e-01f,
+    9.843225e-01f, 9.860808e-01f, 9.878350e-01f, 9.895850e-01f, 9.913309e-01f, 9.930727e-01f,
+    9.948106e-01f, 9.965444e-01f, 9.982741e-01f, 1.000000e+00f};
+
+} // namespace
+
+struct R32G32B32A32_FLOATTraits {
+    static constexpr size_t num_components = 4;
+    static constexpr std::array<ComponentType, num_components> component_types = {
+        ComponentType::FLOAT, ComponentType::FLOAT, ComponentType::FLOAT, ComponentType::FLOAT};
+    static constexpr std::array<size_t, num_components> component_sizes = {32, 32, 32, 32};
+    static constexpr std::array<Swizzle, num_components> component_swizzle = {
+        Swizzle::R, Swizzle::G, Swizzle::B, Swizzle::A};
+};
+
+// Format traits for R32G32B32A32_SINT: four 32-bit SINT components, swizzled R, G, B, A.
+struct R32G32B32A32_SINTTraits {
+    static constexpr size_t num_components{4};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{
+        Swizzle::R, Swizzle::G, Swizzle::B, Swizzle::A};
+    static constexpr std::array<size_t, num_components> component_sizes{32, 32, 32, 32};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::SINT, ComponentType::SINT, ComponentType::SINT, ComponentType::SINT};
+};
+
+// Format traits for R32G32B32A32_UINT: four 32-bit UINT components, swizzled R, G, B, A.
+struct R32G32B32A32_UINTTraits {
+    static constexpr size_t num_components{4};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{
+        Swizzle::R, Swizzle::G, Swizzle::B, Swizzle::A};
+    static constexpr std::array<size_t, num_components> component_sizes{32, 32, 32, 32};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::UINT, ComponentType::UINT, ComponentType::UINT, ComponentType::UINT};
+};
+
+// Format traits for R32G32B32X32_FLOAT: four 32-bit FLOAT components, swizzled R, G, B and a
+// final component tagged Swizzle::None.
+struct R32G32B32X32_FLOATTraits {
+    static constexpr size_t num_components{4};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{
+        Swizzle::R, Swizzle::G, Swizzle::B, Swizzle::None};
+    static constexpr std::array<size_t, num_components> component_sizes{32, 32, 32, 32};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::FLOAT, ComponentType::FLOAT, ComponentType::FLOAT, ComponentType::FLOAT};
+};
+
+// Format traits for R32G32B32X32_SINT: four 32-bit SINT components, swizzled R, G, B and a
+// final component tagged Swizzle::None.
+struct R32G32B32X32_SINTTraits {
+    static constexpr size_t num_components{4};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{
+        Swizzle::R, Swizzle::G, Swizzle::B, Swizzle::None};
+    static constexpr std::array<size_t, num_components> component_sizes{32, 32, 32, 32};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::SINT, ComponentType::SINT, ComponentType::SINT, ComponentType::SINT};
+};
+
+// Format traits for R32G32B32X32_UINT: four 32-bit UINT components, swizzled R, G, B and a
+// final component tagged Swizzle::None.
+struct R32G32B32X32_UINTTraits {
+    static constexpr size_t num_components{4};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{
+        Swizzle::R, Swizzle::G, Swizzle::B, Swizzle::None};
+    static constexpr std::array<size_t, num_components> component_sizes{32, 32, 32, 32};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::UINT, ComponentType::UINT, ComponentType::UINT, ComponentType::UINT};
+};
+
+// Format traits for R16G16B16A16_UNORM: four 16-bit UNORM components, swizzled R, G, B, A.
+struct R16G16B16A16_UNORMTraits {
+    static constexpr size_t num_components{4};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{
+        Swizzle::R, Swizzle::G, Swizzle::B, Swizzle::A};
+    static constexpr std::array<size_t, num_components> component_sizes{16, 16, 16, 16};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::UNORM, ComponentType::UNORM, ComponentType::UNORM, ComponentType::UNORM};
+};
+
+// Format traits for R16G16B16A16_SNORM: four 16-bit SNORM components, swizzled R, G, B, A.
+struct R16G16B16A16_SNORMTraits {
+    static constexpr size_t num_components{4};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{
+        Swizzle::R, Swizzle::G, Swizzle::B, Swizzle::A};
+    static constexpr std::array<size_t, num_components> component_sizes{16, 16, 16, 16};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::SNORM, ComponentType::SNORM, ComponentType::SNORM, ComponentType::SNORM};
+};
+
+// Format traits for R16G16B16A16_SINT: four 16-bit SINT components, swizzled R, G, B, A.
+struct R16G16B16A16_SINTTraits {
+    static constexpr size_t num_components{4};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{
+        Swizzle::R, Swizzle::G, Swizzle::B, Swizzle::A};
+    static constexpr std::array<size_t, num_components> component_sizes{16, 16, 16, 16};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::SINT, ComponentType::SINT, ComponentType::SINT, ComponentType::SINT};
+};
+
+// Format traits for R16G16B16A16_UINT: four 16-bit UINT components, swizzled R, G, B, A.
+struct R16G16B16A16_UINTTraits {
+    static constexpr size_t num_components{4};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{
+        Swizzle::R, Swizzle::G, Swizzle::B, Swizzle::A};
+    static constexpr std::array<size_t, num_components> component_sizes{16, 16, 16, 16};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::UINT, ComponentType::UINT, ComponentType::UINT, ComponentType::UINT};
+};
+
+// Format traits for R16G16B16A16_FLOAT: four 16-bit FLOAT components, swizzled R, G, B, A.
+struct R16G16B16A16_FLOATTraits {
+    static constexpr size_t num_components{4};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{
+        Swizzle::R, Swizzle::G, Swizzle::B, Swizzle::A};
+    static constexpr std::array<size_t, num_components> component_sizes{16, 16, 16, 16};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::FLOAT, ComponentType::FLOAT, ComponentType::FLOAT, ComponentType::FLOAT};
+};
+
+// Format traits for R32G32_FLOAT: two 32-bit FLOAT components, swizzled R, G.
+struct R32G32_FLOATTraits {
+    static constexpr size_t num_components{2};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{Swizzle::R, Swizzle::G};
+    static constexpr std::array<size_t, num_components> component_sizes{32, 32};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::FLOAT, ComponentType::FLOAT};
+};
+
+// Format traits for R32G32_SINT: two 32-bit SINT components, swizzled R, G.
+struct R32G32_SINTTraits {
+    static constexpr size_t num_components{2};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{Swizzle::R, Swizzle::G};
+    static constexpr std::array<size_t, num_components> component_sizes{32, 32};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::SINT, ComponentType::SINT};
+};
+
+// Format traits for R32G32_UINT: two 32-bit UINT components, swizzled R, G.
+struct R32G32_UINTTraits {
+    static constexpr size_t num_components{2};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{Swizzle::R, Swizzle::G};
+    static constexpr std::array<size_t, num_components> component_sizes{32, 32};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::UINT, ComponentType::UINT};
+};
+
+// Format traits for R16G16B16X16_FLOAT: four 16-bit FLOAT components, swizzled R, G, B and a
+// final component tagged Swizzle::None.
+struct R16G16B16X16_FLOATTraits {
+    static constexpr size_t num_components{4};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{
+        Swizzle::R, Swizzle::G, Swizzle::B, Swizzle::None};
+    static constexpr std::array<size_t, num_components> component_sizes{16, 16, 16, 16};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::FLOAT, ComponentType::FLOAT, ComponentType::FLOAT, ComponentType::FLOAT};
+};
+
+// Format traits for A8R8G8B8_UNORM: four 8-bit UNORM components, swizzled A, R, G, B.
+struct A8R8G8B8_UNORMTraits {
+    static constexpr size_t num_components{4};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{
+        Swizzle::A, Swizzle::R, Swizzle::G, Swizzle::B};
+    static constexpr std::array<size_t, num_components> component_sizes{8, 8, 8, 8};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::UNORM, ComponentType::UNORM, ComponentType::UNORM, ComponentType::UNORM};
+};
+
+// Format traits for A8R8G8B8_SRGB: four 8-bit SRGB components, swizzled A, R, G, B.
+struct A8R8G8B8_SRGBTraits {
+    static constexpr size_t num_components{4};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{
+        Swizzle::A, Swizzle::R, Swizzle::G, Swizzle::B};
+    static constexpr std::array<size_t, num_components> component_sizes{8, 8, 8, 8};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::SRGB, ComponentType::SRGB, ComponentType::SRGB, ComponentType::SRGB};
+};
+
+// Format traits for A2B10G10R10_UNORM: a 2-bit component followed by three 10-bit ones, all
+// UNORM, swizzled A, B, G, R.
+struct A2B10G10R10_UNORMTraits {
+    static constexpr size_t num_components{4};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{
+        Swizzle::A, Swizzle::B, Swizzle::G, Swizzle::R};
+    static constexpr std::array<size_t, num_components> component_sizes{2, 10, 10, 10};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::UNORM, ComponentType::UNORM, ComponentType::UNORM, ComponentType::UNORM};
+};
+
+// Format traits for A2B10G10R10_UINT: a 2-bit component followed by three 10-bit ones, all
+// UINT, swizzled A, B, G, R.
+struct A2B10G10R10_UINTTraits {
+    static constexpr size_t num_components{4};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{
+        Swizzle::A, Swizzle::B, Swizzle::G, Swizzle::R};
+    static constexpr std::array<size_t, num_components> component_sizes{2, 10, 10, 10};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::UINT, ComponentType::UINT, ComponentType::UINT, ComponentType::UINT};
+};
+
+// Format traits for A2R10G10B10_UNORM: a 2-bit component followed by three 10-bit ones, all
+// UNORM, swizzled A, R, G, B.
+struct A2R10G10B10_UNORMTraits {
+    static constexpr size_t num_components{4};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{
+        Swizzle::A, Swizzle::R, Swizzle::G, Swizzle::B};
+    static constexpr std::array<size_t, num_components> component_sizes{2, 10, 10, 10};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::UNORM, ComponentType::UNORM, ComponentType::UNORM, ComponentType::UNORM};
+};
+
+// Format traits for A8B8G8R8_UNORM: four 8-bit UNORM components, swizzled A, B, G, R.
+struct A8B8G8R8_UNORMTraits {
+    static constexpr size_t num_components{4};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{
+        Swizzle::A, Swizzle::B, Swizzle::G, Swizzle::R};
+    static constexpr std::array<size_t, num_components> component_sizes{8, 8, 8, 8};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::UNORM, ComponentType::UNORM, ComponentType::UNORM, ComponentType::UNORM};
+};
+
+// Format traits for A8B8G8R8_SRGB: four 8-bit SRGB components, swizzled A, B, G, R.
+struct A8B8G8R8_SRGBTraits {
+    static constexpr size_t num_components{4};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{
+        Swizzle::A, Swizzle::B, Swizzle::G, Swizzle::R};
+    static constexpr std::array<size_t, num_components> component_sizes{8, 8, 8, 8};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::SRGB, ComponentType::SRGB, ComponentType::SRGB, ComponentType::SRGB};
+};
+
+// Format traits for A8B8G8R8_SNORM: four 8-bit SNORM components, swizzled A, B, G, R.
+struct A8B8G8R8_SNORMTraits {
+    static constexpr size_t num_components{4};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{
+        Swizzle::A, Swizzle::B, Swizzle::G, Swizzle::R};
+    static constexpr std::array<size_t, num_components> component_sizes{8, 8, 8, 8};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::SNORM, ComponentType::SNORM, ComponentType::SNORM, ComponentType::SNORM};
+};
+
+// Format traits for A8B8G8R8_SINT: four 8-bit SINT components, swizzled A, B, G, R.
+struct A8B8G8R8_SINTTraits {
+    static constexpr size_t num_components{4};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{
+        Swizzle::A, Swizzle::B, Swizzle::G, Swizzle::R};
+    static constexpr std::array<size_t, num_components> component_sizes{8, 8, 8, 8};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::SINT, ComponentType::SINT, ComponentType::SINT, ComponentType::SINT};
+};
+
+// Format traits for A8B8G8R8_UINT: four 8-bit UINT components, swizzled A, B, G, R.
+struct A8B8G8R8_UINTTraits {
+    static constexpr size_t num_components{4};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{
+        Swizzle::A, Swizzle::B, Swizzle::G, Swizzle::R};
+    static constexpr std::array<size_t, num_components> component_sizes{8, 8, 8, 8};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::UINT, ComponentType::UINT, ComponentType::UINT, ComponentType::UINT};
+};
+
+// Format traits for R16G16_UNORM: two 16-bit UNORM components, swizzled R, G.
+struct R16G16_UNORMTraits {
+    static constexpr size_t num_components{2};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{Swizzle::R, Swizzle::G};
+    static constexpr std::array<size_t, num_components> component_sizes{16, 16};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::UNORM, ComponentType::UNORM};
+};
+
+// Format traits for R16G16_SNORM: two 16-bit SNORM components, swizzled R, G.
+struct R16G16_SNORMTraits {
+    static constexpr size_t num_components{2};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{Swizzle::R, Swizzle::G};
+    static constexpr std::array<size_t, num_components> component_sizes{16, 16};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::SNORM, ComponentType::SNORM};
+};
+
+// Format traits for R16G16_SINT: two 16-bit SINT components, swizzled R, G.
+struct R16G16_SINTTraits {
+    static constexpr size_t num_components{2};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{Swizzle::R, Swizzle::G};
+    static constexpr std::array<size_t, num_components> component_sizes{16, 16};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::SINT, ComponentType::SINT};
+};
+
+// Format traits for R16G16_UINT: two 16-bit UINT components, swizzled R, G.
+struct R16G16_UINTTraits {
+    static constexpr size_t num_components{2};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{Swizzle::R, Swizzle::G};
+    static constexpr std::array<size_t, num_components> component_sizes{16, 16};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::UINT, ComponentType::UINT};
+};
+
+// Format traits for R16G16_FLOAT: two 16-bit FLOAT components, swizzled R, G.
+struct R16G16_FLOATTraits {
+    static constexpr size_t num_components{2};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{Swizzle::R, Swizzle::G};
+    static constexpr std::array<size_t, num_components> component_sizes{16, 16};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::FLOAT, ComponentType::FLOAT};
+};
+
+// Format traits for B10G11R11_FLOAT: three FLOAT components of 10, 11 and 11 bits, swizzled
+// B, G, R.
+struct B10G11R11_FLOATTraits {
+    static constexpr size_t num_components{3};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{
+        Swizzle::B, Swizzle::G, Swizzle::R};
+    static constexpr std::array<size_t, num_components> component_sizes{10, 11, 11};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::FLOAT, ComponentType::FLOAT, ComponentType::FLOAT};
+};
+
+// Format traits for R32_SINT: a single 32-bit SINT component, swizzled R.
+struct R32_SINTTraits {
+    static constexpr size_t num_components{1};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{Swizzle::R};
+    static constexpr std::array<size_t, num_components> component_sizes{32};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::SINT};
+};
+
+// Format traits for R32_UINT: a single 32-bit UINT component, swizzled R.
+struct R32_UINTTraits {
+    static constexpr size_t num_components{1};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{Swizzle::R};
+    static constexpr std::array<size_t, num_components> component_sizes{32};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::UINT};
+};
+
+// Format traits for R32_FLOAT: a single 32-bit FLOAT component, swizzled R.
+struct R32_FLOATTraits {
+    static constexpr size_t num_components{1};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{Swizzle::R};
+    static constexpr std::array<size_t, num_components> component_sizes{32};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::FLOAT};
+};
+
+// Format traits for X8R8G8B8_UNORM: four 8-bit UNORM components; the first is tagged
+// Swizzle::None, the rest are swizzled R, G, B.
+struct X8R8G8B8_UNORMTraits {
+    static constexpr size_t num_components{4};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{
+        Swizzle::None, Swizzle::R, Swizzle::G, Swizzle::B};
+    static constexpr std::array<size_t, num_components> component_sizes{8, 8, 8, 8};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::UNORM, ComponentType::UNORM, ComponentType::UNORM, ComponentType::UNORM};
+};
+
+// Format traits for X8R8G8B8_SRGB: four 8-bit SRGB components; the first is tagged
+// Swizzle::None, the rest are swizzled R, G, B.
+struct X8R8G8B8_SRGBTraits {
+    static constexpr size_t num_components{4};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{
+        Swizzle::None, Swizzle::R, Swizzle::G, Swizzle::B};
+    static constexpr std::array<size_t, num_components> component_sizes{8, 8, 8, 8};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::SRGB, ComponentType::SRGB, ComponentType::SRGB, ComponentType::SRGB};
+};
+
+// Format traits for R5G6B5_UNORM: three UNORM components of 5, 6 and 5 bits, swizzled R, G, B.
+struct R5G6B5_UNORMTraits {
+    static constexpr size_t num_components{3};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{
+        Swizzle::R, Swizzle::G, Swizzle::B};
+    static constexpr std::array<size_t, num_components> component_sizes{5, 6, 5};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::UNORM, ComponentType::UNORM, ComponentType::UNORM};
+};
+
+// Format traits for A1R5G5B5_UNORM: a 1-bit component followed by three 5-bit ones, all UNORM,
+// swizzled A, R, G, B.
+struct A1R5G5B5_UNORMTraits {
+    static constexpr size_t num_components{4};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{
+        Swizzle::A, Swizzle::R, Swizzle::G, Swizzle::B};
+    static constexpr std::array<size_t, num_components> component_sizes{1, 5, 5, 5};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::UNORM, ComponentType::UNORM, ComponentType::UNORM, ComponentType::UNORM};
+};
+
+// Format traits for R8G8_UNORM: two 8-bit UNORM components, swizzled R, G.
+struct R8G8_UNORMTraits {
+    static constexpr size_t num_components{2};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{Swizzle::R, Swizzle::G};
+    static constexpr std::array<size_t, num_components> component_sizes{8, 8};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::UNORM, ComponentType::UNORM};
+};
+
+// Format traits for R8G8_SNORM: two 8-bit SNORM components, swizzled R, G.
+struct R8G8_SNORMTraits {
+    static constexpr size_t num_components{2};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{Swizzle::R, Swizzle::G};
+    static constexpr std::array<size_t, num_components> component_sizes{8, 8};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::SNORM, ComponentType::SNORM};
+};
+
+// Format traits for R8G8_SINT: two 8-bit SINT components, swizzled R, G.
+struct R8G8_SINTTraits {
+    static constexpr size_t num_components{2};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{Swizzle::R, Swizzle::G};
+    static constexpr std::array<size_t, num_components> component_sizes{8, 8};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::SINT, ComponentType::SINT};
+};
+
+// Format traits for R8G8_UINT: two 8-bit UINT components, swizzled R, G.
+struct R8G8_UINTTraits {
+    static constexpr size_t num_components{2};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{Swizzle::R, Swizzle::G};
+    static constexpr std::array<size_t, num_components> component_sizes{8, 8};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::UINT, ComponentType::UINT};
+};
+
+// Format traits for R16_UNORM: a single 16-bit UNORM component, swizzled R.
+struct R16_UNORMTraits {
+    static constexpr size_t num_components{1};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{Swizzle::R};
+    static constexpr std::array<size_t, num_components> component_sizes{16};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::UNORM};
+};
+
+// Format traits for R16_SNORM: a single 16-bit SNORM component, swizzled R.
+struct R16_SNORMTraits {
+    static constexpr size_t num_components{1};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{Swizzle::R};
+    static constexpr std::array<size_t, num_components> component_sizes{16};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::SNORM};
+};
+
+// Format traits for R16_SINT: a single 16-bit SINT component, swizzled R.
+struct R16_SINTTraits {
+    static constexpr size_t num_components{1};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{Swizzle::R};
+    static constexpr std::array<size_t, num_components> component_sizes{16};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::SINT};
+};
+
+// Format traits for R16_UINT: a single 16-bit UINT component, swizzled R.
+struct R16_UINTTraits {
+    static constexpr size_t num_components{1};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{Swizzle::R};
+    static constexpr std::array<size_t, num_components> component_sizes{16};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::UINT};
+};
+
+// Format traits for R16_FLOAT: a single 16-bit FLOAT component, swizzled R.
+struct R16_FLOATTraits {
+    static constexpr size_t num_components{1};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{Swizzle::R};
+    static constexpr std::array<size_t, num_components> component_sizes{16};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::FLOAT};
+};
+
+// Format traits for R8_UNORM: a single 8-bit UNORM component, swizzled R.
+struct R8_UNORMTraits {
+    static constexpr size_t num_components{1};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{Swizzle::R};
+    static constexpr std::array<size_t, num_components> component_sizes{8};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::UNORM};
+};
+
+// Format traits for R8_SNORM: a single 8-bit SNORM component, swizzled R.
+struct R8_SNORMTraits {
+    static constexpr size_t num_components{1};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{Swizzle::R};
+    static constexpr std::array<size_t, num_components> component_sizes{8};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::SNORM};
+};
+
+// Format traits for R8_SINT: a single 8-bit SINT component, swizzled R.
+struct R8_SINTTraits {
+    static constexpr size_t num_components{1};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{Swizzle::R};
+    static constexpr std::array<size_t, num_components> component_sizes{8};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::SINT};
+};
+
+// Format traits for R8_UINT: a single 8-bit UINT component, swizzled R.
+struct R8_UINTTraits {
+    static constexpr size_t num_components{1};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{Swizzle::R};
+    static constexpr std::array<size_t, num_components> component_sizes{8};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::UINT};
+};
+
+// Format traits for X1R5G5B5_UNORM: a 1-bit component tagged Swizzle::None followed by three
+// 5-bit ones swizzled R, G, B; all UNORM.
+struct X1R5G5B5_UNORMTraits {
+    static constexpr size_t num_components{4};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{
+        Swizzle::None, Swizzle::R, Swizzle::G, Swizzle::B};
+    static constexpr std::array<size_t, num_components> component_sizes{1, 5, 5, 5};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::UNORM, ComponentType::UNORM, ComponentType::UNORM, ComponentType::UNORM};
+};
+
+// Format traits for X8B8G8R8_UNORM: four 8-bit UNORM components; the first is tagged
+// Swizzle::None, the rest are swizzled B, G, R.
+struct X8B8G8R8_UNORMTraits {
+    static constexpr size_t num_components{4};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{
+        Swizzle::None, Swizzle::B, Swizzle::G, Swizzle::R};
+    static constexpr std::array<size_t, num_components> component_sizes{8, 8, 8, 8};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::UNORM, ComponentType::UNORM, ComponentType::UNORM, ComponentType::UNORM};
+};
+
+// Format traits for X8B8G8R8_SRGB: four 8-bit SRGB components; the first is tagged
+// Swizzle::None, the rest are swizzled B, G, R.
+struct X8B8G8R8_SRGBTraits {
+    static constexpr size_t num_components{4};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{
+        Swizzle::None, Swizzle::B, Swizzle::G, Swizzle::R};
+    static constexpr std::array<size_t, num_components> component_sizes{8, 8, 8, 8};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ComponentType::SRGB, ComponentType::SRGB, ComponentType::SRGB, ComponentType::SRGB};
+};
+
+template <class ConverterTraits>
+class ConverterImpl : public Converter {
+private:
+    // Class-local copies of the format description provided by ConverterTraits.
+    static constexpr size_t num_components{ConverterTraits::num_components};
+    static constexpr std::array<Swizzle, num_components> component_swizzle{
+        ConverterTraits::component_swizzle};
+    static constexpr std::array<size_t, num_components> component_sizes{
+        ConverterTraits::component_sizes};
+    static constexpr std::array<ComponentType, num_components> component_types{
+        ConverterTraits::component_types};
+
+    // Adds up the bit widths of all components, rounds the total up to the next power of two
+    // (totals that already are a power of two are kept) and returns the result in bytes.
+    static constexpr size_t CalculateByteSize() {
+        size_t total_bits = 0;
+        for (size_t i = 0; i < num_components; ++i) {
+            total_bits += component_sizes[i];
+        }
+        // Index of the most significant set bit of total_bits.
+        const size_t top_bit = (sizeof(size_t) * 8) - std::countl_zero(total_bits) - 1ULL;
+        size_t rounded_bits = 1ULL << top_bit;
+        if ((total_bits & (rounded_bits - 1ULL)) != 0) {
+            // Bits remain below the top one, so step up to the following power of two.
+            rounded_bits <<= 1ULL;
+        }
+        return rounded_bits / 8;
+    }
+
+    // Size of one pixel in bytes, and in whole 32-bit words (rounded up).
+    static constexpr size_t total_bytes_per_pixel = CalculateByteSize();
+    static constexpr size_t total_words_per_pixel =
+        (sizeof(u32) - 1U + total_bytes_per_pixel) / sizeof(u32);
+    // NOTE(review): presumably the number of f32 values per pixel in the intermediate
+    // representation - confirm against the conversion routines that use it.
+    static constexpr size_t components_per_ir_rep = 4;
+
+    // Lays the components out in declaration order over consecutive 32-bit words and returns,
+    // per component, either its bit offset inside its word (get_offsets == true) or the index of
+    // the word that holds it (get_offsets == false). A component that would not fit in the bits
+    // left in the current word is placed at bit 0 of the next word.
+    template <bool get_offsets>
+    static constexpr std::array<size_t, num_components> GetBoundWordsOffsets() {
+        constexpr size_t word_bits = sizeof(u32) * 8;
+        std::array<size_t, num_components> table{};
+        size_t bit_cursor = 0;
+        size_t word_index = 0;
+        for (size_t i = 0; i < num_components; i++) {
+            const size_t width = component_sizes[i];
+            if (bit_cursor + width > word_bits) {
+                // Crossing the word boundary: restart at the beginning of the following word.
+                word_index++;
+                bit_cursor = 0;
+            }
+            if constexpr (get_offsets) {
+                table[i] = bit_cursor;
+            } else {
+                table[i] = word_index;
+            }
+            bit_cursor += width;
+        }
+        return table;
+    }
+
+    // Word index and in-word bit offset of every component.
+    static constexpr std::array<size_t, num_components> bound_words = GetBoundWordsOffsets<false>();
+    static constexpr std::array<size_t, num_components> bound_offsets =
+        GetBoundWordsOffsets<true>();
+
+    // Builds, for every component, a 32-bit mask with component_sizes[i] consecutive bits set,
+    // positioned at the component's bit offset inside its word.
+    static constexpr std::array<u32, num_components> GetComponentsMask() {
+        constexpr size_t word_bits = 8 * sizeof(u32);
+        std::array<u32, num_components> masks{};
+        for (size_t i = 0; i < num_components; i++) {
+            const u32 low_bits = static_cast<u32>(~0) >> (word_bits - component_sizes[i]);
+            masks[i] = low_bits << bound_offsets[i];
+        }
+        return masks;
+    }
+
+    // In-word bit mask of every component.
+    static constexpr std::array<u32, num_components> component_mask = GetComponentsMask();
+
+    // We are forcing inline so the compiler can SIMD the conversions, since it may do 4 function
+    // calls, it may fail to detect the benefit of inlining.
+    //
+    // Extracts component `which_component` from the packed word `which_word` and writes its
+    // value, converted to f32 according to the component's ComponentType, into `out_component`.
+    template <size_t which_component>
+    FORCE_INLINE void ConvertToComponent(u32 which_word, f32& out_component) {
+        // Raw bits of the component, moved down to bit 0 and masked to its width.
+        const u32 value = (which_word >> bound_offsets[which_component]) &
+                          static_cast<u32>((1ULL << component_sizes[which_component]) - 1ULL);
+        // Reinterprets the low `bits` bits of base_value as a two's complement integer.
+        const auto sign_extend = [](u32 base_value, size_t bits) {
+            const size_t shift_amount = sizeof(u32) * 8 - bits;
+            s32 shifted_value = static_cast<s32>(base_value << shift_amount);
+            return shifted_value >> shift_amount;
+        };
+        // Clears the f32 mantissa bits that a 10-bit (half float) mantissa cannot hold.
+        const auto force_to_fp16 = [](f32 base_value) {
+            u32 tmp = std::bit_cast<u32>(base_value);
+            constexpr size_t fp32_mantissa_bits = 23;
+            constexpr size_t fp16_mantissa_bits = 10;
+            constexpr size_t mantissa_mask =
+                ~((1ULL << (fp32_mantissa_bits - fp16_mantissa_bits)) - 1ULL);
+            tmp = tmp & static_cast<u32>(mantissa_mask);
+            // TODO: force the exponent within the range of half float. Not needed in UNORM / SNORM
+            return std::bit_cast<f32>(tmp);
+        };
+        // Expands a small float of `bits` total bits with a `mantissa`-bit mantissa by shifting it
+        // so its mantissa lines up with the top of the f32 mantissa, then clears the f32 sign bit.
+        // NOTE(review): the exponent is not re-biased here - confirm this is the intended encoding.
+        const auto from_fp_n = [&sign_extend](u32 base_value, size_t bits, size_t mantissa) {
+            constexpr size_t fp32_mantissa_bits = 23;
+            size_t shift_towards = fp32_mantissa_bits - mantissa;
+            const u32 new_value =
+                static_cast<u32>(sign_extend(base_value, bits) << shift_towards) & (~(1U << 31));
+            return std::bit_cast<f32>(new_value);
+        };
+        // Signed normalized: the two's complement value divided by the largest positive value.
+        const auto calculate_snorm = [&]() {
+            return static_cast<f32>(
+                static_cast<f32>(sign_extend(value, component_sizes[which_component])) /
+                static_cast<f32>((1ULL << (component_sizes[which_component] - 1ULL)) - 1ULL));
+        };
+        // Unsigned normalized: the raw value divided by the largest representable value.
+        const auto calculate_unorm = [&]() {
+            return static_cast<f32>(
+                static_cast<f32>(value) /
+                static_cast<f32>((1ULL << (component_sizes[which_component])) - 1ULL));
+        };
+        if constexpr (component_types[which_component] == ComponentType::SNORM) {
+            out_component = calculate_snorm();
+        } else if constexpr (component_types[which_component] == ComponentType::UNORM) {
+            out_component = calculate_unorm();
+        } else if constexpr (component_types[which_component] == ComponentType::SINT) {
+            out_component = static_cast<f32>(
+                static_cast<s32>(sign_extend(value, component_sizes[which_component])));
+        } else if constexpr (component_types[which_component] == ComponentType::UINT) {
+            // Unsigned data must not be sign extended: a component with its top bit set would
+            // otherwise decode to a negative number. The masked value already is the integer.
+            out_component = static_cast<f32>(value);
+        } else if constexpr (component_types[which_component] == ComponentType::SNORM_FORCE_FP16) {
+            out_component = calculate_snorm();
+            out_component = force_to_fp16(out_component);
+        } else if constexpr (component_types[which_component] == ComponentType::UNORM_FORCE_FP16) {
+            out_component = calculate_unorm();
+            out_component = force_to_fp16(out_component);
+        } else if constexpr (component_types[which_component] == ComponentType::FLOAT) {
+            if constexpr (component_sizes[which_component] == 32) {
+                out_component = std::bit_cast<f32>(value);
+            } else if constexpr (component_sizes[which_component] == 16) {
+                // Half float layout: 1 sign bit, 5 exponent bits, 10 mantissa bits.
+                static constexpr u32 sign_mask = 0x8000;
+                static constexpr u32 exponent_mask = 0x7c00;
+                static constexpr u32 mantissa_mask = 0x03ff;
+                // 0x1C000 == (127 - 15) << 10 moves the exponent from the half bias to the f32
+                // bias. The rebias is unconditional: zero/subnormal and Inf/NaN encodings get no
+                // special treatment.
+                out_component = std::bit_cast<f32>(((value & sign_mask) << 16) |
+                                                   (((value & exponent_mask) + 0x1C000) << 13) |
+                                                   ((value & mantissa_mask) << 13));
+            } else {
+                // Remaining float widths: every bit that is not mantissa is handed over as a
+                // 5-bit exponent.
+                out_component = from_fp_n(value, component_sizes[which_component],
+                                          component_sizes[which_component] - 5);
+            }
+        } else if constexpr (component_types[which_component] == ComponentType::SRGB) {
+            if constexpr (component_swizzle[which_component] == Swizzle::A) {
+                // Alpha skips the sRGB lookup table and is decoded as plain UNORM.
+                out_component = calculate_unorm();
+            } else if constexpr (component_sizes[which_component] == 8) {
+                out_component = SRGB_TO_RGB_LUT[value];
+            } else {
+                out_component = calculate_unorm();
+                UNIMPLEMENTED_MSG("SRGB Conversion with component sizes of {} is unimplemented",
+                                  component_sizes[which_component]);
+            }
+        }
+    }
+
+    // We are forcing inline so the compiler can SIMD the conversions, since it may do 4 function
+    // calls, it may fail to detect the benefit of inlining.
+    template <size_t which_component>
+    FORCE_INLINE void ConvertFromComponent(u32& which_word, f32 in_component) {
+        const auto insert_to_word = [&]<typename T>(T new_word) {
+            which_word |= (static_cast<u32>(new_word) << bound_offsets[which_component]) &
+                          component_mask[which_component];
+        };
+        const auto to_fp_n = [](f32 base_value, size_t bits, size_t mantissa) {
+            constexpr size_t fp32_mantissa_bits = 23;
+            u32 tmp_value = std::bit_cast<u32>(std::max(base_value, 0.0f));
+            size_t shift_towards = fp32_mantissa_bits - mantissa;
+            return tmp_value >> shift_towards;
+        };
+        const auto calculate_unorm = [&]() {
+            return static_cast<u32>(
+                static_cast<f32>(in_component) *
+                static_cast<f32>((1ULL << (component_sizes[which_component])) - 1ULL));
+        };
+        if constexpr (component_types[which_component] == ComponentType::SNORM ||
+                      component_types[which_component] == ComponentType::SNORM_FORCE_FP16) {
+            s32 tmp_word = static_cast<s32>(
+                static_cast<f32>(in_component) *
+                static_cast<f32>((1ULL << (component_sizes[which_component] - 1ULL)) - 1ULL));
+            insert_to_word(tmp_word);
+
+        } else if constexpr (component_types[which_component] == ComponentType::UNORM ||
+                             component_types[which_component] == ComponentType::UNORM_FORCE_FP16) {
+            u32 tmp_word = calculate_unorm();
+            insert_to_word(tmp_word);
+        } else if constexpr (component_types[which_component] == ComponentType::SINT) {
+            s32 tmp_word = static_cast<s32>(in_component);
+            insert_to_word(tmp_word);
+        } else if constexpr (component_types[which_component] == ComponentType::UINT) {
+            u32 tmp_word = static_cast<u32>(in_component);
+            insert_to_word(tmp_word);
+        } else if constexpr (component_types[which_component] == ComponentType::FLOAT) {
+            if constexpr (component_sizes[which_component] == 32) {
+                u32 tmp_word = std::bit_cast<u32>(in_component);
+                insert_to_word(tmp_word);
+            } else if constexpr (component_sizes[which_component] == 16) {
+                static constexpr u32 sign_mask = 0x8000;
+                static constexpr u32 mantissa_mask = 0x03ff;
+                static constexpr u32 exponent_mask = 0x7c00;
+                const u32 tmp_word = std::bit_cast<u32>(in_component);
+                const u32 half = ((tmp_word >> 16) & sign_mask) |
+                                 ((((tmp_word & 0x7f800000) - 0x38000000) >> 13) & exponent_mask) |
+                                 ((tmp_word >> 13) & mantissa_mask);
+                insert_to_word(half);
+            } else {
+                insert_to_word(to_fp_n(in_component, component_sizes[which_component],
+                                       component_sizes[which_component] - 5));
+            }
+        } else if constexpr (component_types[which_component] == ComponentType::SRGB) {
+            if constexpr (component_swizzle[which_component] != Swizzle::A) {
+                if constexpr (component_sizes[which_component] == 8) {
+                    const u32 index = calculate_unorm();
+                    in_component = RGB_TO_SRGB_LUT[index];
+                } else {
+                    UNIMPLEMENTED_MSG("SRGB Conversion with component sizes of {} is unimplemented",
+                                      component_sizes[which_component]);
+                }
+            }
+            const u32 tmp_word = calculate_unorm();
+            insert_to_word(tmp_word);
+        }
+    }
+
+public:
+    // Decodes packed pixels into the f32 intermediate representation.
+    //  - input:  raw pixel data, total_bytes_per_pixel bytes per pixel.
+    //  - output: receives components_per_ir_rep floats per pixel; its size decides how many
+    //            pixels are converted, so `input` must hold at least that many pixels.
+    // Slots that no component of the format decodes into are set to 0.0f.
+    void ConvertTo(std::span<const u8> input, std::span<f32> output) override {
+        const size_t num_pixels = output.size() / components_per_ir_rep;
+        for (size_t pixel = 0; pixel < num_pixels; pixel++) {
+            // Copy the pixel's bytes into zero-initialized u32 words.
+            std::array<u32, total_words_per_pixel> words{};
+            std::memcpy(words.data(), &input[pixel * total_bytes_per_pixel], total_bytes_per_pixel);
+
+            std::span<f32> new_components(&output[pixel * components_per_ir_rep],
+                                          components_per_ir_rep);
+            // Zero every slot up front. Components are decoded into the slot named by their
+            // swizzle, so zeroing by component position (as was done before) could leave a slot
+            // unwritten for formats with leading padding, or wipe a slot that an earlier
+            // component had already decoded.
+            std::fill(new_components.begin(), new_components.end(), 0.0f);
+
+            if constexpr (component_swizzle[0] != Swizzle::None) {
+                ConvertToComponent<0>(words[bound_words[0]],
+                                      new_components[static_cast<size_t>(component_swizzle[0])]);
+            }
+
+            // component_swizzle only has num_components entries, hence the nested checks.
+            if constexpr (num_components >= 2) {
+                if constexpr (component_swizzle[1] != Swizzle::None) {
+                    ConvertToComponent<1>(
+                        words[bound_words[1]],
+                        new_components[static_cast<size_t>(component_swizzle[1])]);
+                }
+            }
+            if constexpr (num_components >= 3) {
+                if constexpr (component_swizzle[2] != Swizzle::None) {
+                    ConvertToComponent<2>(
+                        words[bound_words[2]],
+                        new_components[static_cast<size_t>(component_swizzle[2])]);
+                }
+            }
+            if constexpr (num_components >= 4) {
+                if constexpr (component_swizzle[3] != Swizzle::None) {
+                    ConvertToComponent<3>(
+                        words[bound_words[3]],
+                        new_components[static_cast<size_t>(component_swizzle[3])]);
+                }
+            }
+        }
+    }
+
+    // Encodes the f32 intermediate representation back into packed pixels.
+    //  - input:  components_per_ir_rep floats per pixel.
+    //  - output: receives total_bytes_per_pixel bytes per pixel; its size decides how many
+    //            pixels are converted.
+    void ConvertFrom(std::span<const f32> input, std::span<u8> output) override {
+        const size_t pixel_count = output.size() / total_bytes_per_pixel;
+        for (size_t index = 0; index < pixel_count; ++index) {
+            const std::span<const f32> source(&input[index * components_per_ir_rep],
+                                              components_per_ir_rep);
+            // Start from zeroed words; padding and absent components never touch them.
+            std::array<u32, total_words_per_pixel> packed{};
+
+            if constexpr (component_swizzle[0] != Swizzle::None) {
+                ConvertFromComponent<0>(packed[bound_words[0]],
+                                        source[static_cast<size_t>(component_swizzle[0])]);
+            }
+            // Test the count first: component_swizzle only has num_components entries.
+            if constexpr (num_components >= 2 && component_swizzle[1] != Swizzle::None) {
+                ConvertFromComponent<1>(packed[bound_words[1]],
+                                        source[static_cast<size_t>(component_swizzle[1])]);
+            }
+            if constexpr (num_components >= 3 && component_swizzle[2] != Swizzle::None) {
+                ConvertFromComponent<2>(packed[bound_words[2]],
+                                        source[static_cast<size_t>(component_swizzle[2])]);
+            }
+            if constexpr (num_components >= 4 && component_swizzle[3] != Swizzle::None) {
+                ConvertFromComponent<3>(packed[bound_words[3]],
+                                        source[static_cast<size_t>(component_swizzle[3])]);
+            }
+
+            // Write the packed pixel to its position in the output.
+            std::memcpy(&output[index * total_bytes_per_pixel], packed.data(),
+                        total_bytes_per_pixel);
+        }
+    }
+
+    ConverterImpl() = default;
+    ~ConverterImpl() override = default;
+};
+
+struct ConverterFactory::ConverterFactoryImpl { // Converters built so far, by format
+    std::unordered_map<RenderTargetFormat, std::unique_ptr<Converter>> converters_cache;
+};
+
+// The constructor and destructor are defined here rather than in the header because
+// ConverterFactoryImpl is only a complete type in this file.
+ConverterFactory::ConverterFactory() : impl{std::make_unique<ConverterFactoryImpl>()} {}
+
+ConverterFactory::~ConverterFactory() = default;
+
+Converter* ConverterFactory::GetFormatConverter(RenderTargetFormat format) {
+    // Converters are created lazily: a format is built (and cached by BuildConverter) the
+    // first time it is requested, later requests are served from the cache.
+    const auto& cache = impl->converters_cache;
+    const auto cached = cache.find(format);
+    return cached != cache.end() ? cached->second.get() : BuildConverter(format);
+}
+
+// Fallback converter for formats without an implementation: it ignores its input, decodes
+// every component as 0.0f and encodes every byte as 0.
+class NullConverter : public Converter {
+public:
+    void ConvertTo([[maybe_unused]] std::span<const u8> input, std::span<f32> output) override {
+        std::fill(output.begin(), output.end(), 0.0f);
+    }
+    void ConvertFrom([[maybe_unused]] std::span<const f32> input, std::span<u8> output) override {
+        std::fill(output.begin(), output.end(), u8{0});
+    }
+    ~NullConverter() override = default;
+};
+
+Converter* ConverterFactory::BuildConverter(RenderTargetFormat format) {
+    switch (format) { // Each case creates the format's converter, caches it and returns it
+    case RenderTargetFormat::R32G32B32A32_FLOAT: // cases mirror print_case in generate_converters.py
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<R32G32B32A32_FLOATTraits>>())
+            .first->second.get();
+        break; // never reached (follows a return); same in every case below
+    case RenderTargetFormat::R32G32B32A32_SINT:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<R32G32B32A32_SINTTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::R32G32B32A32_UINT:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<R32G32B32A32_UINTTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::R32G32B32X32_FLOAT:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<R32G32B32X32_FLOATTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::R32G32B32X32_SINT:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<R32G32B32X32_SINTTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::R32G32B32X32_UINT:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<R32G32B32X32_UINTTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::R16G16B16A16_UNORM:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<R16G16B16A16_UNORMTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::R16G16B16A16_SNORM:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<R16G16B16A16_SNORMTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::R16G16B16A16_SINT:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<R16G16B16A16_SINTTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::R16G16B16A16_UINT:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<R16G16B16A16_UINTTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::R16G16B16A16_FLOAT:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<R16G16B16A16_FLOATTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::R32G32_FLOAT:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<R32G32_FLOATTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::R32G32_SINT:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<R32G32_SINTTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::R32G32_UINT:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<R32G32_UINTTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::R16G16B16X16_FLOAT:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<R16G16B16X16_FLOATTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::A8R8G8B8_UNORM:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<A8R8G8B8_UNORMTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::A8R8G8B8_SRGB:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<A8R8G8B8_SRGBTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::A2B10G10R10_UNORM:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<A2B10G10R10_UNORMTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::A2B10G10R10_UINT:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<A2B10G10R10_UINTTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::A2R10G10B10_UNORM:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<A2R10G10B10_UNORMTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::A8B8G8R8_UNORM:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<A8B8G8R8_UNORMTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::A8B8G8R8_SRGB:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<A8B8G8R8_SRGBTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::A8B8G8R8_SNORM:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<A8B8G8R8_SNORMTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::A8B8G8R8_SINT:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<A8B8G8R8_SINTTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::A8B8G8R8_UINT:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<A8B8G8R8_UINTTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::R16G16_UNORM:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<R16G16_UNORMTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::R16G16_SNORM:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<R16G16_SNORMTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::R16G16_SINT:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<R16G16_SINTTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::R16G16_UINT:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<R16G16_UINTTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::R16G16_FLOAT:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<R16G16_FLOATTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::B10G11R11_FLOAT:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<B10G11R11_FLOATTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::R32_SINT:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<R32_SINTTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::R32_UINT:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<R32_UINTTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::R32_FLOAT:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<R32_FLOATTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::X8R8G8B8_UNORM:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<X8R8G8B8_UNORMTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::X8R8G8B8_SRGB:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<X8R8G8B8_SRGBTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::R5G6B5_UNORM:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<R5G6B5_UNORMTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::A1R5G5B5_UNORM:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<A1R5G5B5_UNORMTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::R8G8_UNORM:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<R8G8_UNORMTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::R8G8_SNORM:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<R8G8_SNORMTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::R8G8_SINT:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<R8G8_SINTTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::R8G8_UINT:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<R8G8_UINTTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::R16_UNORM:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<R16_UNORMTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::R16_SNORM:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<R16_SNORMTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::R16_SINT:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<R16_SINTTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::R16_UINT:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<R16_UINTTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::R16_FLOAT:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<R16_FLOATTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::R8_UNORM:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<R8_UNORMTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::R8_SNORM:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<R8_SNORMTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::R8_SINT:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<R8_SINTTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::R8_UINT:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<R8_UINTTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::X1R5G5B5_UNORM:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<X1R5G5B5_UNORMTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::X8B8G8R8_UNORM:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<X8B8G8R8_UNORMTraits>>())
+            .first->second.get();
+        break;
+    case RenderTargetFormat::X8B8G8R8_SRGB:
+        return impl->converters_cache
+            .emplace(format, std::make_unique<ConverterImpl<X8B8G8R8_SRGBTraits>>())
+            .first->second.get();
+        break;
+    default: { // unsupported format: report it, then cache and return a NullConverter
+        UNIMPLEMENTED_MSG("This format {} converter is not implemented", format);
+        return impl->converters_cache.emplace(format, std::make_unique<NullConverter>())
+            .first->second.get();
+    }
+    }
+}
+
+} // namespace Tegra::Engines::Blitter
diff --git a/src/video_core/engines/sw_blitter/converter.h b/src/video_core/engines/sw_blitter/converter.h
new file mode 100644
index 000000000..f9bdc516e
--- /dev/null
+++ b/src/video_core/engines/sw_blitter/converter.h
@@ -0,0 +1,36 @@
+// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#pragma once
+
+#include <memory>
+#include <span>
+
+#include "common/common_types.h"
+
+#include "video_core/gpu.h"
+
+namespace Tegra::Engines::Blitter {
+
+class Converter { // Converts pixels between a packed format and f32 components
+public:
+    virtual void ConvertTo(std::span<const u8> input, std::span<f32> output) = 0; // bytes -> f32
+    virtual void ConvertFrom(std::span<const f32> input, std::span<u8> output) = 0; // f32 -> bytes
+    virtual ~Converter() = default;
+};
+
+class ConverterFactory { // Hands out converters; owns them through the hidden impl
+public:
+    ConverterFactory();
+    ~ConverterFactory();
+
+    Converter* GetFormatConverter(RenderTargetFormat format); // non-owning; built on first use
+
+private:
+    Converter* BuildConverter(RenderTargetFormat format); // creates and caches a converter
+
+    struct ConverterFactoryImpl;
+    std::unique_ptr<ConverterFactoryImpl> impl;
+};
+
+} // namespace Tegra::Engines::Blitter
diff --git a/src/video_core/engines/sw_blitter/generate_converters.py b/src/video_core/engines/sw_blitter/generate_converters.py
new file mode 100644
index 000000000..f641564f7
--- /dev/null
+++ b/src/video_core/engines/sw_blitter/generate_converters.py
@@ -0,0 +1,136 @@
+# SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+import re
+
+class Format:
+    """One render-target format parsed from a name shaped like ``A2B10G10R10_UNORM``.
+
+    Attributes:
+        name: the format name as given.
+        component_type: the second underscore-separated field, e.g. "UNORM".
+        num_components: how many letter + size pairs the first field contains.
+        sizes: bit size of each component, in name order.
+        swizzle: letter of each component, in name order.
+    """
+
+    def __init__(self, string_value):
+        self.name = string_value
+        parts = string_value.split('_')
+        self.component_type = parts[1]
+        # Every match is one character followed by its size in bits, e.g. "R32".
+        component_data = re.findall(r"\w\d+", parts[0])
+        self.num_components = len(component_data)
+        self.sizes = [int(item[1:]) for item in component_data]
+        self.swizzle = [item[0] for item in component_data]
+
+    @staticmethod
+    def _braced(items):
+        """Joins the items into a C++ brace initializer, e.g. ``{ a, b }``."""
+        return "{ " + ", ".join(items) + " }"
+
+    def build_component_type_array(self):
+        """Initializer repeating the component type once per component."""
+        return self._braced(["ComponentType::" + self.component_type] * self.num_components)
+
+    def build_component_sizes_array(self):
+        """Initializer listing the bit size of each component, in name order."""
+        return self._braced(str(self.sizes[i]) for i in range(self.num_components))
+
+    def build_component_swizzle_array(self):
+        """Initializer listing each component's swizzle; "X" is emitted as None."""
+        letters = (self.swizzle[i] for i in range(self.num_components))
+        return self._braced("Swizzle::" + ("None" if c == "X" else c) for c in letters)
+
+    def print_declaration(self):
+        """Prints the C++ ``<name>Traits`` struct describing this format."""
+        # "};\n" plus print's own newline leaves a blank line after each struct.
+        lines = (
+            "struct " + self.name + "Traits {",
+            "  static constexpr size_t num_components = " + str(self.num_components) + ";",
+            "  static constexpr std::array<ComponentType, num_components> component_types = " + self.build_component_type_array() + ";",
+            "  static constexpr std::array<size_t, num_components> component_sizes = " + self.build_component_sizes_array() + ";",
+            "  static constexpr std::array<Swizzle, num_components> component_swizzle = " + self.build_component_swizzle_array() + ";",
+            "};\n",
+        )
+        for line in lines:
+            print(line)
+
+    def print_case(self):
+        """Prints the C++ ``case`` that creates and caches this format's converter."""
+        # The break after the return is unreachable in C++ but is part of the emitted text.
+        lines = (
+            "case RenderTargetFormat::" + self.name + ":",
+            "  return impl->converters_cache",
+            "    .emplace(format, std::make_unique<ConverterImpl<" + self.name + "Traits>>())",
+            "    .first->second.get();",
+            "  break;",
+        )
+        for line in lines:
+            print(line)
+
+txt = """
+R32G32B32A32_FLOAT
+R32G32B32A32_SINT
+R32G32B32A32_UINT
+R32G32B32X32_FLOAT
+R32G32B32X32_SINT
+R32G32B32X32_UINT
+R16G16B16A16_UNORM
+R16G16B16A16_SNORM
+R16G16B16A16_SINT
+R16G16B16A16_UINT
+R16G16B16A16_FLOAT
+R32G32_FLOAT
+R32G32_SINT
+R32G32_UINT
+R16G16B16X16_FLOAT
+A8R8G8B8_UNORM
+A8R8G8B8_SRGB
+A2B10G10R10_UNORM
+A2B10G10R10_UINT
+A2R10G10B10_UNORM
+A8B8G8R8_UNORM
+A8B8G8R8_SRGB
+A8B8G8R8_SNORM
+A8B8G8R8_SINT
+A8B8G8R8_UINT
+R16G16_UNORM
+R16G16_SNORM
+R16G16_SINT
+R16G16_UINT
+R16G16_FLOAT
+B10G11R11_FLOAT
+R32_SINT
+R32_UINT
+R32_FLOAT
+X8R8G8B8_UNORM
+X8R8G8B8_SRGB
+R5G6B5_UNORM
+A1R5G5B5_UNORM
+R8G8_UNORM
+R8G8_SNORM
+R8G8_SINT
+R8G8_UINT
+R16_UNORM
+R16_SNORM
+R16_SINT
+R16_UINT
+R16_FLOAT
+R8_UNORM
+R8_SNORM
+R8_SINT
+R8_UINT
+X1R5G5B5_UNORM
+X8B8G8R8_UNORM
+X8B8G8R8_SRGB
+"""
+
+# Emit every Traits struct first, then the switch cases, in the order the names appear in txt.
+# `fmt` is used as the loop name so the builtin format() is not shadowed.
+formats = [Format(name) for name in txt.split()]
+for fmt in formats:
+    fmt.print_declaration()
+
+for fmt in formats:
+    fmt.print_case()
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index d0709dc69..8a871593a 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -27,12 +27,12 @@ struct CommandList;
 // TODO: Implement the commented ones
 enum class RenderTargetFormat : u32 {
     NONE = 0x0,
-    R32B32G32A32_FLOAT = 0xC0,
+    R32G32B32A32_FLOAT = 0xC0,
     R32G32B32A32_SINT = 0xC1,
     R32G32B32A32_UINT = 0xC2,
-    // R32G32B32X32_FLOAT = 0xC3,
-    // R32G32B32X32_SINT = 0xC4,
-    // R32G32B32X32_UINT = 0xC5,
+    R32G32B32X32_FLOAT = 0xC3,
+    R32G32B32X32_SINT = 0xC4,
+    R32G32B32X32_UINT = 0xC5,
     R16G16B16A16_UNORM = 0xC6,
     R16G16B16A16_SNORM = 0xC7,
     R16G16B16A16_SINT = 0xC8,
@@ -56,13 +56,13 @@ enum class RenderTargetFormat : u32 {
     R16G16_SINT = 0xDC,
     R16G16_UINT = 0xDD,
     R16G16_FLOAT = 0xDE,
-    // A2R10G10B10_UNORM = 0xDF,
+    A2R10G10B10_UNORM = 0xDF,
     B10G11R11_FLOAT = 0xE0,
     R32_SINT = 0xE3,
     R32_UINT = 0xE4,
     R32_FLOAT = 0xE5,
-    // X8R8G8B8_UNORM = 0xE6,
-    // X8R8G8B8_SRGB = 0xE7,
+    X8R8G8B8_UNORM = 0xE6,
+    X8R8G8B8_SRGB = 0xE7,
     R5G6B5_UNORM = 0xE8,
     A1R5G5B5_UNORM = 0xE9,
     R8G8_UNORM = 0xEA,
@@ -79,11 +79,11 @@ enum class RenderTargetFormat : u32 {
     R8_SINT = 0xF5,
     R8_UINT = 0xF6,
 
-    /*
-    A8_UNORM = 0xF7,
+    // A8_UNORM = 0xF7,
     X1R5G5B5_UNORM = 0xF8,
     X8B8G8R8_UNORM = 0xF9,
     X8B8G8R8_SRGB = 0xFA,
+    /*
     Z1R5G5B5_UNORM = 0xFB,
     O1R5G5B5_UNORM = 0xFC,
     Z8R8G8B8_UNORM = 0xFD,
diff --git a/src/video_core/host1x/syncpoint_manager.cpp b/src/video_core/host1x/syncpoint_manager.cpp
index a44fc83d3..8f23ce527 100644
--- a/src/video_core/host1x/syncpoint_manager.cpp
+++ b/src/video_core/host1x/syncpoint_manager.cpp
@@ -34,7 +34,7 @@ SyncpointManager::ActionHandle SyncpointManager::RegisterAction(
 }
 
 void SyncpointManager::DeregisterAction(std::list<RegisteredAction>& action_storage,
-                                        ActionHandle& handle) {
+                                        const ActionHandle& handle) {
     std::unique_lock lk(guard);
 
     // We want to ensure the iterator still exists prior to erasing it
@@ -49,11 +49,11 @@ void SyncpointManager::DeregisterAction(std::list<RegisteredAction>& action_stor
     }
 }
 
-void SyncpointManager::DeregisterGuestAction(u32 syncpoint_id, ActionHandle& handle) {
+void SyncpointManager::DeregisterGuestAction(u32 syncpoint_id, const ActionHandle& handle) {
     DeregisterAction(guest_action_storage[syncpoint_id], handle);
 }
 
-void SyncpointManager::DeregisterHostAction(u32 syncpoint_id, ActionHandle& handle) {
+void SyncpointManager::DeregisterHostAction(u32 syncpoint_id, const ActionHandle& handle) {
     DeregisterAction(host_action_storage[syncpoint_id], handle);
 }
 
diff --git a/src/video_core/host1x/syncpoint_manager.h b/src/video_core/host1x/syncpoint_manager.h
index 50a264e23..847ed20c8 100644
--- a/src/video_core/host1x/syncpoint_manager.h
+++ b/src/video_core/host1x/syncpoint_manager.h
@@ -36,21 +36,19 @@ public:
 
     template <typename Func>
     ActionHandle RegisterGuestAction(u32 syncpoint_id, u32 expected_value, Func&& action) {
-        std::function<void()> func(action);
         return RegisterAction(syncpoints_guest[syncpoint_id], guest_action_storage[syncpoint_id],
-                              expected_value, std::move(func));
+                              expected_value, std::function<void()>(std::forward<Func>(action)));
     }
 
     template <typename Func>
     ActionHandle RegisterHostAction(u32 syncpoint_id, u32 expected_value, Func&& action) {
-        std::function<void()> func(action);
         return RegisterAction(syncpoints_host[syncpoint_id], host_action_storage[syncpoint_id],
-                              expected_value, std::move(func));
+                              expected_value, std::function<void()>(std::forward<Func>(action)));
     }
 
-    void DeregisterGuestAction(u32 syncpoint_id, ActionHandle& handle);
+    void DeregisterGuestAction(u32 syncpoint_id, const ActionHandle& handle);
 
-    void DeregisterHostAction(u32 syncpoint_id, ActionHandle& handle);
+    void DeregisterHostAction(u32 syncpoint_id, const ActionHandle& handle);
 
     void IncrementGuest(u32 syncpoint_id);
 
@@ -76,7 +74,7 @@ private:
                                 std::list<RegisteredAction>& action_storage, u32 expected_value,
                                 std::function<void()>&& action);
 
-    void DeregisterAction(std::list<RegisteredAction>& action_storage, ActionHandle& handle);
+    void DeregisterAction(std::list<RegisteredAction>& action_storage, const ActionHandle& handle);
 
     void Wait(std::atomic<u32>& syncpoint, std::condition_variable& wait_cv, u32 expected_value);
 
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 354c6e429..f71a316b6 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -466,8 +466,7 @@ bool RasterizerOpenGL::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surf
                                              const Tegra::Engines::Fermi2D::Config& copy_config) {
     MICROPROFILE_SCOPE(OpenGL_Blits);
     std::scoped_lock lock{texture_cache.mutex};
-    texture_cache.BlitImage(dst, src, copy_config);
-    return true;
+    return texture_cache.BlitImage(dst, src, copy_config);
 }
 
 Tegra::Engines::AccelerateDMAInterface& RasterizerOpenGL::AccessAccelerateDMA() {
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index 3fe04a115..a38060100 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -39,6 +39,7 @@ using Shader::Backend::GLASM::EmitGLASM;
 using Shader::Backend::GLSL::EmitGLSL;
 using Shader::Backend::SPIRV::EmitSPIRV;
 using Shader::Maxwell::ConvertLegacyToGeneric;
+using Shader::Maxwell::GenerateGeometryPassthrough;
 using Shader::Maxwell::MergeDualVertexPrograms;
 using Shader::Maxwell::TranslateProgram;
 using VideoCommon::ComputeEnvironment;
@@ -56,6 +57,17 @@ auto MakeSpan(Container& container) {
     return std::span(container.data(), container.size());
 }
 
+Shader::OutputTopology MaxwellToOutputTopology(Maxwell::PrimitiveTopology topology) {
+    switch (topology) { // Picks the output topology for the generated geometry passthrough
+    case Maxwell::PrimitiveTopology::Points:
+        return Shader::OutputTopology::PointList;
+    case Maxwell::PrimitiveTopology::LineStrip:
+        return Shader::OutputTopology::LineStrip;
+    default: // Every topology other than Points and LineStrip maps to TriangleStrip
+        return Shader::OutputTopology::TriangleStrip;
+    }
+}
+
 Shader::RuntimeInfo MakeRuntimeInfo(const GraphicsPipelineKey& key,
                                     const Shader::IR::Program& program,
                                     const Shader::IR::Program* previous_program,
@@ -220,6 +232,7 @@ ShaderCache::ShaderCache(RasterizerOpenGL& rasterizer_, Core::Frontend::EmuWindo
           .support_int64 = device.HasShaderInt64(),
           .needs_demote_reorder = device.IsAmd(),
           .support_snorm_render_buffer = false,
+          .support_viewport_index_layer = device.HasVertexViewportLayer(),
       } {
     if (use_asynchronous_shaders) {
         workers = CreateWorkers();
@@ -314,9 +327,7 @@ GraphicsPipeline* ShaderCache::CurrentGraphicsPipeline() {
     const auto& regs{maxwell3d->regs};
     graphics_key.raw = 0;
     graphics_key.early_z.Assign(regs.mandated_early_z != 0 ? 1 : 0);
-    graphics_key.gs_input_topology.Assign(graphics_key.unique_hashes[4] != 0
-                                              ? regs.draw.topology.Value()
-                                              : Maxwell::PrimitiveTopology{});
+    graphics_key.gs_input_topology.Assign(regs.draw.topology.Value());
     graphics_key.tessellation_primitive.Assign(regs.tessellation.params.domain_type.Value());
     graphics_key.tessellation_spacing.Assign(regs.tessellation.params.spacing.Value());
     graphics_key.tessellation_clockwise.Assign(
@@ -415,7 +426,19 @@ std::unique_ptr<GraphicsPipeline> ShaderCache::CreateGraphicsPipeline(
     std::array<Shader::IR::Program, Maxwell::MaxShaderProgram> programs;
     const bool uses_vertex_a{key.unique_hashes[0] != 0};
     const bool uses_vertex_b{key.unique_hashes[1] != 0};
+
+    // Layer passthrough generation for devices without GL_ARB_shader_viewport_layer_array
+    Shader::IR::Program* layer_source_program{};
+
     for (size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) {
+        const bool is_emulated_stage = layer_source_program != nullptr &&
+                                       index == static_cast<u32>(Maxwell::ShaderType::Geometry);
+        if (key.unique_hashes[index] == 0 && is_emulated_stage) {
+            auto topology = MaxwellToOutputTopology(key.gs_input_topology);
+            programs[index] = GenerateGeometryPassthrough(pools.inst, pools.block, host_info,
+                                                          *layer_source_program, topology);
+            continue;
+        }
         if (key.unique_hashes[index] == 0) {
             continue;
         }
@@ -443,6 +466,10 @@ std::unique_ptr<GraphicsPipeline> ShaderCache::CreateGraphicsPipeline(
                 Shader::NumDescriptors(program_vb.info.storage_buffers_descriptors);
             programs[index] = MergeDualVertexPrograms(program_va, program_vb, env);
         }
+
+        if (programs[index].info.requires_layer_emulation) {
+            layer_source_program = &programs[index];
+        }
     }
     const u32 glasm_storage_buffer_limit{device.GetMaxGLASMStorageBufferBlocks()};
     const bool glasm_use_storage_buffers{total_storage_buffers <= glasm_storage_buffer_limit};
@@ -456,7 +483,9 @@ std::unique_ptr<GraphicsPipeline> ShaderCache::CreateGraphicsPipeline(
     const bool use_glasm{device.UseAssemblyShaders()};
     const size_t first_index = uses_vertex_a && uses_vertex_b ? 1 : 0;
     for (size_t index = first_index; index < Maxwell::MaxShaderProgram; ++index) {
-        if (key.unique_hashes[index] == 0) {
+        const bool is_emulated_stage = layer_source_program != nullptr &&
+                                       index == static_cast<u32>(Maxwell::ShaderType::Geometry);
+        if (key.unique_hashes[index] == 0 && !is_emulated_stage) {
             continue;
         }
         UNIMPLEMENTED_IF(index == 0);
diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h
index e14f9b2db..ef1190e1f 100644
--- a/src/video_core/renderer_opengl/maxwell_to_gl.h
+++ b/src/video_core/renderer_opengl/maxwell_to_gl.h
@@ -28,6 +28,7 @@ constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> FORMAT_TAB
     {GL_RGB5_A1, GL_BGRA, GL_UNSIGNED_SHORT_1_5_5_5_REV},             // A1R5G5B5_UNORM
     {GL_RGB10_A2, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV},           // A2B10G10R10_UNORM
     {GL_RGB10_A2UI, GL_RGBA_INTEGER, GL_UNSIGNED_INT_2_10_10_10_REV}, // A2B10G10R10_UINT
+    {GL_RGB10_A2, GL_BGRA, GL_UNSIGNED_INT_2_10_10_10_REV},           // A2R10G10B10_UNORM
     {GL_RGB5_A1, GL_RGBA, GL_UNSIGNED_SHORT_1_5_5_5_REV},             // A1B5G5R5_UNORM
     {GL_RGB5_A1, GL_RGBA, GL_UNSIGNED_SHORT_5_5_5_1},                 // A5B5G5R1_UNORM
     {GL_R8, GL_RED, GL_UNSIGNED_BYTE},                                // R8_UNORM
diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
index 5c156087b..430a84272 100644
--- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
+++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
@@ -125,6 +125,7 @@ struct FormatTuple {
     {VK_FORMAT_A1R5G5B5_UNORM_PACK16, Attachable},              // A1R5G5B5_UNORM
     {VK_FORMAT_A2B10G10R10_UNORM_PACK32, Attachable | Storage}, // A2B10G10R10_UNORM
     {VK_FORMAT_A2B10G10R10_UINT_PACK32, Attachable | Storage},  // A2B10G10R10_UINT
+    {VK_FORMAT_A2R10G10B10_UNORM_PACK32, Attachable | Storage}, // A2R10G10B10_UNORM
     {VK_FORMAT_A1R5G5B5_UNORM_PACK16, Attachable},         // A1B5G5R5_UNORM (flipped with swizzle)
     {VK_FORMAT_R5G5B5A1_UNORM_PACK16},                     // A5B5G5R1_UNORM (specially swizzled)
     {VK_FORMAT_R8_UNORM, Attachable | Storage},            // R8_UNORM
@@ -149,7 +150,7 @@ struct FormatTuple {
     {VK_FORMAT_BC6H_UFLOAT_BLOCK},                             // BC6H_UFLOAT
     {VK_FORMAT_BC6H_SFLOAT_BLOCK},                             // BC6H_SFLOAT
     {VK_FORMAT_ASTC_4x4_UNORM_BLOCK},                          // ASTC_2D_4X4_UNORM
-    {VK_FORMAT_B8G8R8A8_UNORM, Attachable},                    // B8G8R8A8_UNORM
+    {VK_FORMAT_B8G8R8A8_UNORM, Attachable | Storage},          // B8G8R8A8_UNORM
     {VK_FORMAT_R32G32B32A32_SFLOAT, Attachable | Storage},     // R32G32B32A32_FLOAT
     {VK_FORMAT_R32G32B32A32_SINT, Attachable | Storage},       // R32G32B32A32_SINT
     {VK_FORMAT_R32G32_SFLOAT, Attachable | Storage},           // R32G32_FLOAT
@@ -159,7 +160,7 @@ struct FormatTuple {
     {VK_FORMAT_R16_UNORM, Attachable | Storage},               // R16_UNORM
     {VK_FORMAT_R16_SNORM, Attachable | Storage},               // R16_SNORM
     {VK_FORMAT_R16_UINT, Attachable | Storage},                // R16_UINT
-    {VK_FORMAT_UNDEFINED},                                     // R16_SINT
+    {VK_FORMAT_R16_SINT, Attachable | Storage},                // R16_SINT
     {VK_FORMAT_R16G16_UNORM, Attachable | Storage},            // R16G16_UNORM
     {VK_FORMAT_R16G16_SFLOAT, Attachable | Storage},           // R16G16_FLOAT
     {VK_FORMAT_R16G16_UINT, Attachable | Storage},             // R16G16_UINT
@@ -183,7 +184,7 @@ struct FormatTuple {
     {VK_FORMAT_BC2_SRGB_BLOCK},                                // BC2_SRGB
     {VK_FORMAT_BC3_SRGB_BLOCK},                                // BC3_SRGB
     {VK_FORMAT_BC7_SRGB_BLOCK},                                // BC7_SRGB
-    {VK_FORMAT_R4G4B4A4_UNORM_PACK16, Attachable},             // A4B4G4R4_UNORM
+    {VK_FORMAT_R4G4B4A4_UNORM_PACK16},                         // A4B4G4R4_UNORM
     {VK_FORMAT_R4G4_UNORM_PACK8},                              // G4R4_UNORM
     {VK_FORMAT_ASTC_4x4_SRGB_BLOCK},                           // ASTC_2D_4X4_SRGB
     {VK_FORMAT_ASTC_8x8_SRGB_BLOCK},                           // ASTC_2D_8X8_SRGB
diff --git a/src/video_core/renderer_vulkan/vk_fsr.cpp b/src/video_core/renderer_vulkan/vk_fsr.cpp
index dd450169e..33daa8c1c 100644
--- a/src/video_core/renderer_vulkan/vk_fsr.cpp
+++ b/src/video_core/renderer_vulkan/vk_fsr.cpp
@@ -5,6 +5,7 @@
 #include "common/bit_cast.h"
 #include "common/common_types.h"
 #include "common/div_ceil.h"
+#include "common/settings.h"
 
 #include "video_core/host_shaders/vulkan_fidelityfx_fsr_easu_fp16_comp_spv.h"
 #include "video_core/host_shaders/vulkan_fidelityfx_fsr_easu_fp32_comp_spv.h"
@@ -227,7 +228,10 @@ VkImageView FSR::Draw(Scheduler& scheduler, size_t image_index, VkImageView imag
 
         cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *rcas_pipeline);
 
-        FsrRcasCon(push_constants.data(), 0.25f);
+        const float sharpening =
+            static_cast<float>(Settings::values.fsr_sharpening_slider.GetValue()) / 100.0f;
+
+        FsrRcasCon(push_constants.data(), sharpening);
         cmdbuf.PushConstants(*pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, push_constants);
 
         {
diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
index 5995aeff0..29da442fa 100644
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
@@ -46,6 +46,7 @@ MICROPROFILE_DECLARE(Vulkan_PipelineCache);
 namespace {
 using Shader::Backend::SPIRV::EmitSPIRV;
 using Shader::Maxwell::ConvertLegacyToGeneric;
+using Shader::Maxwell::GenerateGeometryPassthrough;
 using Shader::Maxwell::MergeDualVertexPrograms;
 using Shader::Maxwell::TranslateProgram;
 using VideoCommon::ComputeEnvironment;
@@ -60,6 +61,17 @@ auto MakeSpan(Container& container) {
     return std::span(container.data(), container.size());
 }
 
+Shader::OutputTopology MaxwellToOutputTopology(Maxwell::PrimitiveTopology topology) {
+    switch (topology) { // Output topology handed to the generated geometry passthrough
+    case Maxwell::PrimitiveTopology::Points:
+        return Shader::OutputTopology::PointList;
+    case Maxwell::PrimitiveTopology::LineStrip:
+        return Shader::OutputTopology::LineStrip;
+    default: // Anything but Points and LineStrip becomes TriangleStrip
+        return Shader::OutputTopology::TriangleStrip;
+    }
+}
+
 Shader::CompareFunction MaxwellToCompareFunction(Maxwell::ComparisonOp comparison) {
     switch (comparison) {
     case Maxwell::ComparisonOp::Never_D3D:
@@ -327,6 +339,7 @@ PipelineCache::PipelineCache(RasterizerVulkan& rasterizer_, const Device& device
         .needs_demote_reorder = driver_id == VK_DRIVER_ID_AMD_PROPRIETARY_KHR ||
                                 driver_id == VK_DRIVER_ID_AMD_OPEN_SOURCE_KHR,
         .support_snorm_render_buffer = true,
+        .support_viewport_index_layer = device.IsExtShaderViewportIndexLayerSupported(),
     };
 }
 
@@ -509,7 +522,19 @@ std::unique_ptr<GraphicsPipeline> PipelineCache::CreateGraphicsPipeline(
     std::array<Shader::IR::Program, Maxwell::MaxShaderProgram> programs;
     const bool uses_vertex_a{key.unique_hashes[0] != 0};
     const bool uses_vertex_b{key.unique_hashes[1] != 0};
+
+    // Layer passthrough generation for devices without VK_EXT_shader_viewport_index_layer
+    Shader::IR::Program* layer_source_program{};
+
     for (size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) {
+        const bool is_emulated_stage = layer_source_program != nullptr &&
+                                       index == static_cast<u32>(Maxwell::ShaderType::Geometry);
+        if (key.unique_hashes[index] == 0 && is_emulated_stage) {
+            auto topology = MaxwellToOutputTopology(key.state.topology);
+            programs[index] = GenerateGeometryPassthrough(pools.inst, pools.block, host_info,
+                                                          *layer_source_program, topology);
+            continue;
+        }
         if (key.unique_hashes[index] == 0) {
             continue;
         }
@@ -530,6 +555,10 @@ std::unique_ptr<GraphicsPipeline> PipelineCache::CreateGraphicsPipeline(
             auto program_vb{TranslateProgram(pools.inst, pools.block, env, cfg, host_info)};
             programs[index] = MergeDualVertexPrograms(program_va, program_vb, env);
         }
+
+        if (programs[index].info.requires_layer_emulation) {
+            layer_source_program = &programs[index];
+        }
     }
     std::array<const Shader::Info*, Maxwell::MaxShaderStage> infos{};
     std::array<vk::ShaderModule, Maxwell::MaxShaderStage> modules;
@@ -538,7 +567,9 @@ std::unique_ptr<GraphicsPipeline> PipelineCache::CreateGraphicsPipeline(
     Shader::Backend::Bindings binding;
     for (size_t index = uses_vertex_a && uses_vertex_b ? 1 : 0; index < Maxwell::MaxShaderProgram;
          ++index) {
-        if (key.unique_hashes[index] == 0) {
+        const bool is_emulated_stage = layer_source_program != nullptr &&
+                                       index == static_cast<u32>(Maxwell::ShaderType::Geometry);
+        if (key.unique_hashes[index] == 0 && !is_emulated_stage) {
             continue;
         }
         UNIMPLEMENTED_IF(index == 0);
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index 12b13cc59..d8ad8815c 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -542,8 +542,7 @@ bool RasterizerVulkan::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surf
                                              const Tegra::Engines::Fermi2D::Surface& dst,
                                              const Tegra::Engines::Fermi2D::Config& copy_config) {
     std::scoped_lock lock{texture_cache.mutex};
-    texture_cache.BlitImage(dst, src, copy_config);
-    return true;
+    return texture_cache.BlitImage(dst, src, copy_config);
 }
 
 Tegra::Engines::AccelerateDMAInterface& RasterizerVulkan::AccessAccelerateDMA() {
diff --git a/src/video_core/surface.cpp b/src/video_core/surface.cpp
index 6bd133d10..1a76d4178 100644
--- a/src/video_core/surface.cpp
+++ b/src/video_core/surface.cpp
@@ -93,11 +93,14 @@ PixelFormat PixelFormatFromDepthFormat(Tegra::DepthFormat format) {
 
 PixelFormat PixelFormatFromRenderTargetFormat(Tegra::RenderTargetFormat format) {
     switch (format) {
-    case Tegra::RenderTargetFormat::R32B32G32A32_FLOAT:
+    case Tegra::RenderTargetFormat::R32G32B32A32_FLOAT:
+    case Tegra::RenderTargetFormat::R32G32B32X32_FLOAT:
         return PixelFormat::R32G32B32A32_FLOAT;
     case Tegra::RenderTargetFormat::R32G32B32A32_SINT:
+    case Tegra::RenderTargetFormat::R32G32B32X32_SINT:
         return PixelFormat::R32G32B32A32_SINT;
     case Tegra::RenderTargetFormat::R32G32B32A32_UINT:
+    case Tegra::RenderTargetFormat::R32G32B32X32_UINT:
         return PixelFormat::R32G32B32A32_UINT;
     case Tegra::RenderTargetFormat::R16G16B16A16_UNORM:
         return PixelFormat::R16G16B16A16_UNORM;
@@ -118,16 +121,22 @@ PixelFormat PixelFormatFromRenderTargetFormat(Tegra::RenderTargetFormat format)
     case Tegra::RenderTargetFormat::R16G16B16X16_FLOAT:
         return PixelFormat::R16G16B16X16_FLOAT;
     case Tegra::RenderTargetFormat::A8R8G8B8_UNORM:
+    case Tegra::RenderTargetFormat::X8R8G8B8_UNORM:
         return PixelFormat::B8G8R8A8_UNORM;
     case Tegra::RenderTargetFormat::A8R8G8B8_SRGB:
+    case Tegra::RenderTargetFormat::X8R8G8B8_SRGB:
         return PixelFormat::B8G8R8A8_SRGB;
     case Tegra::RenderTargetFormat::A2B10G10R10_UNORM:
         return PixelFormat::A2B10G10R10_UNORM;
     case Tegra::RenderTargetFormat::A2B10G10R10_UINT:
         return PixelFormat::A2B10G10R10_UINT;
+    case Tegra::RenderTargetFormat::A2R10G10B10_UNORM:
+        return PixelFormat::A2R10G10B10_UNORM;
     case Tegra::RenderTargetFormat::A8B8G8R8_UNORM:
+    case Tegra::RenderTargetFormat::X8B8G8R8_UNORM:
         return PixelFormat::A8B8G8R8_UNORM;
     case Tegra::RenderTargetFormat::A8B8G8R8_SRGB:
+    case Tegra::RenderTargetFormat::X8B8G8R8_SRGB:
         return PixelFormat::A8B8G8R8_SRGB;
     case Tegra::RenderTargetFormat::A8B8G8R8_SNORM:
         return PixelFormat::A8B8G8R8_SNORM;
@@ -156,6 +165,7 @@ PixelFormat PixelFormatFromRenderTargetFormat(Tegra::RenderTargetFormat format)
     case Tegra::RenderTargetFormat::R5G6B5_UNORM:
         return PixelFormat::R5G6B5_UNORM;
     case Tegra::RenderTargetFormat::A1R5G5B5_UNORM:
+    case Tegra::RenderTargetFormat::X1R5G5B5_UNORM:
         return PixelFormat::A1R5G5B5_UNORM;
     case Tegra::RenderTargetFormat::R8G8_UNORM:
         return PixelFormat::R8G8_UNORM;
@@ -204,23 +214,16 @@ PixelFormat PixelFormatFromGPUPixelFormat(Service::android::PixelFormat format)
 }
 
 SurfaceType GetFormatType(PixelFormat pixel_format) {
-    if (static_cast<std::size_t>(pixel_format) <
-        static_cast<std::size_t>(PixelFormat::MaxColorFormat)) {
+    if (pixel_format < PixelFormat::MaxColorFormat) {
         return SurfaceType::ColorTexture;
     }
-
-    if (static_cast<std::size_t>(pixel_format) <
-        static_cast<std::size_t>(PixelFormat::MaxDepthFormat)) {
+    if (pixel_format < PixelFormat::MaxDepthFormat) {
         return SurfaceType::Depth;
     }
-
-    if (static_cast<std::size_t>(pixel_format) <
-        static_cast<std::size_t>(PixelFormat::MaxStencilFormat)) {
+    if (pixel_format < PixelFormat::MaxStencilFormat) {
         return SurfaceType::Stencil;
     }
-
-    if (static_cast<std::size_t>(pixel_format) <
-        static_cast<std::size_t>(PixelFormat::MaxDepthStencilFormat)) {
+    if (pixel_format < PixelFormat::MaxDepthStencilFormat) {
         return SurfaceType::DepthStencil;
     }
 
diff --git a/src/video_core/surface.h b/src/video_core/surface.h
index 57ca7f597..44b79af20 100644
--- a/src/video_core/surface.h
+++ b/src/video_core/surface.h
@@ -23,6 +23,7 @@ enum class PixelFormat {
     A1R5G5B5_UNORM,
     A2B10G10R10_UNORM,
     A2B10G10R10_UINT,
+    A2R10G10B10_UNORM,
     A1B5G5R5_UNORM,
     A5B5G5R1_UNORM,
     R8_UNORM,
@@ -159,6 +160,7 @@ constexpr std::array<u8, MaxPixelFormat> BLOCK_WIDTH_TABLE = {{
     1,  // A1R5G5B5_UNORM
     1,  // A2B10G10R10_UNORM
     1,  // A2B10G10R10_UINT
+    1,  // A2R10G10B10_UNORM
     1,  // A1B5G5R5_UNORM
     1,  // A5B5G5R1_UNORM
     1,  // R8_UNORM
@@ -264,6 +266,7 @@ constexpr std::array<u8, MaxPixelFormat> BLOCK_HEIGHT_TABLE = {{
     1,  // A1R5G5B5_UNORM
     1,  // A2B10G10R10_UNORM
     1,  // A2B10G10R10_UINT
+    1,  // A2R10G10B10_UNORM
     1,  // A1B5G5R5_UNORM
     1,  // A5B5G5R1_UNORM
     1,  // R8_UNORM
@@ -369,6 +372,7 @@ constexpr std::array<u8, MaxPixelFormat> BITS_PER_BLOCK_TABLE = {{
     16,  // A1R5G5B5_UNORM
     32,  // A2B10G10R10_UNORM
     32,  // A2B10G10R10_UINT
+    32,  // A2R10G10B10_UNORM
     16,  // A1B5G5R5_UNORM
     16,  // A5B5G5R1_UNORM
     8,   // R8_UNORM
diff --git a/src/video_core/texture_cache/formatter.h b/src/video_core/texture_cache/formatter.h
index acc854715..f1f0a057b 100644
--- a/src/video_core/texture_cache/formatter.h
+++ b/src/video_core/texture_cache/formatter.h
@@ -35,6 +35,8 @@ struct fmt::formatter<VideoCore::Surface::PixelFormat> : fmt::formatter<fmt::str
                 return "A2B10G10R10_UNORM";
             case PixelFormat::A2B10G10R10_UINT:
                 return "A2B10G10R10_UINT";
+            case PixelFormat::A2R10G10B10_UNORM:
+                return "A2R10G10B10_UNORM";
             case PixelFormat::A1B5G5R5_UNORM:
                 return "A1B5G5R5_UNORM";
             case PixelFormat::A5B5G5R1_UNORM:
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index 8ef75fe73..8e68a2e53 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -506,10 +506,14 @@ void TextureCache<P>::UnmapGPUMemory(size_t as_id, GPUVAddr gpu_addr, size_t siz
 }
 
 template <class P>
-void TextureCache<P>::BlitImage(const Tegra::Engines::Fermi2D::Surface& dst,
+bool TextureCache<P>::BlitImage(const Tegra::Engines::Fermi2D::Surface& dst,
                                 const Tegra::Engines::Fermi2D::Surface& src,
                                 const Tegra::Engines::Fermi2D::Config& copy) {
-    const BlitImages images = GetBlitImages(dst, src, copy);
+    const auto result = GetBlitImages(dst, src, copy);
+    if (!result) {
+        return false;
+    }
+    const BlitImages images = *result;
     const ImageId dst_id = images.dst_id;
     const ImageId src_id = images.src_id;
 
@@ -596,6 +600,7 @@ void TextureCache<P>::BlitImage(const Tegra::Engines::Fermi2D::Surface& dst,
         runtime.BlitImage(dst_framebuffer, dst_view, src_view, dst_region, src_region, copy.filter,
                           copy.operation);
     }
+    return true;
 }
 
 template <class P>
@@ -1133,7 +1138,7 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA
 }
 
 template <class P>
-typename TextureCache<P>::BlitImages TextureCache<P>::GetBlitImages(
+std::optional<typename TextureCache<P>::BlitImages> TextureCache<P>::GetBlitImages(
     const Tegra::Engines::Fermi2D::Surface& dst, const Tegra::Engines::Fermi2D::Surface& src,
     const Tegra::Engines::Fermi2D::Config& copy) {
 
@@ -1154,6 +1159,20 @@ typename TextureCache<P>::BlitImages TextureCache<P>::GetBlitImages(
         has_deleted_images = false;
         src_id = FindImage(src_info, src_addr, try_options);
         dst_id = FindImage(dst_info, dst_addr, try_options);
+        if (!copy.must_accelerate) {
+            do { // Runs once: break leaves the guard, return std::nullopt rejects the blit
+                if (!src_id && !dst_id) {
+                    return std::nullopt; // FindImage found neither source nor destination
+                }
+                if (src_id && True(slot_images[src_id].flags & ImageFlagBits::GpuModified)) {
+                    break; // Source image has the GpuModified flag set
+                }
+                if (dst_id && True(slot_images[dst_id].flags & ImageFlagBits::GpuModified)) {
+                    break; // Destination image has the GpuModified flag set
+                }
+                return std::nullopt; // Neither located image is flagged GpuModified
+            } while (false);
+        }
         const ImageBase* const src_image = src_id ? &slot_images[src_id] : nullptr;
         if (src_image && src_image->info.num_samples > 1) {
             RelaxedOptions find_options{FIND_OPTIONS | RelaxedOptions::ForceBrokenViews};
@@ -1194,12 +1213,12 @@ typename TextureCache<P>::BlitImages TextureCache<P>::GetBlitImages(
             dst_id = FindOrInsertImage(dst_info, dst_addr, RelaxedOptions{});
         } while (has_deleted_images);
     }
-    return BlitImages{
+    return {BlitImages{
         .dst_id = dst_id,
         .src_id = src_id,
         .dst_format = dst_info.format,
         .src_format = src_info.format,
-    };
+    }};
 }
 
 template <class P>
diff --git a/src/video_core/texture_cache/texture_cache_base.h b/src/video_core/texture_cache/texture_cache_base.h
index 2fa8445eb..9db7195bf 100644
--- a/src/video_core/texture_cache/texture_cache_base.h
+++ b/src/video_core/texture_cache/texture_cache_base.h
@@ -174,7 +174,7 @@ public:
     void UnmapGPUMemory(size_t as_id, GPUVAddr gpu_addr, size_t size);
 
     /// Blit an image with the given parameters
-    void BlitImage(const Tegra::Engines::Fermi2D::Surface& dst,
+    bool BlitImage(const Tegra::Engines::Fermi2D::Surface& dst,
                    const Tegra::Engines::Fermi2D::Surface& src,
                    const Tegra::Engines::Fermi2D::Config& copy);
 
@@ -285,9 +285,9 @@ private:
     [[nodiscard]] ImageId JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VAddr cpu_addr);
 
     /// Return a blit image pair from the given guest blit parameters
-    [[nodiscard]] BlitImages GetBlitImages(const Tegra::Engines::Fermi2D::Surface& dst,
-                                           const Tegra::Engines::Fermi2D::Surface& src,
-                                           const Tegra::Engines::Fermi2D::Config& copy);
+    [[nodiscard]] std::optional<BlitImages> GetBlitImages(
+        const Tegra::Engines::Fermi2D::Surface& dst, const Tegra::Engines::Fermi2D::Surface& src,
+        const Tegra::Engines::Fermi2D::Config& copy);
 
     /// Find or create a sampler from a guest descriptor sampler
     [[nodiscard]] SamplerId FindSampler(const TSCEntry& config);